From ab2bafc169e0ebf6eea1532594b73b8a22e352b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20J=C3=BAnio=20de=20Lima=20Liberal?= Date: Mon, 11 May 2026 00:27:38 -0300 Subject: [PATCH 1/6] Fix Windows SIGALRM markdown indexing --- librarian/cli.py | 9 ++- librarian/processing/parsers/base.py | 85 +++++++++++++++++++++------- tests/test_cli.py | 41 ++++++++++++++ tests/test_file_read_errors.py | 28 +++++++++ 4 files changed, 142 insertions(+), 21 deletions(-) create mode 100644 tests/test_cli.py diff --git a/librarian/cli.py b/librarian/cli.py index 17bf92d..5f879b9 100644 --- a/librarian/cli.py +++ b/librarian/cli.py @@ -662,8 +662,13 @@ def add_source( rprint(f" [red]Error:[/red] {err.get('path', '')}: {err.get('error', '')}") skipped = result.get("skipped", 0) + status_line = ( + "[red]Source added, but indexing had errors.[/red]" + if errors + else "[green]Source added and indexed![/green]" + ) summary = ( - f"[green]Source added and indexed![/green]\n\n" + f"{status_line}\n\n" f"Name: [cyan]{source['name']}[/cyan]\n" f"Path: [blue]{source_path}[/blue]\n" f"Files found: [yellow]{len(files_to_index)}[/yellow]\n" @@ -675,6 +680,8 @@ def add_source( summary += f"\nErrors: [red]{len(errors)}[/red]" rprint(Panel(summary, title="Source Added")) + if errors: + raise typer.Exit(1) # ============================================================================= diff --git a/librarian/processing/parsers/base.py b/librarian/processing/parsers/base.py index 2a66675..45e1456 100644 --- a/librarian/processing/parsers/base.py +++ b/librarian/processing/parsers/base.py @@ -8,7 +8,9 @@ import logging import signal from abc import ABC, abstractmethod +from collections.abc import Callable from pathlib import Path +from typing import Any, cast from librarian.types import ParsedDocument @@ -31,6 +33,39 @@ def _timeout_handler(signum: int, frame: object) -> None: raise FileReadTimeoutError("File read timed out") +def _signal_timeout_available() -> bool: + return hasattr(signal, "SIGALRM") and hasattr(signal, "alarm") + + +def _get_signal_timeout_hooks() -> tuple[Any, Callable[[int], object]]: + return vars(signal)["SIGALRM"], cast("Callable[[int], object]", vars(signal)["alarm"]) + + +def _read_text_with_fallback( + file_path: Path, + encoding: str, + fallback_encoding: str | None, +) -> str: + try: + return file_path.read_text(encoding=encoding) + except UnicodeDecodeError: + if fallback_encoding: + return file_path.read_text(encoding=fallback_encoding) + raise + + +def _handle_read_error(file_path: Path, error: Exception) -> None: + if isinstance(error, FileReadTimeoutError): + raise error + if isinstance(error, FileNotFoundError): + raise error + if isinstance(error, PermissionError): + raise FileReadError(f"Permission denied: {file_path}") from error + if isinstance(error, OSError): + raise FileReadError(f"Cannot read {file_path}: {error}") from error + raise error + + def safe_read_text( file_path: Path, timeout: int = DEFAULT_READ_TIMEOUT, @@ -61,21 +96,23 @@ def safe_read_text( msg = f"File not found: {file_path}" raise FileNotFoundError(msg) - old_handler = signal.getsignal(signal.SIGALRM) + if not _signal_timeout_available(): + try: + return _read_text_with_fallback(file_path, encoding, fallback_encoding) + except Exception as e: + _handle_read_error(file_path, e) + raise + + sigalrm, alarm = _get_signal_timeout_hooks() + old_handler = signal.getsignal(sigalrm) content: str | None = None try: - signal.signal(signal.SIGALRM, _timeout_handler) - signal.alarm(timeout) + signal.signal(sigalrm, _timeout_handler) + alarm(timeout) - try: - content = file_path.read_text(encoding=encoding) - except UnicodeDecodeError: - if fallback_encoding: - content = file_path.read_text(encoding=fallback_encoding) - else: - raise - - signal.alarm(0) + content = _read_text_with_fallback(file_path, encoding, fallback_encoding) + + alarm(0) except FileReadTimeoutError as e: raise FileReadTimeoutError( f"Timed out reading {file_path} after {timeout}s " @@ -88,8 +125,8 @@ def safe_read_text( except OSError as e: raise FileReadError(f"Cannot read {file_path}: {e}") from e finally: - signal.alarm(0) - signal.signal(signal.SIGALRM, old_handler) + alarm(0) + signal.signal(sigalrm, old_handler) return content @@ -117,13 +154,21 @@ def safe_read_bytes( msg = f"File not found: {file_path}" raise FileNotFoundError(msg) - old_handler = signal.getsignal(signal.SIGALRM) + if not _signal_timeout_available(): + try: + return file_path.read_bytes() + except Exception as e: + _handle_read_error(file_path, e) + raise + + sigalrm, alarm = _get_signal_timeout_hooks() + old_handler = signal.getsignal(sigalrm) content: bytes | None = None try: - signal.signal(signal.SIGALRM, _timeout_handler) - signal.alarm(timeout) + signal.signal(sigalrm, _timeout_handler) + alarm(timeout) content = file_path.read_bytes() - signal.alarm(0) + alarm(0) except FileReadTimeoutError as e: raise FileReadTimeoutError( f"Timed out reading {file_path} after {timeout}s " @@ -136,8 +181,8 @@ def safe_read_bytes( except OSError as e: raise FileReadError(f"Cannot read {file_path}: {e}") from e finally: - signal.alarm(0) - signal.signal(signal.SIGALRM, old_handler) + alarm(0) + signal.signal(sigalrm, old_handler) return content diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..13f6355 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,41 @@ +"""Tests for CLI behavior.""" + +from pathlib import Path +from typing import Any + +from typer.testing import CliRunner + +from librarian import cli + + +def test_add_directory_exits_nonzero_when_indexing_errors(tmp_path: Path, monkeypatch) -> None: + """Directory add should fail automation when indexing reports errors.""" + docs_dir = tmp_path / "docs" + docs_dir.mkdir() + fixture = docs_dir / "fixture.md" + fixture.write_text("# Synthetic Markdown Fixture\n\nContent.", encoding="utf-8") + + config_dir = tmp_path / "config" + monkeypatch.setattr(cli, "CONFIG_DIR", config_dir) + monkeypatch.setattr(cli, "SOURCES_FILE", config_dir / "sources.json") + monkeypatch.setattr(cli, "SETTINGS_FILE", config_dir / "settings.json") + monkeypatch.setattr(cli, "_get_config", lambda: {"ensure_directories": lambda: None}) + + async def fake_server_ingest(context: Any, directory: str) -> dict[str, Any]: + return { + "directory": directory, + "total_files": 1, + "indexed": 0, + "updated": 0, + "skipped": 0, + "errors": [{"path": str(fixture), "error": "parser exploded"}], + "files": [], + } + + monkeypatch.setattr("librarian.server.index_directory_to_library", fake_server_ingest) + + result = CliRunner().invoke(cli.app, ["add", str(docs_dir), "--verbose"]) + + assert result.exit_code == 1 + assert "Errors:" in result.output + assert "parser exploded" in result.output diff --git a/tests/test_file_read_errors.py b/tests/test_file_read_errors.py index 7f36c35..a7ff613 100644 --- a/tests/test_file_read_errors.py +++ b/tests/test_file_read_errors.py @@ -29,6 +29,20 @@ def test_read_existing_file(self, tmp_path: Path) -> None: content = safe_read_text(test_file) assert content == "hello world" + def test_read_existing_file_without_sigalrm( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test reading text on platforms without POSIX SIGALRM.""" + import signal + + test_file = tmp_path / "test.md" + test_file.write_text("# Hello\nworld", encoding="utf-8") + + monkeypatch.delattr(signal, "SIGALRM", raising=False) + + content = safe_read_text(test_file) + assert content == "# Hello\nworld" + def test_file_not_found(self, tmp_path: Path) -> None: """Test FileNotFoundError for missing files.""" missing = tmp_path / "nonexistent.md" @@ -99,6 +113,20 @@ def test_read_existing_file(self, tmp_path: Path) -> None: content = safe_read_bytes(test_file) assert content == b"\x00\x01\x02" + def test_read_existing_file_without_sigalrm( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Test reading bytes on platforms without POSIX SIGALRM.""" + import signal + + test_file = tmp_path / "test.bin" + test_file.write_bytes(b"\x00\x01\x02") + + monkeypatch.delattr(signal, "SIGALRM", raising=False) + + content = safe_read_bytes(test_file) + assert content == b"\x00\x01\x02" + def test_file_not_found(self, tmp_path: Path) -> None: """Test FileNotFoundError for missing files.""" missing = tmp_path / "nonexistent.bin" From 5931e64247a4512571b2f1766097a8f2ab78f962 Mon Sep 17 00:00:00 2001 From: jottakka Date: Mon, 11 May 2026 01:06:27 -0300 Subject: [PATCH 2/6] Unify safe_read timeout paths behind a context manager Wrap the POSIX SIGALRM/alarm setup in a single `_read_timeout` context manager that no-ops on platforms (Windows) where those signals are unavailable. This removes the duplicated error-handling branches in `safe_read_text` / `safe_read_bytes` and drops the `_handle_read_error` helper, the unreachable trailing `raise` after it, and the `vars(signal)` + `cast` workarounds previously needed to keep mypy happy on Windows. Also make the new CLI regression test independent of terminal width by pinning COLUMNS so Rich does not wrap the error line off-screen, and annotate `monkeypatch` with `pytest.MonkeyPatch` for consistency with the rest of the suite. Co-Authored-By: Claude Opus 4.7 (1M context) --- librarian/processing/parsers/base.py | 100 ++++++++------------------- tests/test_cli.py | 7 +- 2 files changed, 36 insertions(+), 71 deletions(-) diff --git a/librarian/processing/parsers/base.py b/librarian/processing/parsers/base.py index 45e1456..4f92095 100644 --- a/librarian/processing/parsers/base.py +++ b/librarian/processing/parsers/base.py @@ -8,9 +8,9 @@ import logging import signal from abc import ABC, abstractmethod -from collections.abc import Callable +from collections.abc import Iterator +from contextlib import contextmanager from pathlib import Path -from typing import Any, cast from librarian.types import ParsedDocument @@ -33,37 +33,28 @@ def _timeout_handler(signum: int, frame: object) -> None: raise FileReadTimeoutError("File read timed out") -def _signal_timeout_available() -> bool: - return hasattr(signal, "SIGALRM") and hasattr(signal, "alarm") - - -def _get_signal_timeout_hooks() -> tuple[Any, Callable[[int], object]]: - return vars(signal)["SIGALRM"], cast("Callable[[int], object]", vars(signal)["alarm"]) +@contextmanager +def _read_timeout(timeout: int) -> Iterator[None]: + """Apply a POSIX alarm-based read timeout when available; no-op otherwise. + Windows Python does not expose ``signal.SIGALRM`` / ``signal.alarm``, so we + fall back to an unbounded read on those platforms rather than refusing to + parse the file at all. + """ + sigalrm = getattr(signal, "SIGALRM", None) + alarm = getattr(signal, "alarm", None) + if sigalrm is None or alarm is None: + yield + return -def _read_text_with_fallback( - file_path: Path, - encoding: str, - fallback_encoding: str | None, -) -> str: + old_handler = signal.getsignal(sigalrm) + signal.signal(sigalrm, _timeout_handler) + alarm(timeout) try: - return file_path.read_text(encoding=encoding) - except UnicodeDecodeError: - if fallback_encoding: - return file_path.read_text(encoding=fallback_encoding) - raise - - -def _handle_read_error(file_path: Path, error: Exception) -> None: - if isinstance(error, FileReadTimeoutError): - raise error - if isinstance(error, FileNotFoundError): - raise error - if isinstance(error, PermissionError): - raise FileReadError(f"Permission denied: {file_path}") from error - if isinstance(error, OSError): - raise FileReadError(f"Cannot read {file_path}: {error}") from error - raise error + yield + finally: + alarm(0) + signal.signal(sigalrm, old_handler) def safe_read_text( @@ -96,23 +87,14 @@ def safe_read_text( msg = f"File not found: {file_path}" raise FileNotFoundError(msg) - if not _signal_timeout_available(): - try: - return _read_text_with_fallback(file_path, encoding, fallback_encoding) - except Exception as e: - _handle_read_error(file_path, e) - raise - - sigalrm, alarm = _get_signal_timeout_hooks() - old_handler = signal.getsignal(sigalrm) - content: str | None = None try: - signal.signal(sigalrm, _timeout_handler) - alarm(timeout) - - content = _read_text_with_fallback(file_path, encoding, fallback_encoding) - - alarm(0) + with _read_timeout(timeout): + try: + return file_path.read_text(encoding=encoding) + except UnicodeDecodeError: + if fallback_encoding: + return file_path.read_text(encoding=fallback_encoding) + raise except FileReadTimeoutError as e: raise FileReadTimeoutError( f"Timed out reading {file_path} after {timeout}s " @@ -124,11 +106,6 @@ def safe_read_text( raise FileReadError(f"Permission denied: {file_path}") from e except OSError as e: raise FileReadError(f"Cannot read {file_path}: {e}") from e - finally: - alarm(0) - signal.signal(sigalrm, old_handler) - - return content def safe_read_bytes( @@ -154,21 +131,9 @@ def safe_read_bytes( msg = f"File not found: {file_path}" raise FileNotFoundError(msg) - if not _signal_timeout_available(): - try: - return file_path.read_bytes() - except Exception as e: - _handle_read_error(file_path, e) - raise - - sigalrm, alarm = _get_signal_timeout_hooks() - old_handler = signal.getsignal(sigalrm) - content: bytes | None = None try: - signal.signal(sigalrm, _timeout_handler) - alarm(timeout) - content = file_path.read_bytes() - alarm(0) + with _read_timeout(timeout): + return file_path.read_bytes() except FileReadTimeoutError as e: raise FileReadTimeoutError( f"Timed out reading {file_path} after {timeout}s " @@ -180,11 +145,6 @@ def safe_read_bytes( raise FileReadError(f"Permission denied: {file_path}") from e except OSError as e: raise FileReadError(f"Cannot read {file_path}: {e}") from e - finally: - alarm(0) - signal.signal(sigalrm, old_handler) - - return content class BaseParser(ABC): diff --git a/tests/test_cli.py b/tests/test_cli.py index 13f6355..bd42a9b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,12 +3,15 @@ from pathlib import Path from typing import Any +import pytest from typer.testing import CliRunner from librarian import cli -def test_add_directory_exits_nonzero_when_indexing_errors(tmp_path: Path, monkeypatch) -> None: +def test_add_directory_exits_nonzero_when_indexing_errors( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: """Directory add should fail automation when indexing reports errors.""" docs_dir = tmp_path / "docs" docs_dir.mkdir() @@ -34,6 +37,8 @@ async def fake_server_ingest(context: Any, directory: str) -> dict[str, Any]: monkeypatch.setattr("librarian.server.index_directory_to_library", fake_server_ingest) + # Force a wide terminal so Rich does not wrap the error message off-screen. + monkeypatch.setenv("COLUMNS", "500") result = CliRunner().invoke(cli.app, ["add", str(docs_dir), "--verbose"]) assert result.exit_code == 1 From 3a265993a022af2fc86e4434e622ea7dd5ab7582 Mon Sep 17 00:00:00 2001 From: jottakka Date: Mon, 11 May 2026 01:14:45 -0300 Subject: [PATCH 3/6] ci: run tests on Windows to catch POSIX-only regressions Add a Windows smoke run (Python 3.11) to the test matrix so that POSIX-only regressions like the SIGALRM markdown-indexing bug fixed in this PR get caught at PR time instead of in production. Ubuntu keeps the full 3.10/3.11/3.12 matrix; Codecov upload stays restricted to the single ubuntu/3.11 job. `fail-fast: false` so a Windows failure does not cancel the Linux matrix. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 816322c..9117e15 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,10 +37,16 @@ jobs: test: name: Tests - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: + os: [ubuntu-latest] python-version: ["3.10", "3.11", "3.12"] + include: + # Smoke-test on Windows to catch POSIX-only regressions (e.g. SIGALRM). + - os: windows-latest + python-version: "3.11" steps: - uses: actions/checkout@v4 @@ -65,7 +71,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 - if: matrix.python-version == '3.11' + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11' with: files: ./coverage.xml fail_ci_if_error: false From 4fee34d52d41fbbc13942558c08b4ae1d0440a8c Mon Sep 17 00:00:00 2001 From: jottakka Date: Mon, 11 May 2026 12:09:04 -0300 Subject: [PATCH 4/6] ci: add macOS to the test matrix Add a macOS smoke run (Python 3.11) alongside the new Windows job. macOS is a primary developer platform for this project, so catching Darwin-specific quirks (BSD libc differences, case-insensitive filesystem, signal behavior) at PR time is worth the extra runner. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9117e15..afa914d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,6 +47,9 @@ jobs: # Smoke-test on Windows to catch POSIX-only regressions (e.g. SIGALRM). - os: windows-latest python-version: "3.11" + # Smoke-test on macOS — primary developer platform, catches BSD/Darwin quirks. + - os: macos-latest + python-version: "3.11" steps: - uses: actions/checkout@v4 From fb7515ba7d72893be441468360d4ae2ebe58d292 Mon Sep 17 00:00:00 2001 From: jottakka Date: Mon, 11 May 2026 12:44:33 -0300 Subject: [PATCH 5/6] ci: use uv-managed Python so sqlite-vec loads on macOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The macOS test job was failing with: AttributeError: 'sqlite3.Connection' object has no attribute 'enable_load_extension' librarian's storage layer calls `conn.enable_load_extension(True)` to load the sqlite-vec extension. That method only exists when CPython was compiled with `--enable-loadable-sqlite-extensions`. The python.org build that `actions/setup-python` ships on macOS is compiled without that flag, so any test path that touches the database fails — 24 unrelated failures, all the same root cause. Switch the test job to uv-managed Python (python-build-standalone), which enables loadable sqlite extensions on every platform. This is a single `python-version` input on `astral-sh/setup-uv`, so it also removes the now-redundant `actions/setup-python` step. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index afa914d..330f010 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,14 +53,14 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install uv + - name: Install uv and Python ${{ matrix.python-version }} uses: astral-sh/setup-uv@v7 with: version: "latest" - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: + # uv-managed Python (python-build-standalone) is compiled with + # --enable-loadable-sqlite-extensions on every platform, which the + # sqlite-vec extension requires. The python.org build used by + # actions/setup-python on macOS does not enable this. python-version: ${{ matrix.python-version }} - name: Install dependencies From 4420171248a7cb2f7ab97bdcc34615bd758719eb Mon Sep 17 00:00:00 2001 From: jottakka Date: Mon, 11 May 2026 12:48:02 -0300 Subject: [PATCH 6/6] ci: force uv to use python-build-standalone for sqlite-vec The previous attempt removed actions/setup-python but uv still resolved the runner's pre-installed Python (/usr/local/bin/python3.11) because the version matched the matrix entry. That Python is the python.org build, compiled without --enable-loadable-sqlite-extensions, so sqlite-vec failed to load and 24 tests crashed on macOS. Add --python-preference=only-managed to the install step so uv downloads a python-build-standalone interpreter instead. Those builds enable loadable sqlite extensions on every platform. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 330f010..60910e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,14 +57,15 @@ jobs: uses: astral-sh/setup-uv@v7 with: version: "latest" - # uv-managed Python (python-build-standalone) is compiled with - # --enable-loadable-sqlite-extensions on every platform, which the - # sqlite-vec extension requires. The python.org build used by - # actions/setup-python on macOS does not enable this. python-version: ${{ matrix.python-version }} - name: Install dependencies - run: uv sync --extra dev + # Force uv-managed (python-build-standalone) Python. The runner's + # pre-installed Python on macOS is compiled without + # --enable-loadable-sqlite-extensions, so sqlite-vec fails to load. + # python-build-standalone enables loadable sqlite extensions on + # every platform. + run: uv sync --extra dev --python-preference=only-managed - name: Run tests run: uv run pytest -v --cov --cov-config=pyproject.toml --cov-report=xml