IranTransitionProject
diff --git a/.github/workflows/ci.yml
Lines changed: 2 additions & 1 deletion b/.github/workflows/ci.yml
Lines changed: 2 additions & 1 deletion
diff --git a/pyproject.toml
Lines changed: 31 additions & 0 deletions b/pyproject.toml
Lines changed: 31 additions & 0 deletions
diff --git a/src/docman/backends/__init__.py
Lines changed: 2 additions & 1 deletion b/src/docman/backends/__init__.py
Lines changed: 2 additions & 1 deletion
diff --git a/src/docman/backends/docling_backend.py
Lines changed: 23 additions & 17 deletions b/src/docman/backends/docling_backend.py
Lines changed: 23 additions & 17 deletions
diff --git a/src/docman/backends/duckdb_ingest.py
Lines changed: 6 additions & 9 deletions b/src/docman/backends/duckdb_ingest.py
Lines changed: 6 additions & 9 deletions
diff --git a/src/docman/backends/duckdb_query.py
Lines changed: 18 additions & 5 deletions b/src/docman/backends/duckdb_query.py
Lines changed: 18 additions & 5 deletions
diff --git a/src/docman/tools/vector_search.py
Lines changed: 8 additions & 2 deletions b/src/docman/tools/vector_search.py
Lines changed: 8 additions & 2 deletions
diff --git a/tests/test_docling_backend.py
Lines changed: 8 additions & 6 deletions b/tests/test_docling_backend.py
Lines changed: 8 additions & 6 deletions
@@ -23,7 +23,8 @@ jobs:
       # Symlink so [tool.uv.sources] path "../loom" resolves in CI
       - run: ln -s "$GITHUB_WORKSPACE/loom" "$GITHUB_WORKSPACE/../loom"
       - run: uv sync --extra dev
-      - run: uv run ruff check src/
+      - run: uv run ruff check src/ tests/
+      - run: uv run ruff format --check src/ tests/
 
   test:
     runs-on: ubuntu-latest
 
@@ -40,5 +40,36 @@ exclude_lines = [
     "raise NotImplementedError",
 ]
 
+[tool.ruff]  # settings shared by `ruff check` and `ruff format`
+line-length = 100  # wrap limit used by the formatter and the line-too-long lint
+target-version = "py311"  # minimum Python version Ruff assumes when suggesting syntax upgrades
+src = ["src", "tests"]  # source roots used to resolve first-party imports
+
+[tool.ruff.lint]  # rule selection for `ruff check`
+select = [
+    "F", "E", "W", "I", "N", "UP", "B", "A", "C4", "SIM",  # pyflakes, pycodestyle, isort, pep8-naming, pyupgrade, bugbear, builtins, comprehensions, simplify
+    "TC", "RUF", "D", "ANN", "PT", "RET", "ARG", "PL",  # type-checking imports, Ruff-specific, pydocstyle, annotations, pytest-style, return, unused-arguments, pylint
+    "PERF", "LOG", "T20",  # perflint, flake8-logging, flake8-print
+]
+ignore = [
+    "D100", "D104", "D203", "D212", "D105", "D107",  # no mandatory module/package/magic-method/__init__ docstrings; D203/D212 docstring-layout rules disabled
+    "ANN401", "ARG002", "PLR0913", "PLR2004",  # allow `Any`, unused method arguments, many-argument functions, magic-value comparisons
+    "B008", "PLC0415",  # allow call expressions in defaults and function-scope imports (the backends import docling lazily)
+]
+
+[tool.ruff.lint.per-file-ignores]  # tests are exempt from docstring, annotation, print and pylint-refactor rules, plus the individual codes listed
+"tests/**/*.py" = ["D", "ANN", "ARG", "T20", "PLR", "SIM105", "PT011", "B007", "RUF059", "RUF012", "PERF102", "RET504", "N806"]
+
+[tool.ruff.lint.pydocstyle]  # docstring section style checked by the "D" rules
+convention = "google"
+
+[tool.ruff.lint.isort]  # import sorting used by the "I" rules
+known-first-party = ["docman"]  # always classify `docman` imports as first-party
+
+[tool.ruff.format]  # settings for `ruff format`
+quote-style = "double"  # prefer double-quoted strings
+indent-style = "space"  # indent with spaces rather than tabs
+line-ending = "auto"  # detect the newline style per file
+
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
@@ -1,6 +1,7 @@
 """Docman processing backends for Loom pipeline stages."""
+
 from docman.backends.docling_backend import DoclingBackend
 from docman.backends.duckdb_ingest import DuckDBIngestBackend
 from docman.backends.duckdb_query import DocmanQueryBackend, DuckDBQueryBackend
 
-__all__ = ["DoclingBackend", "DuckDBIngestBackend", "DocmanQueryBackend", "DuckDBQueryBackend"]
+__all__ = ["DoclingBackend", "DocmanQueryBackend", "DuckDBIngestBackend", "DuckDBQueryBackend"]
@@ -25,12 +25,13 @@
     do_ocr:            bool (default: true)
     do_table_structure: bool (default: true)
 
-See also:
+See Also:
     configs/workers/doc_extractor.yaml -- worker config with I/O schemas
     docs/docling-setup.md -- full Docling configuration and tuning guide
     loom.worker.processor.SyncProcessingBackend -- base class for sync backends
     loom.core.workspace.WorkspaceManager -- file-ref resolution with path safety
 """
+
 from __future__ import annotations
 
 import logging
@@ -115,13 +116,11 @@ def process_sync(self, payload: dict[str, Any], config: dict[str, Any]) -> dict[
         except DoclingConversionError:
             raise
         except Exception as exc:
-            raise DoclingConversionError(
-                f"Failed to extract '{file_ref}': {exc}"
-            ) from exc
+            raise DoclingConversionError(f"Failed to extract '{file_ref}': {exc}") from exc
 
         return {"output": result, "model_used": "docling"}
 
-    def _build_converter(self, config: dict[str, Any]):
+    def _build_converter(self, config: dict[str, Any]) -> Any:
         """Build a Docling DocumentConverter with settings from backend_config.
 
         Constructs the converter with accelerator, OCR, and table structure
@@ -140,16 +139,16 @@ def _build_converter(self, config: dict[str, Any]):
             A configured ``docling.document_converter.DocumentConverter``
             instance ready to process PDF and DOCX files.
 
-        See also:
+        See Also:
             docs/docling-setup.md -- full Docling configuration reference.
         """
-        from docling.document_converter import DocumentConverter, PdfFormatOption
+        from docling.datamodel.base_models import InputFormat
         from docling.datamodel.pipeline_options import (
-            PdfPipelineOptions,
             AcceleratorOptions,
+            PdfPipelineOptions,
             TableStructureOptions,
         )
-        from docling.datamodel.base_models import InputFormat
+        from docling.document_converter import DocumentConverter, PdfFormatOption
 
         # --- Accelerator options ---
         device = config.get("device", "auto")
@@ -164,15 +163,20 @@ def _build_converter(self, config: dict[str, Any]):
         do_ocr = config.get("do_ocr", True)
         ocr_options = None
         if do_ocr:
-            ocr_engine = config.get("ocr_engine", "ocrmac" if platform.system() == "Darwin" else "easyocr")
+            ocr_engine = config.get(
+                "ocr_engine", "ocrmac" if platform.system() == "Darwin" else "easyocr"
+            )
             if ocr_engine == "ocrmac":
                 from docling.datamodel.pipeline_options import OcrMacOptions
+
                 ocr_options = OcrMacOptions(recognition="accurate")
             elif ocr_engine == "easyocr":
                 from docling.datamodel.pipeline_options import EasyOcrOptions
+
                 ocr_options = EasyOcrOptions()
             elif ocr_engine == "tesseract":
                 from docling.datamodel.pipeline_options import TesseractOcrOptions
+
                 ocr_options = TesseractOcrOptions()
 
         # --- Table structure ---
@@ -203,7 +207,9 @@ def _build_converter(self, config: dict[str, Any]):
             },
         )
 
-    def _extract(self, source_path: Path, ws: WorkspaceManager, config: dict[str, Any]) -> dict[str, Any]:
+    def _extract(
+        self, source_path: Path, ws: WorkspaceManager, config: dict[str, Any]
+    ) -> dict[str, Any]:
         """Run synchronous Docling extraction.
 
         Docling and its heavy dependencies (torch, torchvision) are imported
@@ -243,15 +249,15 @@ def _extract(self, source_path: Path, ws: WorkspaceManager, config: dict[str, An
 
         # --- Gather structural metadata ---
         # Collect section headers and titles for downstream classification.
-        sections: list[str] = []
-        for item in doc.iterate_items():
-            if hasattr(item, "label") and item.label in ("section_header", "title"):
-                sections.append(item.text if hasattr(item, "text") else str(item))
+        sections: list[str] = [
+            item.text if hasattr(item, "text") else str(item)
+            for item in doc.iterate_items()
+            if hasattr(item, "label") and item.label in ("section_header", "title")
+        ]
 
         # Check whether the document contains any tables.
         has_tables = any(
-            hasattr(item, "label") and item.label == "table"
-            for item in doc.iterate_items()
+            hasattr(item, "label") and item.label == "table" for item in doc.iterate_items()
         )
 
         # Page count -- Docling exposes a .pages list on most document types.
 
@@ -16,23 +16,24 @@
         classification, summary).
 Output: {"document_id": str, "status": "inserted", "source_file": str}
 
-See also:
+See Also:
     configs/workers/doc_ingest.yaml -- worker config with I/O schemas
     src/docman/backends/duckdb_query.py -- query/analytics backend
     loom.worker.processor.SyncProcessingBackend -- base class for sync backends
     loom.core.workspace.WorkspaceManager -- file-ref resolution with path safety
 """
+
 from __future__ import annotations
 
 import asyncio
+import contextlib
 import json
 import logging
 import uuid
 from pathlib import Path
 from typing import Any
 
 import duckdb
-
 from loom.core.workspace import WorkspaceManager
 from loom.worker.processor import BackendError, SyncProcessingBackend
 
@@ -157,9 +158,7 @@ def _read_full_text(self, file_ref: str | None, config: dict[str, Any]) -> str:
             )
             return ""
 
-    def _generate_embedding(
-        self, full_text: str, config: dict[str, Any]
-    ) -> list[float] | None:
+    def _generate_embedding(self, full_text: str, config: dict[str, Any]) -> list[float] | None:
         """Generate a vector embedding for the document text.
 
         Uses the Ollama embedding provider when ``embedding`` config is
@@ -237,16 +236,14 @@ def _ensure_schema(self, conn: duckdb.DuckDBPyConnection) -> None:
         # already exists.
         conn.execute("INSTALL fts")
         conn.execute("LOAD fts")
-        try:
+        with contextlib.suppress(duckdb.Error):
+            # Index may already exist or table may be empty; both are fine.
             conn.execute("""
                 PRAGMA create_fts_index(
                     'documents', 'id', 'full_text', 'summary', 'text_preview',
                     overwrite=1
                 )
             """)
-        except duckdb.Error:
-            # Index may already exist or table may be empty; both are fine.
-            pass
 
         # Create summary view for LLM tool access (excludes full_text).
         conn.execute("""
 
@@ -7,23 +7,36 @@
 Input:  {"action": "search|filter|stats|get|vector_search", ...action-specific params}
 Output: {"results": [...], "total": int} or {"document": {...}}
 
-See also:
+See Also:
     configs/workers/doc_query.yaml -- worker config with I/O schemas
     src/docman/backends/duckdb_ingest.py -- ingestion backend
 """
+
 from __future__ import annotations
 
 from loom.contrib.duckdb import (
     DuckDBQueryBackend as _BaseDuckDBQueryBackend,
+)
+from loom.contrib.duckdb import (
     DuckDBQueryError,
 )
 
 # Columns returned in search/filter results. Excludes full_text to keep
 # NATS messages small — use the "get" action to retrieve full content.
 _RESULT_COLUMNS = [
-    "id", "source_file", "file_ref", "page_count", "has_tables",
-    "sections", "document_type", "classification_confidence",
-    "summary", "key_points", "word_count", "text_preview", "ingested_at",
+    "id",
+    "source_file",
+    "file_ref",
+    "page_count",
+    "has_tables",
+    "sections",
+    "document_type",
+    "classification_confidence",
+    "summary",
+    "key_points",
+    "word_count",
+    "text_preview",
+    "ingested_at",
 ]
 
 
@@ -59,4 +72,4 @@ def __init__(self, db_path: str = "/tmp/docman-workspace/docman.duckdb") -> None
 #   processing_backend: "docman.backends.duckdb_query.DuckDBQueryBackend"
 DuckDBQueryBackend = DocmanQueryBackend
 
-__all__ = ["DuckDBQueryBackend", "DocmanQueryBackend", "DuckDBQueryError"]
+__all__ = ["DocmanQueryBackend", "DuckDBQueryBackend", "DuckDBQueryError"]
@@ -14,14 +14,20 @@
           db_path: "/tmp/docman-workspace/docman.duckdb"
           description: "Find documents semantically similar to a query"
 """
+
 from __future__ import annotations
 
 from loom.contrib.duckdb import DuckDBVectorTool as _BaseDuckDBVectorTool
 
 # Docman-specific columns returned in search results.
 _DOCMAN_RESULT_COLUMNS = [
-    "id", "source_file", "document_type", "summary",
-    "page_count", "has_tables", "ingested_at",
+    "id",
+    "source_file",
+    "document_type",
+    "summary",
+    "page_count",
+    "has_tables",
+    "ingested_at",
 ]
 
 
 
@@ -13,19 +13,20 @@
 directly (no asyncio needed for unit tests). The thread-pool offloading
 is tested via SyncProcessingBackend's own tests in Loom.
 """
-import json
 
-import pytest
+import json
 from unittest.mock import MagicMock, patch
 
-from docman.backends.docling_backend import DoclingBackend, DoclingConversionError
+import pytest
 from loom.worker.processor import BackendError
 
+from docman.backends.docling_backend import DoclingBackend, DoclingConversionError
 
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
 
+
 @pytest.fixture
 def workspace(tmp_path):
     """Provide an isolated temporary workspace directory for each test."""
@@ -42,6 +43,7 @@ def backend(workspace):
 # Input validation tests
 # ---------------------------------------------------------------------------
 
+
 class TestInputValidation:
     """Tests for pre-extraction input validation (no Docling interaction)."""
 
@@ -70,6 +72,7 @@ def test_docling_conversion_error_is_backend_error(self):
 # Happy-path extraction test
 # ---------------------------------------------------------------------------
 
+
 class TestExtraction:
     """Tests that verify the full extraction flow with mocked Docling."""
 
@@ -206,6 +209,7 @@ def test_sections_capped_at_twenty(self, mock_build, backend, workspace):
 # Error handling tests
 # ---------------------------------------------------------------------------
 
+
 class TestErrorHandling:
     """Tests that verify Docling and I/O failures produce DoclingConversionError."""
 
@@ -228,9 +232,7 @@ def test_docling_conversion_failure_raises_conversion_error(
             )
 
     @patch("docman.backends.docling_backend.DoclingBackend._build_converter")
-    def test_write_failure_raises_conversion_error(
-        self, mock_build, backend, workspace
-    ):
+    def test_write_failure_raises_conversion_error(self, mock_build, backend, workspace):
         """When writing the extracted JSON fails (disk full, permissions),
         the backend should wrap the OSError in DoclingConversionError."""
         (workspace / "good.pdf").write_bytes(b"%PDF-1.4 content")