Skip to content

Commit e71507c

Browse files
hoomanclaude
and committed
Add comprehensive ruff linting config and fix all violations
- Added ruff lint config to pyproject.toml matching loom's standards: 20+ rule categories (pydocstyle, annotations, bugbear, pylint, etc.) with Google docstring convention, per-file test relaxations, and isort - Fixed all lint violations across src/ and tests/: import sorting, docstring formatting, contextlib.suppress, list comprehensions, type annotations - Updated CI to also run ruff format --check and lint tests/ - All 63 tests passing, zero lint violations Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1eb03d1 commit e71507c

14 files changed

Lines changed: 201 additions & 129 deletions

.github/workflows/ci.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,8 @@ jobs:
2323
# Symlink so [tool.uv.sources] path "../loom" resolves in CI
2424
- run: ln -s "$GITHUB_WORKSPACE/loom" "$GITHUB_WORKSPACE/../loom"
2525
- run: uv sync --extra dev
26-
- run: uv run ruff check src/
26+
- run: uv run ruff check src/ tests/
27+
- run: uv run ruff format --check src/ tests/
2728

2829
test:
2930
runs-on: ubuntu-latest

pyproject.toml

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,5 +40,36 @@ exclude_lines = [
4040
"raise NotImplementedError",
4141
]
4242

43+
[tool.ruff]
44+
line-length = 100
45+
target-version = "py311"
46+
src = ["src", "tests"]
47+
48+
[tool.ruff.lint]
49+
select = [
50+
"F", "E", "W", "I", "N", "UP", "B", "A", "C4", "SIM",
51+
"TC", "RUF", "D", "ANN", "PT", "RET", "ARG", "PL",
52+
"PERF", "LOG", "T20",
53+
]
54+
ignore = [
55+
"D100", "D104", "D203", "D212", "D105", "D107",
56+
"ANN401", "ARG002", "PLR0913", "PLR2004",
57+
"B008", "PLC0415",
58+
]
59+
60+
[tool.ruff.lint.per-file-ignores]
61+
"tests/**/*.py" = ["D", "ANN", "ARG", "T20", "PLR", "SIM105", "PT011", "B007", "RUF059", "RUF012", "PERF102", "RET504", "N806"]
62+
63+
[tool.ruff.lint.pydocstyle]
64+
convention = "google"
65+
66+
[tool.ruff.lint.isort]
67+
known-first-party = ["docman"]
68+
69+
[tool.ruff.format]
70+
quote-style = "double"
71+
indent-style = "space"
72+
line-ending = "auto"
73+
4374
[tool.pytest.ini_options]
4475
asyncio_mode = "auto"

src/docman/backends/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
"""Docman processing backends for Loom pipeline stages."""
2+
23
from docman.backends.docling_backend import DoclingBackend
34
from docman.backends.duckdb_ingest import DuckDBIngestBackend
45
from docman.backends.duckdb_query import DocmanQueryBackend, DuckDBQueryBackend
56

6-
__all__ = ["DoclingBackend", "DuckDBIngestBackend", "DocmanQueryBackend", "DuckDBQueryBackend"]
7+
__all__ = ["DoclingBackend", "DocmanQueryBackend", "DuckDBIngestBackend", "DuckDBQueryBackend"]

src/docman/backends/docling_backend.py

Lines changed: 23 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -25,12 +25,13 @@
2525
do_ocr: bool (default: true)
2626
do_table_structure: bool (default: true)
2727
28-
See also:
28+
See Also:
2929
configs/workers/doc_extractor.yaml -- worker config with I/O schemas
3030
docs/docling-setup.md -- full Docling configuration and tuning guide
3131
loom.worker.processor.SyncProcessingBackend -- base class for sync backends
3232
loom.core.workspace.WorkspaceManager -- file-ref resolution with path safety
3333
"""
34+
3435
from __future__ import annotations
3536

3637
import logging
@@ -115,13 +116,11 @@ def process_sync(self, payload: dict[str, Any], config: dict[str, Any]) -> dict[
115116
except DoclingConversionError:
116117
raise
117118
except Exception as exc:
118-
raise DoclingConversionError(
119-
f"Failed to extract '{file_ref}': {exc}"
120-
) from exc
119+
raise DoclingConversionError(f"Failed to extract '{file_ref}': {exc}") from exc
121120

122121
return {"output": result, "model_used": "docling"}
123122

124-
def _build_converter(self, config: dict[str, Any]):
123+
def _build_converter(self, config: dict[str, Any]) -> Any:
125124
"""Build a Docling DocumentConverter with settings from backend_config.
126125
127126
Constructs the converter with accelerator, OCR, and table structure
@@ -140,16 +139,16 @@ def _build_converter(self, config: dict[str, Any]):
140139
A configured ``docling.document_converter.DocumentConverter``
141140
instance ready to process PDF and DOCX files.
142141
143-
See also:
142+
See Also:
144143
docs/docling-setup.md -- full Docling configuration reference.
145144
"""
146-
from docling.document_converter import DocumentConverter, PdfFormatOption
145+
from docling.datamodel.base_models import InputFormat
147146
from docling.datamodel.pipeline_options import (
148-
PdfPipelineOptions,
149147
AcceleratorOptions,
148+
PdfPipelineOptions,
150149
TableStructureOptions,
151150
)
152-
from docling.datamodel.base_models import InputFormat
151+
from docling.document_converter import DocumentConverter, PdfFormatOption
153152

154153
# --- Accelerator options ---
155154
device = config.get("device", "auto")
@@ -164,15 +163,20 @@ def _build_converter(self, config: dict[str, Any]):
164163
do_ocr = config.get("do_ocr", True)
165164
ocr_options = None
166165
if do_ocr:
167-
ocr_engine = config.get("ocr_engine", "ocrmac" if platform.system() == "Darwin" else "easyocr")
166+
ocr_engine = config.get(
167+
"ocr_engine", "ocrmac" if platform.system() == "Darwin" else "easyocr"
168+
)
168169
if ocr_engine == "ocrmac":
169170
from docling.datamodel.pipeline_options import OcrMacOptions
171+
170172
ocr_options = OcrMacOptions(recognition="accurate")
171173
elif ocr_engine == "easyocr":
172174
from docling.datamodel.pipeline_options import EasyOcrOptions
175+
173176
ocr_options = EasyOcrOptions()
174177
elif ocr_engine == "tesseract":
175178
from docling.datamodel.pipeline_options import TesseractOcrOptions
179+
176180
ocr_options = TesseractOcrOptions()
177181

178182
# --- Table structure ---
@@ -203,7 +207,9 @@ def _build_converter(self, config: dict[str, Any]):
203207
},
204208
)
205209

206-
def _extract(self, source_path: Path, ws: WorkspaceManager, config: dict[str, Any]) -> dict[str, Any]:
210+
def _extract(
211+
self, source_path: Path, ws: WorkspaceManager, config: dict[str, Any]
212+
) -> dict[str, Any]:
207213
"""Run synchronous Docling extraction.
208214
209215
Docling and its heavy dependencies (torch, torchvision) are imported
@@ -243,15 +249,15 @@ def _extract(self, source_path: Path, ws: WorkspaceManager, config: dict[str, An
243249

244250
# --- Gather structural metadata ---
245251
# Collect section headers and titles for downstream classification.
246-
sections: list[str] = []
247-
for item in doc.iterate_items():
248-
if hasattr(item, "label") and item.label in ("section_header", "title"):
249-
sections.append(item.text if hasattr(item, "text") else str(item))
252+
sections: list[str] = [
253+
item.text if hasattr(item, "text") else str(item)
254+
for item in doc.iterate_items()
255+
if hasattr(item, "label") and item.label in ("section_header", "title")
256+
]
250257

251258
# Check whether the document contains any tables.
252259
has_tables = any(
253-
hasattr(item, "label") and item.label == "table"
254-
for item in doc.iterate_items()
260+
hasattr(item, "label") and item.label == "table" for item in doc.iterate_items()
255261
)
256262

257263
# Page count -- Docling exposes a .pages list on most document types.

src/docman/backends/duckdb_ingest.py

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -16,23 +16,24 @@
1616
classification, summary).
1717
Output: {"document_id": str, "status": "inserted", "source_file": str}
1818
19-
See also:
19+
See Also:
2020
configs/workers/doc_ingest.yaml -- worker config with I/O schemas
2121
src/docman/backends/duckdb_query.py -- query/analytics backend
2222
loom.worker.processor.SyncProcessingBackend -- base class for sync backends
2323
loom.core.workspace.WorkspaceManager -- file-ref resolution with path safety
2424
"""
25+
2526
from __future__ import annotations
2627

2728
import asyncio
29+
import contextlib
2830
import json
2931
import logging
3032
import uuid
3133
from pathlib import Path
3234
from typing import Any
3335

3436
import duckdb
35-
3637
from loom.core.workspace import WorkspaceManager
3738
from loom.worker.processor import BackendError, SyncProcessingBackend
3839

@@ -157,9 +158,7 @@ def _read_full_text(self, file_ref: str | None, config: dict[str, Any]) -> str:
157158
)
158159
return ""
159160

160-
def _generate_embedding(
161-
self, full_text: str, config: dict[str, Any]
162-
) -> list[float] | None:
161+
def _generate_embedding(self, full_text: str, config: dict[str, Any]) -> list[float] | None:
163162
"""Generate a vector embedding for the document text.
164163
165164
Uses the Ollama embedding provider when ``embedding`` config is
@@ -237,16 +236,14 @@ def _ensure_schema(self, conn: duckdb.DuckDBPyConnection) -> None:
237236
# already exists.
238237
conn.execute("INSTALL fts")
239238
conn.execute("LOAD fts")
240-
try:
239+
with contextlib.suppress(duckdb.Error):
240+
# Index may already exist or table may be empty; both are fine.
241241
conn.execute("""
242242
PRAGMA create_fts_index(
243243
'documents', 'id', 'full_text', 'summary', 'text_preview',
244244
overwrite=1
245245
)
246246
""")
247-
except duckdb.Error:
248-
# Index may already exist or table may be empty; both are fine.
249-
pass
250247

251248
# Create summary view for LLM tool access (excludes full_text).
252249
conn.execute("""

src/docman/backends/duckdb_query.py

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,23 +7,36 @@
77
Input: {"action": "search|filter|stats|get|vector_search", ...action-specific params}
88
Output: {"results": [...], "total": int} or {"document": {...}}
99
10-
See also:
10+
See Also:
1111
configs/workers/doc_query.yaml -- worker config with I/O schemas
1212
src/docman/backends/duckdb_ingest.py -- ingestion backend
1313
"""
14+
1415
from __future__ import annotations
1516

1617
from loom.contrib.duckdb import (
1718
DuckDBQueryBackend as _BaseDuckDBQueryBackend,
19+
)
20+
from loom.contrib.duckdb import (
1821
DuckDBQueryError,
1922
)
2023

2124
# Columns returned in search/filter results. Excludes full_text to keep
2225
# NATS messages small — use the "get" action to retrieve full content.
2326
_RESULT_COLUMNS = [
24-
"id", "source_file", "file_ref", "page_count", "has_tables",
25-
"sections", "document_type", "classification_confidence",
26-
"summary", "key_points", "word_count", "text_preview", "ingested_at",
27+
"id",
28+
"source_file",
29+
"file_ref",
30+
"page_count",
31+
"has_tables",
32+
"sections",
33+
"document_type",
34+
"classification_confidence",
35+
"summary",
36+
"key_points",
37+
"word_count",
38+
"text_preview",
39+
"ingested_at",
2740
]
2841

2942

@@ -59,4 +72,4 @@ def __init__(self, db_path: str = "/tmp/docman-workspace/docman.duckdb") -> None
5972
# processing_backend: "docman.backends.duckdb_query.DuckDBQueryBackend"
6073
DuckDBQueryBackend = DocmanQueryBackend
6174

62-
__all__ = ["DuckDBQueryBackend", "DocmanQueryBackend", "DuckDBQueryError"]
75+
__all__ = ["DocmanQueryBackend", "DuckDBQueryBackend", "DuckDBQueryError"]

src/docman/tools/vector_search.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,14 +14,20 @@
1414
db_path: "/tmp/docman-workspace/docman.duckdb"
1515
description: "Find documents semantically similar to a query"
1616
"""
17+
1718
from __future__ import annotations
1819

1920
from loom.contrib.duckdb import DuckDBVectorTool as _BaseDuckDBVectorTool
2021

2122
# Docman-specific columns returned in search results.
2223
_DOCMAN_RESULT_COLUMNS = [
23-
"id", "source_file", "document_type", "summary",
24-
"page_count", "has_tables", "ingested_at",
24+
"id",
25+
"source_file",
26+
"document_type",
27+
"summary",
28+
"page_count",
29+
"has_tables",
30+
"ingested_at",
2531
]
2632

2733

tests/test_docling_backend.py

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -13,19 +13,20 @@
1313
directly (no asyncio needed for unit tests). The thread-pool offloading
1414
is tested via SyncProcessingBackend's own tests in Loom.
1515
"""
16-
import json
1716

18-
import pytest
17+
import json
1918
from unittest.mock import MagicMock, patch
2019

21-
from docman.backends.docling_backend import DoclingBackend, DoclingConversionError
20+
import pytest
2221
from loom.worker.processor import BackendError
2322

23+
from docman.backends.docling_backend import DoclingBackend, DoclingConversionError
2424

2525
# ---------------------------------------------------------------------------
2626
# Fixtures
2727
# ---------------------------------------------------------------------------
2828

29+
2930
@pytest.fixture
3031
def workspace(tmp_path):
3132
"""Provide an isolated temporary workspace directory for each test."""
@@ -42,6 +43,7 @@ def backend(workspace):
4243
# Input validation tests
4344
# ---------------------------------------------------------------------------
4445

46+
4547
class TestInputValidation:
4648
"""Tests for pre-extraction input validation (no Docling interaction)."""
4749

@@ -70,6 +72,7 @@ def test_docling_conversion_error_is_backend_error(self):
7072
# Happy-path extraction test
7173
# ---------------------------------------------------------------------------
7274

75+
7376
class TestExtraction:
7477
"""Tests that verify the full extraction flow with mocked Docling."""
7578

@@ -206,6 +209,7 @@ def test_sections_capped_at_twenty(self, mock_build, backend, workspace):
206209
# Error handling tests
207210
# ---------------------------------------------------------------------------
208211

212+
209213
class TestErrorHandling:
210214
"""Tests that verify Docling and I/O failures produce DoclingConversionError."""
211215

@@ -228,9 +232,7 @@ def test_docling_conversion_failure_raises_conversion_error(
228232
)
229233

230234
@patch("docman.backends.docling_backend.DoclingBackend._build_converter")
231-
def test_write_failure_raises_conversion_error(
232-
self, mock_build, backend, workspace
233-
):
235+
def test_write_failure_raises_conversion_error(self, mock_build, backend, workspace):
234236
"""When writing the extracted JSON fails (disk full, permissions),
235237
the backend should wrap the OSError in DoclingConversionError."""
236238
(workspace / "good.pdf").write_bytes(b"%PDF-1.4 content")

0 commit comments

Comments (0)