2525 do_ocr: bool (default: true)
2626 do_table_structure: bool (default: true)
2727
28- See also :
28+ See Also :
2929 configs/workers/doc_extractor.yaml -- worker config with I/O schemas
3030 docs/docling-setup.md -- full Docling configuration and tuning guide
3131 loom.worker.processor.SyncProcessingBackend -- base class for sync backends
3232 loom.core.workspace.WorkspaceManager -- file-ref resolution with path safety
3333"""
34+
3435from __future__ import annotations
3536
3637import logging
@@ -115,13 +116,11 @@ def process_sync(self, payload: dict[str, Any], config: dict[str, Any]) -> dict[
115116 except DoclingConversionError :
116117 raise
117118 except Exception as exc :
118- raise DoclingConversionError (
119- f"Failed to extract '{ file_ref } ': { exc } "
120- ) from exc
119+ raise DoclingConversionError (f"Failed to extract '{ file_ref } ': { exc } " ) from exc
121120
122121 return {"output" : result , "model_used" : "docling" }
123122
124- def _build_converter (self , config : dict [str , Any ]):
123+ def _build_converter (self , config : dict [str , Any ]) -> Any :
125124 """Build a Docling DocumentConverter with settings from backend_config.
126125
127126 Constructs the converter with accelerator, OCR, and table structure
@@ -140,16 +139,16 @@ def _build_converter(self, config: dict[str, Any]):
140139 A configured ``docling.document_converter.DocumentConverter``
141140 instance ready to process PDF and DOCX files.
142141
143- See also :
142+ See Also :
144143 docs/docling-setup.md -- full Docling configuration reference.
145144 """
146- from docling .document_converter import DocumentConverter , PdfFormatOption
145+ from docling .datamodel . base_models import InputFormat
147146 from docling .datamodel .pipeline_options import (
148- PdfPipelineOptions ,
149147 AcceleratorOptions ,
148+ PdfPipelineOptions ,
150149 TableStructureOptions ,
151150 )
152- from docling .datamodel . base_models import InputFormat
151+ from docling .document_converter import DocumentConverter , PdfFormatOption
153152
154153 # --- Accelerator options ---
155154 device = config .get ("device" , "auto" )
@@ -164,15 +163,20 @@ def _build_converter(self, config: dict[str, Any]):
164163 do_ocr = config .get ("do_ocr" , True )
165164 ocr_options = None
166165 if do_ocr :
167- ocr_engine = config .get ("ocr_engine" , "ocrmac" if platform .system () == "Darwin" else "easyocr" )
166+ ocr_engine = config .get (
167+ "ocr_engine" , "ocrmac" if platform .system () == "Darwin" else "easyocr"
168+ )
168169 if ocr_engine == "ocrmac" :
169170 from docling .datamodel .pipeline_options import OcrMacOptions
171+
170172 ocr_options = OcrMacOptions (recognition = "accurate" )
171173 elif ocr_engine == "easyocr" :
172174 from docling .datamodel .pipeline_options import EasyOcrOptions
175+
173176 ocr_options = EasyOcrOptions ()
174177 elif ocr_engine == "tesseract" :
175178 from docling .datamodel .pipeline_options import TesseractOcrOptions
179+
176180 ocr_options = TesseractOcrOptions ()
177181
178182 # --- Table structure ---
@@ -203,7 +207,9 @@ def _build_converter(self, config: dict[str, Any]):
203207 },
204208 )
205209
206- def _extract (self , source_path : Path , ws : WorkspaceManager , config : dict [str , Any ]) -> dict [str , Any ]:
210+ def _extract (
211+ self , source_path : Path , ws : WorkspaceManager , config : dict [str , Any ]
212+ ) -> dict [str , Any ]:
207213 """Run synchronous Docling extraction.
208214
209215 Docling and its heavy dependencies (torch, torchvision) are imported
@@ -243,15 +249,15 @@ def _extract(self, source_path: Path, ws: WorkspaceManager, config: dict[str, An
243249
244250 # --- Gather structural metadata ---
245251 # Collect section headers and titles for downstream classification.
246- sections : list [str ] = []
247- for item in doc .iterate_items ():
248- if hasattr (item , "label" ) and item .label in ("section_header" , "title" ):
249- sections .append (item .text if hasattr (item , "text" ) else str (item ))
252+ sections : list [str ] = [
253+ item .text if hasattr (item , "text" ) else str (item )
254+ for item in doc .iterate_items ()
255+ if hasattr (item , "label" ) and item .label in ("section_header" , "title" )
256+ ]
250257
251258 # Check whether the document contains any tables.
252259 has_tables = any (
253- hasattr (item , "label" ) and item .label == "table"
254- for item in doc .iterate_items ()
260+ hasattr (item , "label" ) and item .label == "table" for item in doc .iterate_items ()
255261 )
256262
257263 # Page count -- Docling exposes a .pages list on most document types.
0 commit comments