diff --git a/CHANGELOG.md b/CHANGELOG.md index 7da673e..fce799f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,23 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project uses **CalVer `YY.M.PP`** (PEP 440 may normalise patch numbers for the Python wheel — e.g. `26.06.00` → `26.6.0`). +## [26.6.5] - 2026-06-12 + +### Fixed + +- **Classifier-off + no `expected_type` silently produced zero documents.** When + `stages.classifier` was off and a file carried no `expected_type`, the segment + stayed `unmatched` and the file yielded no document — with no error. A single-row + file now defaults to the sole declared `document_type` in that case (mirroring the + single-candidate shortcut the classifier itself takes), so the common "one type, + no classifier" path just works. +- **Request-scope LLM transformation returned empty rows.** The transformer's output + model wrapped each row under a `values` key, but the prompt instructs the model to + emit flat `{field: value}` rows — so the structured output never matched and every + consolidated row came back empty. The output row is now a flat dict, matching the + prompt, so `result.request_transformations` carries populated rows (e.g. a cap + table consolidated across several deeds). + ## [26.6.4] - 2026-06-12 ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 684499b..2c2bdad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "flydocs" # CalVer YY.MM.PP -- bumped per release. Note that PEP 440 normalises # ``26.05.01`` -> ``26.5.1`` in the built wheel filename. -version = "26.6.4" +version = "26.6.5" description = "Pure-multimodal Intelligent Document Processing service: structured fields + bounding boxes, validation, authenticity checks, LLM judge, and a business-rule engine. Sync + queue-backed async APIs over fireflyframework-pyfly and -agentic. Part of Firefly OperationOS, platform-agnostic by design." readme = "README.md" requires-python = ">=3.13" diff --git a/src/flydocs/core/services/pipeline/orchestrator.py b/src/flydocs/core/services/pipeline/orchestrator.py index 241ed4f..45749b9 100644 --- a/src/flydocs/core/services/pipeline/orchestrator.py +++ b/src/flydocs/core/services/pipeline/orchestrator.py @@ -440,6 +440,13 @@ async def _step_load(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any """ request: ExtractionRequest = ctx.metadata["request"] files: list[_FileSlot] = [] + # When the classifier is disabled and the caller pinned no ``expected_type``, + # default a single-row file to the sole declared document type. Without the + # classifier there is no node that assigns a type, so otherwise the segment + # stays ``unmatched`` and the file silently yields no document. Mirrors the + # single-candidate shortcut the classifier step itself takes. + classifier_off = not request.options.stages.classifier + sole_doctype = request.document_types[0].id if len(request.document_types) == 1 else None # Slot index is monotonic across the expansion of all inputs. slot_index = 0 for file in request.files: @@ -452,6 +459,8 @@ async def _step_load(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any multi_row = len(normalised) > 1 for row in normalised: effective_doctype = file.expected_type if not multi_row else None + if effective_doctype is None and not multi_row and classifier_off and sole_doctype: + effective_doctype = sole_doctype slot_filename = ( "/".join((*row.derived_from, row.filename)) if row.derived_from else row.filename ) diff --git a/src/flydocs/core/services/transformations/llm_transformer.py b/src/flydocs/core/services/transformations/llm_transformer.py index 60e1671..94ff456 100644 --- a/src/flydocs/core/services/transformations/llm_transformer.py +++ b/src/flydocs/core/services/transformations/llm_transformer.py @@ -45,16 +45,16 @@ _MAX_OUTPUT_TOKENS = 8192 -class _TransformRow(BaseModel): - """One row returned by the LLM. Free-form key/value dict.""" - - values: dict[str, Any] = Field(default_factory=dict) - - class _TransformOutput(BaseModel): - """LLM response envelope.""" + """LLM response envelope. + + Each row is a flat ``{field_name: value}`` object, exactly as the prompt + instructs the model to emit. (A previous shape wrapped each row under a + ``values`` key, which the prompt never produced — so every row came back + empty. Keeping the row a flat dict here matches the prompt 1:1.) + """ - rows: list[_TransformRow] = Field(default_factory=list) + rows: list[dict[str, Any]] = Field(default_factory=list) class LlmTransformer: @@ -191,7 +191,7 @@ def _serialise_row(row: ExtractedField) -> dict[str, Any]: return out -def _rebuild_rows(llm_rows: list[_TransformRow], template_row: ExtractedField) -> list[ExtractedField]: +def _rebuild_rows(llm_rows: list[dict[str, Any]], template_row: ExtractedField) -> list[ExtractedField]: """Materialise LLM row dicts back into ExtractedField rows. The template row's metadata (bbox, page) is propagated so the @@ -206,7 +206,7 @@ def _rebuild_rows(llm_rows: list[_TransformRow], template_row: ExtractedField) - materialised: list[ExtractedField] = [] for i, lr in enumerate(llm_rows): sub_fields: list[ExtractedField] = [] - for name, value in lr.values.items(): + for name, value in (lr or {}).items(): tmpl = template_by_name.get(name) sub_fields.append( ExtractedField(