Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,23 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project uses **CalVer `YY.MM.PP`** (PEP 440 normalises away the leading
zeros of each segment for the Python wheel — e.g. `26.06.00` → `26.6.0`).

## [26.6.5] - 2026-06-12

### Fixed

- **Classifier-off + no `expected_type` silently produced zero documents.** When
`stages.classifier` was off and a file carried no `expected_type`, the segment
stayed `unmatched` and the file yielded no document — with no error. When the
request declares exactly one entry in `document_types`, a single-row file now
defaults to that type in this case (mirroring the single-candidate shortcut the
classifier itself takes), so the common "one type, no classifier" path just works.
Multi-row files and requests that declare several document types are unaffected
by this default.
- **Request-scope LLM transformation returned empty rows.** The transformer's output
model wrapped each row under a `values` key, but the prompt instructs the model to
emit flat `{field: value}` rows — so the structured output never matched and every
consolidated row came back empty. The output row is now a flat dict, matching the
prompt, so `result.request_transformations` carries populated rows (e.g. a cap
table consolidated across several deeds).

## [26.6.4] - 2026-06-12

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = "flydocs"
# CalVer YY.MM.PP -- bumped per release. Note that PEP 440 normalises
# ``26.05.01`` -> ``26.5.1`` in the built wheel filename.
version = "26.6.4"
version = "26.6.5"
description = "Pure-multimodal Intelligent Document Processing service: structured fields + bounding boxes, validation, authenticity checks, LLM judge, and a business-rule engine. Sync + queue-backed async APIs over fireflyframework-pyfly and -agentic. Part of Firefly OperationOS, platform-agnostic by design."
readme = "README.md"
requires-python = ">=3.13"
Expand Down
9 changes: 9 additions & 0 deletions src/flydocs/core/services/pipeline/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,13 @@ async def _step_load(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any
"""
request: ExtractionRequest = ctx.metadata["request"]
files: list[_FileSlot] = []
# When the classifier is disabled and the caller pinned no ``expected_type``,
# default a single-row file to the sole declared document type. Without the
# classifier there is no node that assigns a type, so otherwise the segment
# stays ``unmatched`` and the file silently yields no document. Mirrors the
# single-candidate shortcut the classifier step itself takes.
classifier_off = not request.options.stages.classifier
sole_doctype = request.document_types[0].id if len(request.document_types) == 1 else None
# Slot index is monotonic across the expansion of all inputs.
slot_index = 0
for file in request.files:
Expand All @@ -452,6 +459,8 @@ async def _step_load(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any
multi_row = len(normalised) > 1
for row in normalised:
effective_doctype = file.expected_type if not multi_row else None
if effective_doctype is None and not multi_row and classifier_off and sole_doctype:
effective_doctype = sole_doctype
slot_filename = (
"/".join((*row.derived_from, row.filename)) if row.derived_from else row.filename
)
Expand Down
20 changes: 10 additions & 10 deletions src/flydocs/core/services/transformations/llm_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,16 @@
_MAX_OUTPUT_TOKENS = 8192


class _TransformRow(BaseModel):
"""One row returned by the LLM. Free-form key/value dict."""

values: dict[str, Any] = Field(default_factory=dict)


class _TransformOutput(BaseModel):
"""LLM response envelope."""
"""LLM response envelope.

Each row is a flat ``{field_name: value}`` object, exactly as the prompt
instructs the model to emit. (A previous shape wrapped each row under a
``values`` key, which the prompt never produced — so every row came back
empty. Keeping the row a flat dict here matches the prompt 1:1.)
"""

rows: list[_TransformRow] = Field(default_factory=list)
rows: list[dict[str, Any]] = Field(default_factory=list)


class LlmTransformer:
Expand Down Expand Up @@ -191,7 +191,7 @@ def _serialise_row(row: ExtractedField) -> dict[str, Any]:
return out


def _rebuild_rows(llm_rows: list[_TransformRow], template_row: ExtractedField) -> list[ExtractedField]:
def _rebuild_rows(llm_rows: list[dict[str, Any]], template_row: ExtractedField) -> list[ExtractedField]:
"""Materialise LLM row dicts back into ExtractedField rows.

The template row's metadata (bbox, page) is propagated so the
Expand All @@ -206,7 +206,7 @@ def _rebuild_rows(llm_rows: list[_TransformRow], template_row: ExtractedField) -
materialised: list[ExtractedField] = []
for i, lr in enumerate(llm_rows):
sub_fields: list[ExtractedField] = []
for name, value in lr.values.items():
for name, value in (lr or {}).items():
tmpl = template_by_name.get(name)
sub_fields.append(
ExtractedField(
Expand Down
Loading