Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.1.8

### Features

- **Add `include_orig_elements` parameter for chunking**: When `True` (the default), the elements used to form each chunk are attached to that chunk's `.metadata.orig_elements` as a gzip-compressed, base64-encoded blob. Set to `False` to omit this blob and produce a much smaller response payload — useful for documents with large tables, where the blob is duplicated into every chunk and can inflate the response size dramatically.

## 0.1.7

### Security
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.7" # pragma: no cover
__version__ = "0.1.8" # pragma: no cover
5 changes: 5 additions & 0 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def pipeline_api(
new_after_n_chars: Optional[int],
overlap: int,
overlap_all: bool,
include_orig_elements: bool = True,
# ----------------------
filename: str = "",
file_content_type: Optional[str] = None,
Expand Down Expand Up @@ -283,6 +284,7 @@ def pipeline_api(
"new_after_n_chars": new_after_n_chars,
"overlap": overlap,
"overlap_all": overlap_all,
"include_orig_elements": include_orig_elements,
"starting_page_number": starting_page_number,
"include_slide_notes": include_slide_notes,
},
Expand Down Expand Up @@ -338,6 +340,7 @@ def pipeline_api(
"max_characters": max_characters,
"overlap": overlap,
"overlap_all": overlap_all,
"include_orig_elements": include_orig_elements,
"extract_image_block_types": extract_image_block_types,
"extract_image_block_to_payload": extract_image_block_to_payload,
"unique_element_ids": unique_element_ids,
Expand Down Expand Up @@ -368,6 +371,7 @@ def pipeline_api(
"max_characters": max_characters,
"overlap": overlap,
"overlap_all": overlap_all,
"include_orig_elements": include_orig_elements,
"extract_image_block_types": extract_image_block_types,
"extract_image_block_to_payload": extract_image_block_to_payload,
"unique_element_ids": unique_element_ids,
Expand Down Expand Up @@ -716,6 +720,7 @@ def response_generator(is_multipart: bool):
new_after_n_chars=form_params.new_after_n_chars,
overlap=form_params.overlap,
overlap_all=form_params.overlap_all,
include_orig_elements=form_params.include_orig_elements,
starting_page_number=form_params.starting_page_number,
include_slide_notes=form_params.include_slide_notes,
)
Expand Down
14 changes: 14 additions & 0 deletions prepline_general/api/models/form_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class GeneralFormParams(BaseModel):
new_after_n_chars: Optional[int]
overlap: int
overlap_all: bool
include_orig_elements: bool
starting_page_number: Optional[int] = None
include_slide_notes: bool

Expand Down Expand Up @@ -236,6 +237,18 @@ def as_form(
examples=[True],
),
] = False,
include_orig_elements: Annotated[
bool,
Form(
title="Include Orig Elements",
description="""When `True` (the default), the elements used to form each chunk are
added to that chunk's `.metadata.orig_elements` as a gzipped+base64 blob. Set to `False` to omit
them and produce a much smaller payload — useful for large tables, where this blob is duplicated
into every chunk and can balloon the response size dramatically.""",
examples=[False],
),
BeforeValidator(SmartValueParser[bool]().value_or_first_element),
] = True,
starting_page_number: Annotated[
Optional[int],
Form(
Expand Down Expand Up @@ -283,6 +296,7 @@ def as_form(
new_after_n_chars=new_after_n_chars,
overlap=overlap,
overlap_all=overlap_all,
include_orig_elements=include_orig_elements,
unique_element_ids=unique_element_ids,
starting_page_number=starting_page_number,
include_slide_notes=include_slide_notes,
Expand Down