diff --git a/README.md b/README.md index a0b98a087..3e27dda94 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

- - PyMuPDF + + PyMuPDF

@@ -10,17 +10,17 @@ pymupdf%2FPyMuPDF | Trendshift

-[![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://pymupdf.readthedocs.io) +[![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://pymupdf.readthedocs.io?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=badges&utm_term=docs) [![PyPI Version](https://img.shields.io/pypi/v/pymupdf?color=blue&label=PyPI)](https://pypi.org/project/PyMuPDF/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pymupdf)](https://pypi.org/project/pymupdf/) [![License AGPL](https://img.shields.io/github/license/pymupdf/pymupdf)](https://github.com/pymupdf/PyMuPDF/blob/master/COPYING) [![PyPI Downloads](https://static.pepy.tech/badge/pymupdf/month)](https://pepy.tech/projects/pymupdf) [![Github Stars](https://img.shields.io/github/stars/pymupdf/PyMuPDF?style=social)](https://github.com/pymupdf/PyMuPDF/stargazers) -[![Discord](https://img.shields.io/discord/770681584617652264?color=6A7EC2&logo=discord&logoColor=ffffff)](https://pymupdf.io/discord/artifex/) -[![Forum](https://img.shields.io/badge/Forum-ff6600?logo=python&logoColor=ffffff)](https://forum.mupdf.com/c/general/4) +[![Discord](https://img.shields.io/discord/770681584617652264?color=6A7EC2&logo=discord&logoColor=ffffff)](https://artifex.com/discord/artifex?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=badges&utm_term=discord) +[![Forum](https://img.shields.io/badge/Forum-ff6600?logo=python&logoColor=ffffff)](https://forum.mupdf.com/c/general/4?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=badges&utm_term=forum) [![Twitter](https://img.shields.io/twitter/follow/pymupdf4llm)](https://x.com/pymupdf4llm) [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97_Hugging_Face-007ec6)](https://huggingface.co/artifex-software) -[![Demo](https://img.shields.io/badge/PyMuPDF4LLM-live?badge&label=DEMO&logo=python&logoColor=ffffff)](https://demo.pymupdf.io) 
+[![Demo](https://img.shields.io/badge/PyMuPDF4LLM-live?badge&label=DEMO&logo=python&logoColor=ffffff)](https://demo.pymupdf.io?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=badges&utm_term=demo) **The PDF engine behind over 50 million monthly downloads, powering AI pipelines worldwide.** @@ -32,10 +32,10 @@ ## Why PyMuPDF? -- **Fast** — powered by [MuPDF](https://mupdf.com/), a best-in-class C rendering engine +- **Fast** — powered by [MuPDF](https://mupdf.com?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=body&utm_term=mupdf), a best-in-class C rendering engine - **Accurate** — pixel-perfect text extraction with font, color, and position metadata - **Versatile** — read, write, annotate, redact, merge, split, and convert documents -- **LLM-ready** — native Markdown output via [PyMuPDF4LLM](https://pypi.org/project/pymupdf4llm/) for RAG and AI pipelines +- **LLM-ready** — native Markdown output via [PyMuPDF4LLM](https://pdf4llm.com?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=body&utm_term=pymupdf4llm) for RAG and AI pipelines - **No mandatory dependencies** — `pip install pymupdf` and you're done --- @@ -273,7 +273,7 @@ print(md) Supports multi-column layouts, natural reading order and page chunking. 
-[![Demo](https://img.shields.io/badge/Pymupdf4llm-live?style=for-the-badge&label=DEMO&logo=python&logoColor=ffffff)](https://demo.pymupdf.io) +[![Demo](https://img.shields.io/badge/Pymupdf4llm-live?style=for-the-badge&label=DEMO&logo=python&logoColor=ffffff)](https://demo.pymupdf.io?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=body&utm_term=demo) --- @@ -392,7 +392,7 @@ for fmt in ["contract.docx", "data.xlsx", "deck.pptx", "report.hwpx"]: print(page.get_text()) ``` -[Get a trial license key for PyMuPDF Pro](https://pymupdf.pro/try-pro) +[Get a trial license key for PyMuPDF Pro](https://pymupdf.pro/try-pro?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=body&utm_term=pymupdf_pro) **What you can do with Office documents:** @@ -723,15 +723,15 @@ doc.save("output.pdf") ## Documentation -Full installation guide, API reference, cookbook, and tutorial at **[pymupdf.readthedocs.io](https://pymupdf.readthedocs.io)**. +Full installation guide, API reference, cookbook, and tutorial at **[pymupdf.readthedocs.io](https://pymupdf.readthedocs.io?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=documentation_community&utm_term=docs)**. 
-- [Installation guide](https://pymupdf.readthedocs.io/en/latest/installation.html) -- [API reference](https://pymupdf.readthedocs.io/en/latest/classes.html) -- [Cookbook](https://pymupdf.readthedocs.io/en/latest/the-basics.html) -- [Tutorial](https://pymupdf.readthedocs.io/en/latest/tutorial.html) -- [Changelog](https://pymupdf.readthedocs.io/en/latest/changes.html) -- [PyMuPDF4LLM docs](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/) -- [PyMuPDF Pro docs](https://pymupdf.readthedocs.io/en/latest/pymupdf-pro/index.html) +- [Installation guide](https://pymupdf.readthedocs.io/en/latest/installation.html?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=documentation_community&utm_term=installation) +- [API reference](https://pymupdf.readthedocs.io/en/latest/classes.html?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=documentation_community&utm_term=classes) +- [Cookbook](https://pymupdf.readthedocs.io/en/latest/the-basics.html?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=documentation_community&utm_term=the_basics) +- [Tutorial](https://pymupdf.readthedocs.io/en/latest/tutorial.html?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=documentation_community&utm_term=tutorial) +- [Changelog](https://pymupdf.readthedocs.io/en/latest/changes.html?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=documentation_community&utm_term=changelog) +- [PyMuPDF4LLM docs](https://docs.pdf4llm.com?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=documentation_community&utm_term=docs) +- [PyMuPDF Pro docs](https://pymupdf.readthedocs.io/en/latest/pymupdf-pro?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=documentation_community&utm_term=pymupdf_pro) --- @@ -741,17 +741,17 @@ Full installation guide, API reference, cookbook, and tutorial at **[pymupdf.rea | Project | 
Description | |---|---| | [PyMuPDF4LLM](https://github.com/pymupdf/pymupdf4llm) | LLM/RAG-optimised Markdown and JSON extraction | -| [PyMuPDF Pro](https://pymupdf.io/pro) | Adds Office and HWP document support | +| [PyMuPDF Pro](https://pymupdf.io/pro?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=related_projects&utm_term=pymupdf_pro) | Adds Office and HWP document support | | [pymupdf-fonts](https://pypi.org/project/pymupdf-fonts/) | Extended font collection for PyMuPDF text output | --- ## Licensing -PyMuPDF and MuPDF are maintained by [Artifex Software, Inc.](https://artifex.com) +PyMuPDF and MuPDF are maintained by [Artifex Software, Inc.](https://artifex.com?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=footer&utm_term=website) - **Open source** — [GNU AGPL v3](https://www.gnu.org/licenses/agpl-3.0.html). Free for open-source projects. -- **Commercial** — separate commercial licences available from [Artifex](https://artifex.com/licensing) for proprietary applications. +- **Commercial** — separate commercial licences available from [Artifex](https://artifex.com/licensing?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=footer&utm_term=licensing) for proprietary applications. --- @@ -760,7 +760,7 @@ PyMuPDF and MuPDF are maintained by [Artifex Software, Inc.](https://artifex.com Contributions are welcome. Please open an issue before submitting large pull requests. 
- [Issue tracker](https://github.com/pymupdf/PyMuPDF/issues) -- [Discord community](https://pymupdf.pro/discord/artifex/) +- [Discord community](https://artifex.com/discord/artifex?utm_source=github&utm_medium=referral&utm_campaign=pymupdf_github&utm_content=footer&utm_term=discord) ## ⭐ Support this project diff --git a/docs/pymupdf4llm/api.rst b/docs/pymupdf4llm/api.rst index 8f6789203..6fa6ef87f 100644 --- a/docs/pymupdf4llm/api.rst +++ b/docs/pymupdf4llm/api.rst @@ -152,6 +152,8 @@ The PyMuPDF4LLM API "pos": (start, stop), # 0-based integers: bbox_text = chunk["text"][start:stop] } + See: :ref:`box classes <pymupdf4llm-api-boxclasses>` + :arg float page_height: specify a desired page height. For relevance see the `page_width` parameter. If using the default `None`, the document will appear as one large page with a width of `page_width`. Consequently in this case, no markdown page separators will occur (except the final one), respectively only one page chunk will be returned. :arg bool page_separators: if ``True`` inserts a string ``--- end of page=n ---`` at the end of each page output. Intended for debugging purposes. The page number is 0-based. The separator string is wrapped with line breaks. Default is ``False``. @@ -220,11 +222,13 @@ The PyMuPDF4LLM API "bbox": [x0, y0, x1, y1], # boundary box coordinates "pos": (start, stop), # 0-based integers: bbox_text = chunk["text"][start:stop] } + + See: :ref:`box classes <pymupdf4llm-api-boxclasses>` .. method:: to_json(doc: pymupdf.Document | str, *, **kwargs) -> str - Parses the document and the specified pages and converts the result into a |JSON|-formatted string. + Parses the document and the specified pages and converts the result into a `JSON formatted string `_. :arg Document,str doc: the file, to be specified either as a file path string, or as a |PyMuPDF| :class:`Document` (created via `pymupdf.open`). In order to use `pathlib.Path` specifications, Python file-like objects, documents in memory etc. you **must** use a |PyMuPDF| :class:`Document`. 
@@ -246,10 +250,41 @@ The PyMuPDF4LLM API :arg bool embed_images: store image binaries for "picture" boundary boxes. Base64-encoded images are included in the JSON output. Ignores `image_path` if used. This may drastically increase the size of your JSON text. - :arg bool write_images: store image files "picture" boundary boxes.when encountering images, image files will be created from the respective page area and stored in the specified folder. Any text contained in these areas will still be included in the text output. + :arg bool write_images: store image files for "picture" boundary boxes. When encountering images, image files will be created from the respective page area and stored in the specified folder. Any text contained in these areas will still be included in the text output. :arg list pages: optional, the pages to consider for output (caution: specify 0-based page numbers). If omitted (`None`) all pages are processed. Specify any valid Python sequence containing integers between `0` and `page_count - 1`. + :rtype: str + + See `JSON Schema `_ for the structure of the output JSON string. + + +.. _pymupdf4llm-api-boxclasses: + +.. note:: + + **About box classes** + + If `page_chunks = True`, the return objects for `to_markdown` & `to_text` contain a list of dictionaries representing the layout boundary boxes, `page_boxes`; within each of these, a key ``class`` indicates the type of box content therein. + + The return object for `to_json` contains a similar key called ``boxclass``. + + The possible string values for this ``class`` / ``boxclass`` key are: + + .. code-block:: bash + + text + picture + table + caption + title + section-header + page-header + page-footer + list-item + footnote + formula + .. 
_pymupdf4llm-api-layout: diff --git a/src/__init__.py b/src/__init__.py index 910862111..bfa801be9 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -18933,7 +18933,7 @@ def JM_choice_options(annot): if n == 0: return # wrong widget type - optarr = mupdf.pdf_dict_get( annot_obj, PDF_NAME('Opt')) + optarr = mupdf.pdf_dict_get_inheritable( annot_obj, PDF_NAME('Opt')) liste = [] for i in range( n): diff --git a/tests/resources/test_4114.pdf b/tests/resources/test_4114.pdf new file mode 100644 index 000000000..a06fdaa10 Binary files /dev/null and b/tests/resources/test_4114.pdf differ diff --git a/tests/test_rewrite_images.py b/tests/test_rewrite_images.py index b3cb290cd..0317b5273 100644 --- a/tests/test_rewrite_images.py +++ b/tests/test_rewrite_images.py @@ -1,5 +1,7 @@ import pymupdf import os +import util + scriptdir = os.path.dirname(__file__) @@ -13,3 +15,25 @@ def test_rewrite_images(): data = doc.tobytes(garbage=3, deflate=True) size1 = len(data) assert (1 - (size1 / size0)) > 0.3 + + +def test_4918(): + ''' + By default this test does nothing, because it requires a rather large input document from: + https://drive.google.com/file/d/1OkIq3XJuKiFfKDWBIcAk8_fLLjpNkuHQ/view?usp=sharing + + It's non-trivial to download from this url, so we only do anything if + environment variable PYMUPDF_TEST_4918_PATH is set to local path of the + input document. + + As of 2026-06-04 this passes with mupdf master, but segvs with current + pymupdf release 1.27.2.3. 
+ ''' + PYMUPDF_TEST_4918_PATH = os.environ.get('PYMUPDF_TEST_4918_PATH') + if not PYMUPDF_TEST_4918_PATH : + print(f'test_4918(): Doing nothing because {PYMUPDF_TEST_4918_PATH=}.') + return + path = PYMUPDF_TEST_4918_PATH + print(f'{path=}') + with pymupdf.open(path) as document: + document.rewrite_images(dpi_threshold=150, dpi_target=100, quality=50) diff --git a/tests/test_widgets.py b/tests/test_widgets.py index 2ba7a0afb..4158f6e18 100644 --- a/tests/test_widgets.py +++ b/tests/test_widgets.py @@ -447,3 +447,21 @@ def test_4965(): print(f' {name=}') print(f' {value=}') print(f' {f_type=}') + + +def test_4114(): + print() + path = os.path.normpath(f'{__file__}/../../tests/resources/test_4114.pdf') + path_out = os.path.normpath(f'{__file__}/../../tests/test_4114_out.pdf') + expected_values = [' - Select One - ', ' ', 'Cincinnati, OH 45999', 'Memphis, TN 37501', 'Ogden, UT 84201', 'Philadelphia, PA 19255'] + expected_values2 = [expected_values, expected_values] + values = list() + with pymupdf.open(path) as document: + for page_i, page in enumerate(document): + for widget in page.widgets(): + if widget.field_type_string == 'ComboBox': + print(f'test_4114(): {page_i=} {widget.choice_values=}') + values.append(widget.choice_values) + widget.update() + document.save(path_out) + assert values == expected_values2