-
Notifications
You must be signed in to change notification settings - Fork 140
Expand file tree
/
Copy pathdoc_parser.py
More file actions
147 lines (120 loc) · 4.98 KB
/
doc_parser.py
File metadata and controls
147 lines (120 loc) · 4.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from pathlib import Path
from typing import List, Literal, TypedDict, TypeVar, Union
from openparse import consts, tables, text
from openparse._types import NOT_GIVEN, NotGiven
from openparse.pdf import Pdf
from openparse.processing import (
BasicIngestionPipeline,
IngestionPipeline,
NoOpIngestionPipeline,
)
from openparse.schemas import Node, ParsedDocument, TableElement, TextElement
from openparse.schemas import ImageElement
# Type variable whose upper bound is IngestionPipeline: it can only be solved
# to IngestionPipeline itself or to one of its subclasses.
IngestionPipelineType = TypeVar(
"IngestionPipelineType", bound=IngestionPipeline)
class UnitableArgsDict(TypedDict, total=False):
    """Table-parsing options for the ``"unitable"`` algorithm.

    Declared with ``total=False``, so a type checker treats every key as
    optional.
    """

    # Discriminator: the only accepted value is "unitable".
    parsing_algorithm: Literal["unitable"]

    # NOTE(review): presumably the minimum detection confidence a table needs
    # in order to be kept -- confirm against the corresponding args model.
    min_table_confidence: float

    # "html" is the only output format allowed for this algorithm.
    table_output_format: Literal["html"]
class TableTransformersArgsDict(TypedDict, total=False):
    """Table-parsing options for the ``"table-transformers"`` algorithm.

    Declared with ``total=False``, so a type checker treats every key as
    optional.
    """

    # Discriminator: the only accepted value is "table-transformers".
    parsing_algorithm: Literal["table-transformers"]

    # NOTE(review): presumably confidence thresholds applied to detected
    # tables and to individual cells respectively -- confirm against the
    # corresponding args model.
    min_table_confidence: float
    min_cell_confidence: float

    # Tables may be rendered either as markdown or as html.
    table_output_format: Literal["markdown", "html"]
class PyMuPDFArgsDict(TypedDict, total=False):
    """Table-parsing options for the ``"pymupdf"`` algorithm.

    Declared with ``total=False``, so a type checker treats every key as
    optional.
    """

    # Discriminator: the only accepted value is "pymupdf".
    parsing_algorithm: Literal["pymupdf"]

    # Tables may be rendered either as markdown or as html.
    table_output_format: Literal["markdown", "html"]
class PDfPlumberArgsDict(TypedDict, total=False):
    """Table-parsing options for the ``"pdfplumber"`` algorithm.

    Declared with ``total=False``, so a type checker treats every key as
    optional.
    """

    # Discriminator: the only accepted value is "pdfplumber".
    parsing_algorithm: Literal["pdfplumber"]

    # Tables may be rendered either as markdown or as html.
    table_output_format: Literal["markdown", "html"]
def _table_args_dict_to_model(
    args_dict: Union[
        TableTransformersArgsDict,
        PyMuPDFArgsDict,
        UnitableArgsDict,
        PDfPlumberArgsDict,
    ],
) -> Union[
    tables.TableTransformersArgs,
    tables.PyMuPDFArgs,
    tables.UnitableArgs,
    tables.PDfPlumberArgs,
]:
    """Convert a table-args dict into the matching ``tables`` args model.

    The ``"parsing_algorithm"`` entry selects which model class is built; the
    complete dict (that entry included) is forwarded to the class as keyword
    arguments.

    Args:
        args_dict: Table parsing options. It must contain the
            ``"parsing_algorithm"`` key even though the TypedDicts mark every
            key as optional.

    Returns:
        An instance of the args model that corresponds to the selected
        algorithm.

    Raises:
        KeyError: If ``args_dict`` has no ``"parsing_algorithm"`` entry.
        ValueError: If the algorithm name is not one of
            ``"table-transformers"``, ``"pymupdf"``, ``"unitable"`` or
            ``"pdfplumber"``.
    """
    # Read the discriminator once instead of re-indexing the dict per branch.
    algorithm = args_dict["parsing_algorithm"]

    # The model classes are looked up lazily, one per branch, so only the
    # class for the requested algorithm is ever touched.
    if algorithm == "table-transformers":
        return tables.TableTransformersArgs(**args_dict)
    if algorithm == "pymupdf":
        return tables.PyMuPDFArgs(**args_dict)
    if algorithm == "unitable":
        return tables.UnitableArgs(**args_dict)
    if algorithm == "pdfplumber":
        return tables.PDfPlumberArgs(**args_dict)

    raise ValueError(f"Unsupported parsing_algorithm: {algorithm}")
class DocumentParser:
    """
    A parser for extracting elements from PDF documents, including text and tables.

    Attributes:
        processing_pipeline (IngestionPipeline): Pipeline that is run over the
            extracted nodes. When the constructor argument is omitted a
            ``BasicIngestionPipeline`` is used; passing ``None`` selects a
            ``NoOpIngestionPipeline``.
        table_args (Union[TableTransformersArgsDict, PyMuPDFArgsDict,
            UnitableArgsDict, PDfPlumberArgsDict, NotGiven]): Arguments to
            customize table parsing. Tables are only parsed when this was
            supplied and is non-empty.
    """

    # Class-level flag: copied onto the pipeline in __init__ and forwarded to
    # tables.ingest() in parse().
    _verbose: bool = False

    def __init__(
        self,
        *,
        processing_pipeline: Union[IngestionPipeline, NotGiven, None] = NOT_GIVEN,
        table_args: Union[
            TableTransformersArgsDict,
            PyMuPDFArgsDict,
            UnitableArgsDict,
            PDfPlumberArgsDict,
            NotGiven,
        ] = NOT_GIVEN,
    ):
        """
        Create a parser.

        Args:
            processing_pipeline: Pipeline applied to the extracted nodes.
                Omit it to get a ``BasicIngestionPipeline``; pass ``None`` to
                get a ``NoOpIngestionPipeline``.
            table_args: Table parsing options. Omit it to skip table parsing.
        """
        self.processing_pipeline: IngestionPipeline
        # NOT_GIVEN and None are deliberately distinct: "argument omitted"
        # picks the default pipeline, an explicit None picks the no-op one.
        if processing_pipeline is NOT_GIVEN:
            self.processing_pipeline = BasicIngestionPipeline()
        elif processing_pipeline is None:
            self.processing_pipeline = NoOpIngestionPipeline()
        else:
            self.processing_pipeline = processing_pipeline  # type: ignore

        # Note: this also overwrites ``verbose`` on a caller-supplied pipeline.
        self.processing_pipeline.verbose = self._verbose

        self.table_args = table_args

    def parse(
        self,
        file: Union[str, Path],
        ocr: bool = False,
    ) -> ParsedDocument:
        """
        Parse a given document.

        Args:
            file (Union[str, Path]): The path to the PDF file.
            ocr (bool): Whether to use OCR for text extraction. Not recommended unless necessary - inherently slower and less accurate. Note uses PyMuPDF for OCR.

        Returns:
            ParsedDocument: The nodes produced by the processing pipeline,
            together with the file name, the page count, the coordinate
            system, the table parsing arguments that were used (``None`` when
            tables were not parsed) and the file metadata read from the PDF.
        """
        doc = Pdf(file)

        # Text extraction engine: "pymupdf" when OCR is requested, otherwise
        # "pdfminer".
        text_engine: Literal["pdfminer", "pymupdf"] = (
            "pymupdf" if ocr else "pdfminer"
        )
        text_elems = text.ingest(doc, parsing_method=text_engine)
        text_nodes = self._elems_to_nodes(text_elems)

        table_nodes: List[Node] = []
        table_args_obj = None
        # Check the sentinel explicitly rather than depending on NOT_GIVEN
        # evaluating as falsy; an empty dict still disables table parsing.
        if self.table_args is not NOT_GIVEN and self.table_args:
            table_args_obj = _table_args_dict_to_model(self.table_args)
            table_elems = tables.ingest(
                doc, table_args_obj, verbose=self._verbose
            )
            table_nodes = self._elems_to_nodes(table_elems)

        # Text nodes come first, table nodes after; the pipeline receives the
        # combined list.
        nodes = text_nodes + table_nodes
        nodes = self.processing_pipeline.run(nodes)

        parsed_doc = ParsedDocument(
            nodes=nodes,
            filename=Path(file).name,
            num_pages=doc.num_pages,
            coordinate_system=consts.COORDINATE_SYSTEM,
            table_parsing_kwargs=(
                table_args_obj.model_dump() if table_args_obj else None
            ),
            creation_date=doc.file_metadata.get("creation_date"),
            last_modified_date=doc.file_metadata.get("last_modified_date"),
            last_accessed_date=doc.file_metadata.get("last_accessed_date"),
            file_size=doc.file_metadata.get("file_size"),
        )
        return parsed_doc

    @staticmethod
    def _elems_to_nodes(
        elems: Union[List[TextElement], List[TableElement], List[ImageElement]],
    ) -> List[Node]:
        """Wrap each element in its own single-element ``Node``, keeping order."""
        return [
            Node(
                elements=(e,),
            )
            for e in elems
        ]