-
-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathindexing.py
More file actions
125 lines (99 loc) · 3.75 KB
/
indexing.py
File metadata and controls
125 lines (99 loc) · 3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""Special item with customized index data and helper classes"""
import io
import pathlib
import libzim.writer # pyright: ignore[reportMissingModuleSource]
try:
import pymupdf # pyright: ignore[reportMissingTypeStubs]
except ImportError: # pragma: no cover
# pymupdf main module was named fitz before 1.24.3
import fitz as pymupdf # pyright: ignore[reportMissingTypeStubs]
from zimscraperlib import logger
class IndexData(libzim.writer.IndexData):
"""IndexData to properly pass indexing title and content to the libzim
Both title and content have to be customized (title can be identical to item title
or not).
keywords is optional since it can be empty
wordcount is optional ; if not passed, it is automaticaly computed from content
"""
def __init__(
self, title: str, content: str, keywords: str = "", wordcount: int | None = None
):
# set wordcount first so that we know if we should override it based on content
self.wordcount = wordcount
self.title = title
self.content = content
self.keywords = keywords
def has_indexdata(self) -> bool:
return len(self.content) > 0 or len(self.title) > 0
def get_title(self) -> str:
return self.title
def get_content(self) -> str:
return self.content
def get_keywords(self) -> str:
return self.keywords
def get_wordcount(self) -> int:
return self.wordcount or 0
@property
def content(self):
return self._content
@content.setter
def content(self, value: str):
self._content = value
if not self.wordcount:
self.wordcount = len(self.content.split()) if self.content else 0
IGNORED_MUPDF_MESSAGES = [
"lcms: not an ICC profile, invalid signature.",
"format error: cmsOpenProfileFromMem failed",
"ignoring broken ICC profile",
]
def get_pdf_index_data(
*,
content: str | bytes | None = None,
fileobj: io.BytesIO | None = None,
filepath: pathlib.Path | None = None,
) -> IndexData:
"""Returns the IndexData information for a given PDF
PDF can be passed either as content or fileobject or filepath
"""
# do not display all pymupdf errors, we will filter them afterwards
pymupdf.TOOLS.mupdf_display_errors( # pyright: ignore[reportUnknownMemberType]
False
)
if content:
doc = pymupdf.open(stream=content)
elif fileobj:
doc = pymupdf.open(stream=fileobj)
else:
doc = pymupdf.open(filename=filepath)
metadata = ( # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]
doc.metadata
)
title = ""
if metadata: # pragma: no branch (always metadata in test PDFs)
parts: list[str] = []
for key in ["title", "author", "subject"]:
if metadata.get(key): # pyright: ignore[reportUnknownMemberType]
parts.append(
metadata[key] # pyright: ignore[reportUnknownArgumentType]
)
if parts: # pragma: no branch (always metadata in test PDFs)
title = " - ".join(parts)
content = "\n".join(
page.get_text() # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType, reportAttributeAccessIssue]
for page in doc
)
# build list of messages and filter messages which are known to not be relevant
# in our use-case
mupdf_messages = "\n".join(
warning
for warning in pymupdf.TOOLS.mupdf_warnings().splitlines()
if warning not in IGNORED_MUPDF_MESSAGES
)
if mupdf_messages:
logger.warning(
f"PyMuPDF issues:\n{mupdf_messages}"
) # pragma: no cover (no known error in test PDFs)
return IndexData(
title=title,
content=content,
)