-
-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy path_libkiwix.py
More file actions
125 lines (102 loc) · 3.82 KB
/
_libkiwix.py
File metadata and controls
125 lines (102 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
r"""[INTERNAL] libkiwix's internal features copies
CAUTION: this is __not__ part of zimscraperlib's API. Don't use outside scraperlib!
Following methods are direct copies of libkiwix's for which there is a need to
in scraperlib. The goal is not to reimplement similar features but to stick as much
as possible to the original source code so that upstream changes can be backported
easily. Hence the unexpected method names and formatting.
https://github.com/kiwix/libkiwix/blob/master/src/reader.cpp
https://github.com/kiwix/libkiwix/blob/master/src/tools/archiveTools.cpp
https://github.com/kiwix/libkiwix/blob/master/src/tools/otherTools.cpp
"""
import io
from typing import NamedTuple
class MimetypeAndCounter(NamedTuple):
mimetype: str
value: int
type CounterMap = dict[str, int]
def getline(src: io.StringIO, delim: str | None = None) -> tuple[bool, str]:
"""C++ stdlib getline() ~clone
Reads `src` until it finds `delim`.
returns whether src is EOF and the extracted string (delim excluded)"""
output = ""
if not delim:
return True, src.read()
char = src.read(1)
while char:
if char == delim:
break
output += char
char = src.read(1)
return char == "", output
def readFullMimetypeAndCounterString(
src: io.StringIO,
) -> tuple[bool, str]:
"""read a single mimetype-and-counter string from source
Returns whether the source is EOF and the extracted string (or empty one)"""
params = ""
eof, mtcStr = getline(src, ";")
if mtcStr.find("=") == -1:
while params.count("=") != 2: # noqa: PLR2004
eof, params = getline(src, ";")
if params.count("=") == 2: # noqa: PLR2004
mtcStr += ";" + params
if eof:
break
return eof, mtcStr
def parseASingleMimetypeCounter(string: str) -> MimetypeAndCounter:
"""MimetypeAndCounter from a single mimetype-and-counter string"""
k: int = string.rfind("=")
if k != len(string) - 1:
mimeType = string[:k]
counter = string[k + 1 :]
try:
return MimetypeAndCounter(mimeType, int(counter))
except ValueError:
pass # value is not castable to int
return MimetypeAndCounter("", 0)
def parseMimetypeCounter(
counterData: str,
) -> CounterMap:
"""Mapping of MIME types with count for each from ZIM Counter metadata string"""
counters: CounterMap = {}
ss = io.StringIO(counterData)
eof = False
while not eof:
eof, mtcStr = readFullMimetypeAndCounterString(ss)
mtc = parseASingleMimetypeCounter(mtcStr)
if mtc.mimetype:
counters.update([mtc])
ss.close()
return counters
def convertTags(tags_str: str) -> list[str]:
"""List of tags expanded with libkiwix's additional hints for pic/vid/det/index"""
tags = tags_str.split(";")
tagsList: list[str] = []
picSeen = vidSeen = detSeen = indexSeen = False
for tag in tags:
# not upstream
if not tag:
continue
picSeen |= tag == "nopic" or tag.startswith("_pictures:")
vidSeen |= tag == "novid" or tag.startswith("_videos:")
detSeen |= tag == "nodet" or tag.startswith("_details:")
indexSeen |= tag.startswith("_ftindex")
if tag == "nopic":
tagsList.append("_pictures:no")
elif tag == "novid":
tagsList.append("_videos:no")
elif tag == "nodet":
tagsList.append("_details:no")
elif tag == "_ftindex":
tagsList.append("_ftindex:yes")
else:
tagsList.append(tag)
if not indexSeen:
tagsList.append("_ftindex:no")
if not picSeen:
tagsList.append("_pictures:yes")
if not vidSeen:
tagsList.append("_videos:yes")
if not detSeen:
tagsList.append("_details:yes")
return tagsList