-
-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathhtml.py
More file actions
736 lines (612 loc) · 24.1 KB
/
html.py
File metadata and controls
736 lines (612 loc) · 24.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
"""HTML Rewriting
This module contains tools to rewrite HTML retrieved from an online source so that it
can safely operate within a ZIM.
In addition to fixing links so that they point to the ZIM item when it exists, it also
fixes a bunch of other tags which need special handling.
The rewriter needs to have an article url rewriter to rewrite URLs found in HTML, an
optional pre_head and post_head HTML code to insert (typically to load wombat.js in
pre_head and to load additional custom CSS in post_head), and an optional callable that
will be invoked every time a JS code file is encountered (useful to know which JS file is
classic and which is a module).
"""
import io
import re
from collections.abc import Callable
from dataclasses import dataclass
from functools import cache
from html import escape
from html.parser import HTMLParser
from inspect import Signature, signature
from typing import Any, NamedTuple
from bs4 import BeautifulSoup
from zimscraperlib.rewriting.css import CssRewriter
from zimscraperlib.rewriting.js import JsRewriter
from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, ZimPath
AttrNameAndValue = tuple[str, str | None]
AttrsList = list[AttrNameAndValue]
HTTP_EQUIV_REDIRECT_RE = re.compile(
r"^\s*(?P<interval>.*?)\s*;\s*url\s*=\s*(?P<url>.*?)\s*$"
)
class RewritenHtml(NamedTuple):
    """Result of rewrite operation

    This is what `HtmlRewriter.rewrite` returns.
    """

    # Title found in the document; empty string when the rewriter found none
    title: str
    # The rewritten HTML document
    content: str
def get_attr_value_from(
    attrs: AttrsList, name: str, default: str | None = None
) -> str | None:
    """Look up an attribute by name among the attributes of an HTML tag

    The value of the first attribute named `name` is returned; this value is None for
    an attribute present without value. `default` is returned only when no attribute
    has this name.
    """
    matching_values = (value for attr_name, value in attrs if attr_name == name)
    return next(matching_values, default)
def format_attr(name: str, value: str | None) -> str:
"""Format a given attribute name and value, properly escaping the value"""
if value is None:
return name
html_escaped_value = escape(value, quote=True)
return f'{name}="{html_escaped_value}"'
def get_html_rewrite_context(tag: str, attrs: AttrsList) -> str:
    """Compute the rewrite context associated with an HTML tag

    The context is the tag name itself, except for tags whose handling depends on
    another attribute:

    - <script>: "json", "js-module" or "js-classic" depending on the `type` attribute
      (a missing or empty type means classic JS), "unknown" for any other type
    - <link rel="modulepreload">: "js-module"
    - <link rel="preload" as="script">: "js-classic"
    """
    if tag == "link":
        rel = get_attr_value_from(attrs, "rel")
        if rel == "modulepreload":
            return "js-module"
        if rel == "preload" and get_attr_value_from(attrs, "as") == "script":
            return "js-classic"
        return tag

    if tag != "script":
        return tag

    script_type = get_attr_value_from(attrs, "type") or ""
    if script_type in ("application/json", "json"):
        return "json"
    if script_type == "module":
        return "js-module"
    if script_type in ("application/javascript", "text/javascript", ""):
        return "js-classic"
    return "unknown"
def extract_base_href(content: str) -> str | None:
    """Find the href of the first <base> tag of the document head having one

    The whole document is parsed upfront, separately from the real rewriting pass,
    because the base href is needed to rewrite every link, including those located
    before the <base> tag in the head (e.g. <link> for favicons).

    Returns None when the document has no head or no <base> tag with an href.
    """
    head = BeautifulSoup(content, features="lxml").head
    if not head:
        return None
    return next(
        (
            base[  # pyright:ignore[reportIndexIssue, reportUnknownVariableType, reportArgumentType, reportReturnType]
                "href"
            ]
            for base in head.find_all("base")
            if base.has_attr(  # pyright:ignore[reportUnknownMemberType, reportAttributeAccessIssue]
                "href"
            )
        ),
        None,
    )
@cache
def _cached_signature(func: Callable[..., Any]) -> Signature:
    """Introspect the signature of a callable, only once per callable

    Rules are called for every tag / attribute / data chunk of a document, so the
    result is memoized instead of being recomputed on every call.
    """
    return Signature.from_callable(func)
class HtmlRewriter(HTMLParser):
    """
    HTML Rewriter to process HTML code and adapt it to work inside a ZIM.

    The document is parsed with the standard library HTMLParser; every handler writes
    the (possibly rewritten) markup it received to an in-memory output buffer.

    This class is extensible thanks to the module-level `rules` object which can be
    used to decorate any function which will handle some HTML rewriting.

    So far, following rules kinds are supported (see HTMLRewritingRules):
    drop_attribute (to completely drop an HTML tag attribute), rewrite_attribute (to
    modify the value of an HTML tag attribute), rewrite_tag (to modify a whole HTML
    tag, typically modifying attributes names and values), rewrite_data (to rewrite
    the data/content of an HTML tag)
    """

    def __init__(
        self,
        url_rewriter: ArticleUrlRewriter,
        pre_head_insert: str | None,
        post_head_insert: str | None,
        notify_js_module: Callable[[ZimPath], None] | None,
    ):
        """Create a rewriter

        Parameters:
          url_rewriter: rewriter used for every URL found in the document
          pre_head_insert: HTML inserted right after the opening <head> tag, if any
          post_head_insert: HTML inserted right before the closing </head> tag, if any
          notify_js_module: callable handed over to the rules and to the JS rewriter
            so that they can report JS files used as modules, if any
        """
        # Character / entity references are not converted so that they are received
        # by handle_charref / handle_entityref and written back unmodified
        super().__init__(convert_charrefs=False)
        self.url_rewriter = url_rewriter
        self.title = None
        self.output = None
        # Context of the last start tag seen, reset on every end tag.
        # This works only for tag without children.
        # But as we use it to get the title, we are ok
        self.html_rewrite_context = None
        self.pre_head_insert = pre_head_insert
        self.post_head_insert = post_head_insert
        self.notify_js_module = notify_js_module

    def rewrite(self, content: str) -> RewritenHtml:
        """Rewrite HTML code passed

        Returns the document title (empty string when none has been found) and the
        rewritten document.

        Raises an Exception when called while another rewrite is in progress on the
        same instance.
        """
        if self.output is not None:
            raise Exception("ouput should not already be set")  # pragma: no cover
        # Start from a clean state so that nothing (parser position, title, context)
        # leaks from a previous document when the rewriter is reused
        self.reset()
        self.title = None
        self.html_rewrite_context = None
        self.output = io.StringIO()
        try:
            self.base_href = extract_base_href(content)
            self.css_rewriter = CssRewriter(self.url_rewriter, self.base_href)
            self.js_rewriter = JsRewriter(
                url_rewriter=self.url_rewriter,
                base_href=self.base_href,
                notify_js_module=self.notify_js_module,
            )
            self.feed(content)
            self.close()
            output = self.output.getvalue()
        finally:
            # Always release the buffer, otherwise a failure while rewriting would
            # make every subsequent call fail with "already set"
            self.output = None
        return RewritenHtml(self.title or "", output)

    def send(self, value: str):
        """Write a chunk of markup to the output buffer"""
        self.output.write(value)  # pyright: ignore[reportOptionalMemberAccess]

    def handle_starttag(self, tag: str, attrs: AttrsList, *, auto_close: bool = False):
        """Overwrite handle_starttag from HTMLParser

        `auto_close` is set when called for a start-end tag (e.g. <br/>).
        """
        self.html_rewrite_context = get_html_rewrite_context(tag=tag, attrs=attrs)

        # A tag rule takes over the serialization of the whole tag (possibly dropping
        # it by returning an empty string)
        if (
            rewritten := rules.do_tag_rewrite(
                tag=tag,
                attrs=attrs,
                url_rewriter=self.url_rewriter,
                base_href=self.base_href,
                auto_close=auto_close,
            )
        ) is not None:
            self.send(rewritten)
            return

        self.send(f"<{tag}")
        if attrs:
            self.send(" ")
        # Attributes matching a drop rule are skipped, all others go through the
        # attribute rewriting rules before being serialized
        self.send(
            " ".join(
                format_attr(*attr)
                for attr in (
                    rules.do_attribute_rewrite(
                        tag=tag,
                        attr_name=attr_name,
                        attr_value=attr_value,
                        attrs=attrs,
                        js_rewriter=self.js_rewriter,
                        css_rewriter=self.css_rewriter,
                        url_rewriter=self.url_rewriter,
                        base_href=self.base_href,
                        notify_js_module=self.notify_js_module,
                    )
                    for attr_name, attr_value in attrs
                    if not rules.do_drop_attribute(
                        tag=tag, attr_name=attr_name, attr_value=attr_value, attrs=attrs
                    )
                )
            )
        )

        if auto_close:
            self.send(" />")
        else:
            self.send(">")
        if tag == "head" and self.pre_head_insert:
            self.send(self.pre_head_insert)

    def handle_endtag(self, tag: str):
        """Overwrite handle_endtag from HTMLParser"""
        self.html_rewrite_context = None
        if tag == "head" and self.post_head_insert:
            self.send(self.post_head_insert)
        self.send(f"</{tag}>")

    def handle_startendtag(self, tag: str, attrs: AttrsList):
        """Overwrite handle_startendtag from HTMLParser"""
        self.handle_starttag(tag, attrs, auto_close=True)
        # A start-end tag has no content, hence no context for what follows
        self.html_rewrite_context = None

    def handle_data(self, data: str):
        """Overwrite handle_data from HTMLParser"""
        # Only the first chunk of data received inside <title> is used as title
        if self.html_rewrite_context == "title" and self.title is None:
            self.title = data.strip()
        # Whitespace-only data is written back as-is, without calling the rules
        if (
            data.strip()
            and (
                rewritten := rules.do_data_rewrite(
                    html_rewrite_context=self.html_rewrite_context,
                    data=data,
                    css_rewriter=self.css_rewriter,
                    js_rewriter=self.js_rewriter,
                    url_rewriter=self.url_rewriter,
                )
            )
            is not None
        ):
            self.send(rewritten)
            return
        self.send(data)

    def handle_entityref(self, name: str):
        """Overwrite handle_entityref from HTMLParser"""
        self.send(f"&{name};")

    def handle_charref(self, name: str):
        """Overwrite handle_charref from HTMLParser"""
        self.send(f"&#{name};")

    def handle_comment(self, data: str):
        """Overwrite handle_comment from HTMLParser"""
        self.send(f"<!--{data}-->")

    def handle_decl(self, decl: str):
        """Overwrite handle_decl from HTMLParser"""
        self.send(f"<!{decl}>")

    def handle_pi(self, data: str):
        """Overwrite handle_pi from HTMLParser"""
        self.send(f"<?{data}>")

    def unknown_decl(self, data: str):
        """Overwrite unknown_decl from HTMLParser"""
        self.handle_decl(data)  # pragma: no cover
# Signatures of the functions which can be registered as rules. Their parameters are
# left open (...) because every rule declares only the arguments it needs, among those
# of the matching HTMLRewritingRules.do_* method.

# Returns True when the attribute must be dropped
DropAttributeCallable = Callable[..., bool]
# Returns the rewritten (name, value), or None when the rule does not rewrite anything
RewriteAttributeCallable = Callable[..., AttrNameAndValue | None]
# Returns the rewritten start tag, or None when the rule does not handle the tag
RewriteTagCallable = Callable[..., str | None]
# Returns the rewritten data, or None when the rule does not handle the data
RewriteDataCallable = Callable[..., str | None]
@dataclass(frozen=True)
class DropAttributeRule:
    """A rule specifying when an HTML attribute should be dropped"""

    # Function implementing the rule; the attribute is dropped when it returns True
    func: DropAttributeCallable
@dataclass(frozen=True)
class RewriteAttributeRule:
    """A rule specifying how a given HTML attribute should be rewritten"""

    # Function implementing the rule; returns the new (name, value) of the attribute,
    # or None when the attribute is not rewritten by this rule
    func: RewriteAttributeCallable
@dataclass(frozen=True)
class RewriteTagRule:
    """A rule specifying how a given HTML tag should be rewritten"""

    # Function implementing the rule; returns the whole rewritten start tag, or None
    # when the tag is not handled by this rule
    func: RewriteTagCallable
@dataclass(frozen=True)
class RewriteDataRule:
    """A rule specifying how a given HTML data should be rewritten"""

    # Function implementing the rule; returns the rewritten data, or None when the
    # data is not handled by this rule
    func: RewriteDataCallable
def _check_decorated_func_signature(
    expected_func: Callable[..., Any], decorated_func: Callable[..., Any]
):
    """Validate the parameters of a function about to be registered as a rule

    Every parameter of `decorated_func` must exist in `expected_func` with the very
    same annotation. The decorated function is free to declare only a subset of the
    expected parameters.

    Raises TypeError on the first parameter which is unknown or wrongly annotated.
    """
    supported = _cached_signature(expected_func).parameters
    for param_name, param in _cached_signature(decorated_func).parameters.items():
        reference = supported.get(param_name)
        if reference is None:
            raise TypeError(
                f"Parameter '{param_name}' is unsupported in function "
                f"'{decorated_func.__name__}'"
            )
        if reference.annotation != param.annotation:
            raise TypeError(
                f"Parameter '{param_name}' in function '{decorated_func.__name__}' "
                f"must be of type '{reference.annotation}'"
            )
class HTMLRewritingRules:
    """Registry of the rules used to rewrite HTML documents

    Rules are plain functions registered with the decorators of this class
    (drop_attribute, rewrite_attribute, rewrite_tag, rewrite_data). A rule declares
    only the arguments it needs: the do_* methods pass to each rule the subset of
    their own arguments which the rule has in its signature.
    """

    def __init__(self) -> None:
        self.drop_attribute_rules: set[DropAttributeRule] = set()
        self.rewrite_attribute_rules: set[RewriteAttributeRule] = set()
        self.rewrite_tag_rules: set[RewriteTagRule] = set()
        self.rewrite_data_rules: set[RewriteDataRule] = set()

    @staticmethod
    def _call_rule(rule_func: Callable[..., Any], /, **available: Any) -> Any:
        """Call a rule function with the available arguments it declares, no more"""
        declared = _cached_signature(rule_func).parameters
        return rule_func(
            **{
                arg_name: arg_value
                for arg_name, arg_value in available.items()
                if arg_name in declared
            }
        )

    def drop_attribute(
        self,
    ) -> Callable[[DropAttributeCallable], DropAttributeCallable]:
        """Decorator registering a function as an attribute dropping rule"""

        def decorator(rule_func: DropAttributeCallable) -> DropAttributeCallable:
            _check_decorated_func_signature(self.do_drop_attribute, rule_func)
            self.drop_attribute_rules.add(DropAttributeRule(func=rule_func))
            return rule_func

        return decorator

    def rewrite_attribute(
        self,
    ) -> Callable[[RewriteAttributeCallable], RewriteAttributeCallable]:
        """Decorator registering a function as an attribute rewriting rule"""

        def decorator(rule_func: RewriteAttributeCallable) -> RewriteAttributeCallable:
            _check_decorated_func_signature(self.do_attribute_rewrite, rule_func)
            self.rewrite_attribute_rules.add(RewriteAttributeRule(func=rule_func))
            return rule_func

        return decorator

    def rewrite_tag(
        self,
    ) -> Callable[[RewriteTagCallable], RewriteTagCallable]:
        """Decorator registering a function as a tag rewriting rule

        To be used when the whole start tag has to be rewritten. Such a rule also
        handles startend tags (autoclosing tags).
        """

        def decorator(rule_func: RewriteTagCallable) -> RewriteTagCallable:
            _check_decorated_func_signature(self.do_tag_rewrite, rule_func)
            self.rewrite_tag_rules.add(RewriteTagRule(func=rule_func))
            return rule_func

        return decorator

    def rewrite_data(
        self,
    ) -> Callable[[RewriteDataCallable], RewriteDataCallable]:
        """Decorator registering a function as a data rewriting rule

        To be used when the data (content) of a tag has to be rewritten.
        """

        def decorator(rule_func: RewriteDataCallable) -> RewriteDataCallable:
            _check_decorated_func_signature(self.do_data_rewrite, rule_func)
            self.rewrite_data_rules.add(RewriteDataRule(func=rule_func))
            return rule_func

        return decorator

    def do_drop_attribute(
        self, tag: str, attr_name: str, attr_value: str | None, attrs: AttrsList
    ) -> bool:
        """Run the attribute dropping rules on one attribute

        Returns True as soon as one rule returns True, False when none does.
        """
        for rule in self.drop_attribute_rules:
            if (
                self._call_rule(
                    rule.func,
                    tag=tag,
                    attr_name=attr_name,
                    attr_value=attr_value,
                    attrs=attrs,
                )
                is True
            ):
                return True
        return False

    def do_attribute_rewrite(
        self,
        tag: str,
        attr_name: str,
        attr_value: str | None,
        attrs: AttrsList,
        js_rewriter: JsRewriter,
        css_rewriter: CssRewriter,
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
        notify_js_module: Callable[[ZimPath], None] | None,
    ) -> AttrNameAndValue:
        """Run the attribute rewriting rules on one attribute

        An attribute without value is returned untouched, without calling any rule.
        Otherwise all rules are called in turn; once a rule has rewritten the
        attribute, following rules receive the rewritten name and value.

        Returns the final attribute name and value.
        """
        if attr_value is None:
            return attr_name, None
        for rule in self.rewrite_attribute_rules:
            rewritten = self._call_rule(
                rule.func,
                tag=tag,
                attr_name=attr_name,
                attr_value=attr_value,
                attrs=attrs,
                js_rewriter=js_rewriter,
                css_rewriter=css_rewriter,
                url_rewriter=url_rewriter,
                base_href=base_href,
                notify_js_module=notify_js_module,
            )
            if rewritten is not None:
                attr_name, attr_value = rewritten
        return attr_name, attr_value

    def do_tag_rewrite(
        self,
        tag: str,
        attrs: AttrsList,
        url_rewriter: ArticleUrlRewriter,
        base_href: str | None,
        *,
        auto_close: bool,
    ) -> str | None:
        """Run the tag rewriting rules on one start tag

        Returns the result of the first rule returning something else than None, or
        None when no rule handles this tag.
        """
        for rule in self.rewrite_tag_rules:
            rewritten = self._call_rule(
                rule.func,
                tag=tag,
                attrs=attrs,
                url_rewriter=url_rewriter,
                base_href=base_href,
                auto_close=auto_close,
            )
            if rewritten is not None:
                return rewritten
        return None

    def do_data_rewrite(
        self,
        html_rewrite_context: str | None,
        data: str,
        css_rewriter: CssRewriter,
        js_rewriter: JsRewriter,
        url_rewriter: ArticleUrlRewriter,
    ) -> str | None:
        """Run the data rewriting rules on one chunk of data

        Returns the result of the first rule returning something else than None, or
        None when no rule handles this data.
        """
        for rule in self.rewrite_data_rules:
            rewritten = self._call_rule(
                rule.func,
                html_rewrite_context=html_rewrite_context,
                data=data,
                css_rewriter=css_rewriter,
                js_rewriter=js_rewriter,
                url_rewriter=url_rewriter,
            )
            if rewritten is not None:
                return rewritten
        return None
# Module-level rules registry: the decorated functions below register themselves on
# it, and the HtmlRewriter handlers call its do_* methods
rules = HTMLRewritingRules()
@rules.drop_attribute()
def drop_script_integrity_attribute(tag: str, attr_name: str):
    """Tell that the integrity attribute of <script> tags has to be dropped"""
    return (tag, attr_name) == ("script", "integrity")
@rules.drop_attribute()
def drop_link_integrity_attribute(tag: str, attr_name: str):
    """Tell that the integrity attribute of <link> tags has to be dropped"""
    return (tag, attr_name) == ("link", "integrity")
@rules.rewrite_attribute()
def rewrite_meta_charset_content(
    tag: str, attr_name: str, attrs: AttrsList
) -> AttrNameAndValue | None:
    """Force the charset declared in <meta> tags to UTF-8

    Both declarations are handled: <meta charset='xxx'> and
    <meta http-equiv='content-type' content='text/html; charset=xxx'>
    """
    if tag != "meta":
        return None
    if attr_name == "charset":
        return (attr_name, "UTF-8")
    if attr_name != "content":
        return None
    # The content attribute is rewritten only on the tag declaring the content type
    for other_name, other_value in attrs:
        if (
            other_name.lower() == "http-equiv"
            and other_value
            and other_value.lower() == "content-type"
        ):
            return (attr_name, "text/html; charset=UTF-8")
    return None
@rules.rewrite_attribute()
def rewrite_onxxx_tags(
    attr_name: str, attr_value: str | None, js_rewriter: JsRewriter
) -> AttrNameAndValue | None:
    """Rewrite the JS code of onxxx attributes (onclick, onload, ...)

    Attributes whose name starts with "on-" are left untouched.
    """
    if not attr_value:
        return None
    if not attr_name.startswith("on") or attr_name.startswith("on-"):
        return None
    return (attr_name, js_rewriter.rewrite(attr_value))
@rules.rewrite_attribute()
def rewrite_style_tags(
    attr_name: str, attr_value: str | None, css_rewriter: CssRewriter
) -> AttrNameAndValue | None:
    """Rewrite the inline CSS of style attributes"""
    if attr_name != "style" or not attr_value:
        return None
    return (attr_name, css_rewriter.rewrite_inline(attr_value))
@rules.rewrite_attribute()
def rewrite_href_src_attributes(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    attrs: AttrsList,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
    notify_js_module: Callable[[ZimPath], None] | None,
):
    """Rewrite the URL of href and src attributes

    Empty values and data: URLs are left untouched.

    When the tag refers to a JS module, the path of the target item is also passed
    to `notify_js_module` (if set), so that this script is properly rewritten when
    encountered later on.
    """
    if attr_name not in ("href", "src"):
        return None
    if not attr_value or attr_value.startswith("data:"):
        return None

    if (
        notify_js_module
        and get_html_rewrite_context(tag=tag, attrs=attrs) == "js-module"
    ):
        module_path = url_rewriter.get_item_path(attr_value, base_href=base_href)
        notify_js_module(module_path)

    rewrite_result = url_rewriter(
        attr_value, base_href=base_href, rewrite_all_url=tag != "a"
    )
    return (attr_name, rewrite_result.rewriten_url)
@rules.rewrite_attribute()
def rewrite_srcset_attribute(
    attr_name: str,
    attr_value: str | None,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
):
    """Rewrite the URLs of srcset attributes

    The value is a comma-separated list of candidates, each made of a URL optionally
    followed by a descriptor (e.g. "img.png 2x"). Every URL is rewritten, descriptors
    are kept.

    Returns None (no rewrite) for other attributes and for empty values.
    """
    if attr_name != "srcset" or not attr_value:
        return None
    new_candidates: list[str] = []
    for candidate in attr_value.split(","):
        # URL and descriptor might be separated by any whitespace (space, tab,
        # newline), not only a single space
        parts = candidate.strip().split(maxsplit=1)
        if not parts:
            # Empty candidate (e.g. trailing comma): there is no URL to rewrite
            continue
        url, *descriptor = parts
        new_url = url_rewriter(url, base_href=base_href).rewriten_url
        new_candidates.append(" ".join([new_url, *descriptor]))
    return (attr_name, ", ".join(new_candidates))
@rules.rewrite_tag()
def rewrite_base_tag(tag: str, attrs: AttrsList, *, auto_close: bool):
    """Strip the href attribute from <base> tags

    This needs a tag rule because the resulting tag might have no attribute left, in
    which case the whole tag is dropped (empty string returned).

    Returns None for any other tag and for <base> tags without href value, so that
    other rules are processed as well.
    """
    if tag != "base" or get_attr_value_from(attrs, "href") is None:
        return None

    kept_attrs = [
        format_attr(attr_name, attr_value)
        for attr_name, attr_value in attrs
        if attr_name != "href"
    ]
    values = " ".join(kept_attrs)
    if not values:
        return ""  # drop whole tag
    closing = "/>" if auto_close else ">"
    return f"<base {values}{closing}"
@rules.rewrite_data()
def rewrite_css_data(
    html_rewrite_context: str | None, data: str, css_rewriter: CssRewriter
) -> str | None:
    """Rewrite the CSS found inside <style> tags"""
    return css_rewriter.rewrite(data) if html_rewrite_context == "style" else None
@rules.rewrite_data()
def rewrite_json_data(
    html_rewrite_context: str | None,
) -> str | None:
    """Placeholder for the rewriting of inline JSON, which is never modified for now

    There is no JSON rewriting left ATM since all these rules are applied in
    Browsertrix crawler before storing the WARC record.
    """
    # Whatever the context ("json" or anything else), the data is not handled
    return None
@rules.rewrite_data()
def rewrite_js_data(
    html_rewrite_context: str | None,
    data: str,
    js_rewriter: JsRewriter,
) -> str | None:
    """Rewrite the JS found inside <script> tags

    Only "js-*" contexts are handled; the rewriter is told whether the code is a
    module or classic JS.
    """
    if html_rewrite_context is None or not html_rewrite_context.startswith("js-"):
        return None
    is_module = html_rewrite_context == "js-module"
    return js_rewriter.rewrite(data, opts={"isModule": is_module})
@rules.rewrite_attribute()
def rewrite_meta_http_equiv_redirect(
    tag: str,
    attr_name: str,
    attr_value: str | None,
    attrs: AttrsList,
    url_rewriter: ArticleUrlRewriter,
    base_href: str | None,
) -> AttrNameAndValue | None:
    """Rewrite redirect URL in meta http-equiv refresh

    Only the `content` attribute of <meta http-equiv="refresh"> tags is handled, and
    only when it matches HTTP_EQUIV_REDIRECT_RE (i.e. it holds a redirect URL). The
    interval is kept and the URL is rewritten.

    Returns None (no rewrite) in all other cases.
    """
    if tag != "meta" or attr_name != "content" or not attr_value:
        return None
    http_equiv = get_attr_value_from(attrs, "http-equiv")
    # The http-equiv keyword is compared case-insensitively ("Refresh" and "REFRESH"
    # are common spellings), like it is done for content-type
    if http_equiv is None or http_equiv.lower() != "refresh":
        return None
    if (match := HTTP_EQUIV_REDIRECT_RE.match(attr_value)) is None:
        return None
    return (
        attr_name,
        f"{match['interval']};"
        f"url={url_rewriter(match['url'], base_href=base_href).rewriten_url}",
    )