|
13 | 13 | import json |
14 | 14 | import os |
15 | 15 | import sys |
| 16 | +from html import unescape |
16 | 17 | from html.parser import HTMLParser |
17 | 18 | from time import strftime |
18 | 19 |
|
@@ -957,6 +958,90 @@ def lang_name(code): |
957 | 958 | r.append(f" {lang} (☱unknown☷)") |
958 | 959 | self.apl(r) |
959 | 960 |
|
def lang_report(self):
    """
    Report elements that carry an explicit language attribute.

    Parses self.wbuf (the whole file as one string) with HTMLParser and
    collects a (tag, lang, text) tuple for every element other than
    <html> that has a non-empty lang attribute (xml:lang is used when
    lang is absent). The text is the element's content with markup
    removed, entities decoded and whitespace normalized; elements with
    no text are ignored, and identical tuples are reported once.

    Unlike a single regex scan, the parser also reports elements nested
    inside another lang-bearing element, keeps the full content when
    elements of the same name are nested, and is insensitive to tag and
    attribute case and to attribute quoting.

    In verbose mode (self.verbose) every tuple is listed three times, in
    three sort orders; otherwise only a per-language count of the unique
    tuples is listed. The report lines are handed to self.apl().
    """
    # Void elements never get a closing tag, so they must not be pushed
    # on the open-element stack (they also cannot contain text).
    void_tags = frozenset(
        (
            "area", "base", "br", "col", "embed", "hr", "img",
            "input", "link", "meta", "param", "source", "track", "wbr",
        )
    )

    class LangCollector(HTMLParser):
        """Gather (tag, lang, text) for each element with a lang attribute."""

        def __init__(self):
            super().__init__(convert_charrefs=True)
            # One (tag, lang-or-None, text parts) entry per open element.
            self.open_elements = []
            self.found = set()

        def handle_starttag(self, tag, attrs):
            if tag in void_tags:
                return
            lang = None
            # The <html> element itself is deliberately not reported.
            if tag != "html":
                attr_map = {}
                for name, value in attrs:
                    attr_map.setdefault(name, value)
                lang = attr_map.get("lang") or attr_map.get("xml:lang")
            self.open_elements.append((tag, lang or None, []))

        def handle_endtag(self, tag):
            open_tags = [entry[0] for entry in self.open_elements]
            if tag not in open_tags:
                return  # stray closing tag; nothing to close
            # Close the innermost matching element together with anything
            # left open inside it.
            depth = len(open_tags) - 1 - open_tags[::-1].index(tag)
            while len(self.open_elements) > depth:
                self.finish(self.open_elements.pop())

        def handle_data(self, data):
            # Text belongs to every enclosing element that has a language.
            for _, lang, parts in self.open_elements:
                if lang is not None:
                    parts.append(data)

        def finish(self, element):
            tag, lang, parts = element
            if lang is None:
                return
            text = re.sub(r'\s+', ' ', "".join(parts)).strip()
            if text:
                self.found.add((tag, lang, text))

        def finish_all(self):
            # Elements still open at end of input are closed implicitly.
            while self.open_elements:
                self.finish(self.open_elements.pop())

    collector = LangCollector()
    collector.feed(self.wbuf)
    collector.close()
    collector.finish_all()

    # The set already removed duplicates; sort for a stable base order.
    items = sorted(collector.found)

    r = [f"[info] lang attribute report\n {len(items)} unique elements with 'lang' attribute"]
    if not items:
        self.apl(r)
        return

    if self.verbose:
        # In verbose mode, report everything (can be lengthy)
        tag_w = max(len(tag) for tag, _, _ in items)
        lang_w = max(len(lang) for _, lang, _ in items)

        # Each listing is the same data under a different case-insensitive key.
        listings = (
            (
                " sorted by tag, then language:",
                lambda x: (x[0].lower(), x[1].lower(), x[2].lower()),
            ),
            (
                " sorted by content:",
                lambda x: x[2].lower(),
            ),
            (
                " sorted by language, then content:",
                lambda x: (x[1].lower(), x[2].lower()),
            ),
        )
        for title, sort_key in listings:
            r.append("")
            r.append(title)
            for tag, lang, text in sorted(items, key=sort_key):
                r.append(f" {tag:<{tag_w}} {lang:<{lang_w}} {text}")
    else:
        # If not verbose, report only a count of unique entries per language
        lang_counts = {}
        for _, lang, _ in items:
            lang_counts[lang] = lang_counts.get(lang, 0) + 1

        for lang in sorted(lang_counts):
            times_label = "times" if lang_counts[lang] > 1 else "time"
            r.append(f" lang={lang} seen {lang_counts[lang]} {times_label}")

    self.apl(r)
| 1044 | + |
960 | 1045 | def headingOutline(self): |
961 | 1046 | """ |
962 | 1047 | show document |
@@ -1114,6 +1199,7 @@ def ppvTests(self): |
1114 | 1199 | self.DTDcheck() |
1115 | 1200 | self.altTags() |
1116 | 1201 | self.lang_check() |
| 1202 | + self.lang_report() |
1117 | 1203 | self.headingOutline() |
1118 | 1204 |
|
1119 | 1205 | # -------------------------------------------------------------------------------------- |
|
0 commit comments