Skip to content

Commit e1f0326

Browse files
tangledhelixcpeel
authored and committed
lang attribute report
Reports on 'lang' attributes in two modes. In regular mode, counts all lang attributes by language, producing a report of only how many tags have a lang attribute for each language. In verbose mode, prints a report sorted three ways: 1. by tag, then language; 2. by tag content; 3. by language, then content. This ports a feature from the legacy 'pptools' software.
1 parent ac56d74 commit e1f0326

1 file changed

Lines changed: 86 additions & 0 deletions

File tree

pphtml.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import json
1414
import os
1515
import sys
16+
from html import unescape
1617
from html.parser import HTMLParser
1718
from time import strftime
1819

@@ -957,6 +958,90 @@ def lang_name(code):
957958
r.append(f" {lang} (☱unknown☷)")
958959
self.apl(r)
959960

961+
def lang_report(self):
    """
    Report elements that carry an explicit lang attribute.

    Scans self.wbuf (the whole file as one string, so elements that span
    several lines are handled) and collects a (tag, lang, text) tuple for
    every element other than <html> whose start tag has a lang (or
    prefixed, e.g. xml:lang) attribute and whose text content is not
    empty. Identical tuples are collapsed into one.

    The report is handed to self.apl() as a list of lines:
      - always: a header with the number of unique tuples;
      - if self.verbose is true: the tuples listed three times, sorted by
        tag/language, by content, and by language/content;
      - otherwise: one line per language code with the number of unique
        tuples using it.

    Known limitations of the regex-based scan: the content of an element
    ends at the first closing tag of the same name, so an element that
    contains a nested element of the same tag name is truncated there.

    NOTE(review): the per-language counts are taken after de-duplication,
    so an identical element repeated many times counts once - confirm
    this is the intended meaning of "seen N times".
    """
    # The whole pattern sits inside a zero-width lookahead so that matches
    # may overlap: a plain finditer() would swallow an inner lang-bearing
    # element as part of the outer element's match and never report it.
    # (?<![\w-]) keeps attributes such as data-lang= from being mistaken
    # for lang= while still accepting xml:lang=.
    element_re = re.compile(
        r'(?=<(?!html\b)(\w+)\b([^>]*?(?<![\w-])lang=["\']([^"\']+)["\'][^>]*)>(.*?)</\1>)',
        flags=re.DOTALL,
    )

    found = set()
    for m in element_re.finditer(self.wbuf):
        tag = m.group(1)
        lang = m.group(3)
        # Strip inner HTML tags to get text only
        text = re.sub(r'<[^>]+>', '', m.group(4))
        # Decode HTML entities
        text = unescape(text)
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        if text:
            found.add((tag, lang, text))

    # Unique tuples in a deterministic base order; the stable sorts below
    # fall back to this order for ties.
    items = sorted(found)

    r = [f"[info] lang attribute report\n {len(items)} unique elements with 'lang' attribute"]
    if not items:
        self.apl(r)
        return

    # In verbose mode, report everything (can be lengthy)
    if self.verbose:
        # Determine column widths
        tag_w = max(len(tag) for tag, _, _ in items)
        lang_w = max(len(lang) for _, lang, _ in items)

        def fmt(tag, lang, text):
            return f" {tag:<{tag_w}} {lang:<{lang_w}} {text}"

        # (section title, case-insensitive sort key) for each listing
        sections = (
            (
                " sorted by tag, then language:",
                lambda x: (x[0].lower(), x[1].lower(), x[2].lower()),
            ),
            (
                " sorted by content:",
                lambda x: x[2].lower(),
            ),
            (
                " sorted by language, then content:",
                lambda x: (x[1].lower(), x[2].lower()),
            ),
        )
        for title, sort_key in sections:
            r.append("")
            r.append(title)
            r.extend(fmt(*item) for item in sorted(items, key=sort_key))

    # If not verbose, report only a count of tags using each language
    else:
        lang_counts = {}
        for _, lang, _ in items:
            lang_counts[lang] = lang_counts.get(lang, 0) + 1

        for lang in sorted(lang_counts):
            times_label = "times" if lang_counts[lang] > 1 else "time"
            r.append(f" lang={lang} seen {lang_counts[lang]} {times_label}")

    self.apl(r)
1044+
9601045
def headingOutline(self):
9611046
"""
9621047
show document
@@ -1114,6 +1199,7 @@ def ppvTests(self):
11141199
self.DTDcheck()
11151200
self.altTags()
11161201
self.lang_check()
1202+
self.lang_report()
11171203
self.headingOutline()
11181204

11191205
# --------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)