diff --git a/pphtml.py b/pphtml.py index 8815aac..f934506 100644 --- a/pphtml.py +++ b/pphtml.py @@ -13,6 +13,7 @@ import json import os import sys +from html import unescape from html.parser import HTMLParser from time import strftime @@ -957,6 +958,90 @@ def lang_name(code): r.append(f" {lang} (☱unknown☷)") self.apl(r) + def lang_report(self): + """ + Report all elements with an explicit lang attribute, showing + tag name, language code, and text content, sorted three ways. + """ + # Collect (tag, lang, content) tuples for elements with explicit lang attrs. + # Use wbuf (whole file) to handle multi-line elements. + # Skip the tag itself. + items = [] + for m in re.finditer( + r'<(?!html\b)(\w+)\b([^>]*?\blang=["\']([^"\']+)["\'][^>]*)>(.*?)', + self.wbuf, + flags=re.DOTALL, + ): + tag = m.group(1) + lang = m.group(3) + raw_content = m.group(4) + # Strip inner HTML tags to get text only + text = re.sub(r'<[^>]+>', '', raw_content) + # Decode HTML entities + text = unescape(text) + # Normalize whitespace + text = re.sub(r'\s+', ' ', text).strip() + if text: + items.append((tag, lang, text)) + + # Deduplicate while preserving tuple structure + items = sorted(set(items)) + + r = [f"[info] lang attribute report\n {len(items)} unique elements with 'lang' attribute"] + if not items: + self.apl(r) + return + + # In verbose mode, report everything (can be lengthy) + if self.verbose: + # Determine column widths + tag_w = max(len(tag) for tag, _, _ in items) + lang_w = max(len(lang) for _, lang, _ in items) + + def fmt(tag, lang, text): + return f" {tag:<{tag_w}} {lang:<{lang_w}} {text}" + + # Sort 1: by tag, then language, then content (case-insensitive) + r.append("") + r.append(" sorted by tag, then language:") + for tag, lang, text in sorted( + items, key=lambda x: (x[0].lower(), x[1].lower(), x[2].lower()) + ): + r.append(fmt(tag, lang, text)) + + # Sort 2: by content (case-insensitive) + r.append("") + r.append(" sorted by content:") + for tag, lang, text in sorted( + items, key=lambda x: x[2].lower() + ): + r.append(fmt(tag, lang, text)) + + # Sort 3: by language, then content (case-insensitive) + r.append("") + r.append(" sorted by language, then content:") + for tag, lang, text in sorted( + items, key=lambda x: (x[1].lower(), x[2].lower()) + ): + r.append(fmt(tag, lang, text)) + + # If not verbose, report only a count of tags using each language + else: + lang_counts = {} + for tag, lang, text in sorted( + items, key=lambda x: (x[1].lower(), x[2].lower()) + ): + if lang in lang_counts: + lang_counts[lang] += 1 + else: + lang_counts[lang] = 1 + + for lang in sorted(lang_counts): + times_label = "times" if lang_counts[lang] > 1 else "time" + r.append(f" lang={lang} seen {lang_counts[lang]} {times_label}") + + self.apl(r) + def headingOutline(self): """ show document @@ -1114,6 +1199,7 @@ def ppvTests(self): self.DTDcheck() self.altTags() self.lang_check() + self.lang_report() self.headingOutline() # --------------------------------------------------------------------------------------