-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert_articles.py
More file actions
275 lines (219 loc) · 9.12 KB
/
convert_articles.py
File metadata and controls
275 lines (219 loc) · 9.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/env python3
"""
Convert clean_articles JSON files to Hugo content Markdown files.
Reads from ../support/clean_articles/ and writes to site/content/knowledgebase/
"""
import json
import os
import re
import html
from pathlib import Path
from datetime import datetime
# Section ID -> human-readable section name mapping
# Derived from the original Zendesk category/section structure.
# Several distinct Zendesk section IDs intentionally collapse into the
# same display section (e.g. three IDs all map to "Power & Charging").
# NOTE(review): "weight" is not read anywhere in this script — presumably
# intended for Hugo menu/section ordering; confirm before relying on it.
SECTION_MAP = {
# Power & Charging
22616709355149: {"name": "Power & Charging", "category": "Ground Control", "weight": 10},
30079786216589: {"name": "Power & Charging", "category": "Ground Control", "weight": 10},
24719118093069: {"name": "Power & Charging", "category": "Ground Control", "weight": 10},
# Features & Usage
24192735950861: {"name": "Features & Usage", "category": "Ground Control", "weight": 20},
24192749929997: {"name": "Features & Usage", "category": "Ground Control", "weight": 20},
22969025694477: {"name": "Features & Usage", "category": "Ground Control", "weight": 20},
30751243857165: {"name": "Features & Usage", "category": "Ground Control", "weight": 20},
# Communication & Contacts
22616723137805: {"name": "Communication & Contacts", "category": "Ground Control", "weight": 30},
# Media & Data
25293321386253: {"name": "Media & Data", "category": "Ground Control", "weight": 40},
# Connectivity & Travel
22968771221645: {"name": "Connectivity & Travel", "category": "Ground Control", "weight": 50},
# Account & Billing
22968808982541: {"name": "Account & Billing", "category": "Ground Control", "weight": 60},
25050527005709: {"name": "Account & Billing", "category": "Ground Control", "weight": 60},
# Safety & Recalls
22968788577549: {"name": "Safety & Recalls", "category": "Ground Control", "weight": 70},
# Maintenance
30092452654477: {"name": "Maintenance", "category": "Ground Control", "weight": 80},
}
# Fallback used by convert_article() when an article's section_id is not
# present in SECTION_MAP.
DEFAULT_SECTION = {"name": "General", "category": "Ground Control", "weight": 90}
# Set of valid article slugs (populated at startup by main())
# Used by clean_html_body() to detect links to unrecovered articles
VALID_SLUGS = set()
def slugify(title):
    """Return a URL-friendly slug derived from an article title.

    Lowercases, drops punctuation, turns whitespace/underscores into
    hyphens, collapses hyphen runs, and trims edge hyphens.
    """
    lowered = title.lower()
    # Keep only word chars, whitespace and hyphens.
    cleaned = re.sub(r'[^\w\s-]', '', lowered)
    # Whitespace and underscores become single hyphens.
    hyphenated = re.sub(r'[\s_]+', '-', cleaned)
    # Collapse any remaining hyphen runs and trim the ends.
    return re.sub(r'-+', '-', hyphenated).strip('-')
def clean_unicode(text):
    """Clean unrenderable Unicode characters from text.

    - Replaces non-breaking spaces (U+00A0, U+202F) with regular spaces
    - Removes Private Use Area characters (U+E000–U+F8FF) — icon font glyphs
      from Humane's proprietary "AiPin Symbols VF" font that render as blank
      boxes/tofu without the font
    """
    if not text:
        return text
    # Non-breaking space variants -> plain ASCII space.
    for nbsp in ('\u00A0', '\u202F'):
        text = text.replace(nbsp, ' ')
    # Drop Private Use Area glyphs entirely.
    return re.sub(r'[\uE000-\uF8FF]', '', text)
def clean_html_body(body):
    """Clean up Zendesk-specific HTML in article bodies.

    Pipeline (order matters — link stripping relies on the link rewrite
    having already run):
      1. Rewrite image attachment URLs to local /media/ paths.
      2. Rewrite internal article links to local knowledgebase paths.
      3. Downgrade links to unrecovered articles to plain text.
      4. Strip presentation attributes (style, data-*, dir) and Tailwind
         wrapper divs left behind by the Zendesk chat widget.
      5. Normalize whitespace.

    Relies on the module-level VALID_SLUGS set being populated before the
    first call (see main()).

    Returns the cleaned HTML string; falsy input yields "".
    """
    if not body:
        return ""
    # Rewrite image URLs to use local media paths
    # Pattern: src="https://support.humane.com/hc/article_attachments/NNNNN"
    body = re.sub(
        r'src="https://support\.humane\.com/hc/article_attachments/(\d+)"',
        r'src="/media/\1.png"',
        body
    )
    # Also handle already-rewritten paths from articles_final
    body = re.sub(
        r'src="media/(\d+)\.\w+"',
        r'src="/media/\1.png"',
        body
    )
    # Rewrite internal article links to use our local paths
    def rewrite_article_link(match):
        # Group 1 (the numeric article id) is not needed: the slug alone
        # identifies the local page.
        new_slug = match.group(2).lower()
        return f'href="/reference/knowledgebase/{new_slug}/"'
    body = re.sub(
        r'href="https://support\.humane\.com/hc/en-us/articles/(\d+)-([^"]+)"',
        rewrite_article_link,
        body
    )
    # Convert links to unrecovered articles into plain text
    # After rewriting, internal links look like:
    #   <a href="/reference/knowledgebase/SLUG/">text</a>
    def strip_dead_link(match):
        slug = match.group(1)
        inner_text = match.group(2)
        if slug not in VALID_SLUGS:
            return inner_text  # Just the text, no link
        return match.group(0)  # Keep valid links as-is
    body = re.sub(
        r'<a[^>]*href="/reference/knowledgebase/([^/"]+)/"[^>]*>(.*?)</a>',
        strip_dead_link,
        body,
        flags=re.DOTALL
    )
    # Strip ALL inline style attributes (both populated and empty)
    # These come from Zendesk/Tailwind and are massive bloat
    body = re.sub(r'\s*style="[^"]*"', '', body)
    # Also handle single-quoted style attributes just in case
    body = re.sub(r"\s*style='[^']*'", '', body)
    # Strip data-* attributes
    body = re.sub(r'\s*data-[\w-]+="[^"]*"', '', body)
    # Strip dir attributes
    body = re.sub(r'\s*dir="[^"]*"', '', body)
    # Remove Tailwind/ChatGPT wrapper divs — these are Zendesk chat widget
    # artifacts that wrap normal content in deeply nested divs with Tailwind classes.
    # We strip the opening and closing div tags but keep inner content.
    # Match divs whose class contains Tailwind utility classes
    tailwind_classes = [
        'flex', 'text-message', 'markdown', 'prose', 'dark:prose-invert',
        'items-center', 'gap-', 'rounded-xl', 'empty:hidden', 'break-words',
        'whitespace-normal', 'flex-col', 'flex-grow', 'w-full', 'min-h-',
        'justify-start', 'mb-2', '-ml-2'
    ]
    tailwind_pattern = '|'.join(re.escape(c) for c in tailwind_classes)
    # Remove opening div tags with Tailwind classes
    body = re.sub(
        r'<div\s+class="[^"]*(?:' + tailwind_pattern + r')[^"]*"[^>]*>',
        '',
        body
    )
    # Remove the corresponding closing </div> tags left orphaned.
    # Strategy: count opening <div> vs closing </div> and remove excess closing tags.
    # First, remove any </div> that appears on a line by itself (whitespace only)
    body = re.sub(r'^\s*</div>\s*$', '', body, flags=re.MULTILINE)
    # Clean up excessive whitespace in tags
    body = re.sub(r'\s+>', '>', body)
    # Collapse multiple spaces within tags to single space
    body = re.sub(r'(<[^>]*?)\s{2,}([^>]*?>)', r'\1 \2', body)
    # Clean up excessive blank lines (3+ newlines -> 2)
    body = re.sub(r'\n{3,}', '\n\n', body)
    return body
def extract_text_preview(body, max_length=200):
    """Extract a plain-text preview from an HTML body for the description.

    Tags are replaced with spaces, entities decoded, whitespace collapsed.
    Output longer than max_length is cut at a word boundary and suffixed
    with '...'.
    """
    if not body:
        return ""
    # Replace tags with spaces so adjacent words don't fuse together.
    stripped = re.sub(r'<[^>]+>', ' ', body)
    # Decode entities, then collapse all runs of whitespace.
    decoded = html.unescape(stripped)
    text = re.sub(r'\s+', ' ', decoded).strip()
    if len(text) <= max_length:
        return text
    # Truncate at the last space within the limit, then mark the cut.
    return text[:max_length].rsplit(' ', 1)[0] + '...'
def convert_article(article_path, output_dir):
    """Convert a single article JSON to a Hugo markdown file.

    Args:
        article_path: Path to one clean_articles JSON file.
        output_dir: Directory the generated ``<slug>.md`` is written into.

    Returns:
        The article slug (also the output filename stem).
    """
    # Bodies contain non-ASCII characters (NBSP, PUA glyphs — see
    # clean_unicode), so read/write explicitly as UTF-8 instead of the
    # platform locale default.
    with open(article_path, 'r', encoding='utf-8') as f:
        article = json.load(f)
    article_id = article['id']
    title = article['title']
    body = article.get('body', '')
    created_at = article.get('created_at', '')
    updated_at = article.get('updated_at', '')
    section_id = article.get('section_id', 0)
    original_url = article.get('url', '')
    label_names = article.get('label_names', [])
    section_info = SECTION_MAP.get(section_id, DEFAULT_SECTION)
    section_name = section_info['name']
    category = section_info['category']
    slug = slugify(title)
    body = clean_unicode(body)
    description = extract_text_preview(body)
    cleaned_body = clean_html_body(body)
    # Escape for TOML basic strings: backslashes first (so we don't double
    # the escapes we add), then double quotes.
    safe_title = title.replace('\\', '\\\\').replace('"', '\\"')
    safe_description = description.replace('\\', '\\\\').replace('"', '\\"')
    frontmatter = f"""+++
title = "{safe_title}"
date = "{created_at}"
lastmod = "{updated_at}"
slug = "{slug}"
description = "{safe_description}"
original_url = "{original_url}"
article_id = {article_id}
section_id = {section_id}
section_name = "{section_name}"
category = "{category}"
[params]
section_name = "{section_name}"
category = "{category}"
original_url = "{original_url}"
article_id = {article_id}
+++
"""
    content = frontmatter + cleaned_body + "\n"
    output_path = os.path.join(output_dir, f"{slug}.md")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"  Created: {slug}.md ({section_name})")
    return slug
def main():
    """Convert every clean_articles JSON file into Hugo markdown content.

    Two passes over the input directory:
      1. Build VALID_SLUGS so clean_html_body() can distinguish links to
         recovered articles from dead ones.
      2. Convert each article to a markdown file under output_dir.
    """
    global VALID_SLUGS
    script_dir = Path(__file__).parent
    clean_articles_dir = script_dir / '..' / 'support' / 'clean_articles'
    output_dir = script_dir / 'content' / 'reference' / 'knowledgebase'
    output_dir.mkdir(parents=True, exist_ok=True)
    json_files = sorted(clean_articles_dir.glob('*.json'))
    print(f"Found {len(json_files)} articles to convert\n")
    # Pass 1: build the set of valid slugs (needed for dead link detection).
    # Read as UTF-8 explicitly — article titles may contain non-ASCII.
    for json_file in json_files:
        with open(json_file, encoding='utf-8') as f:
            article = json.load(f)
        VALID_SLUGS.add(slugify(article['title']))
    # Pass 2: convert every article (convert_article prints per-file status).
    for json_file in json_files:
        convert_article(json_file, output_dir)
    print(f"\nDone! Created {len(json_files)} content files in {output_dir}")


if __name__ == '__main__':
    main()