|
// .github/scripts/sync-podcast-feed.js
// Incremental sync of The PowerShell Podcast (Podbean) into content/podcast/.
//
// Design: see docs/adr/0003-incremental-podcast-sync.md.
// The feed currently carries the full archive, but this script is strictly
// ADD-ONLY: it generates an episode file only when no existing file matches,
// and never edits or deletes. That makes a full pass over the feed safe and
// lets the first run backfill the whole gap between the repo and the feed.
//
// Idempotency key (in priority order): enclosure URL, then RSS guid, then
// episode number. The enclosure URL is the universal key — every existing
// modern file carries it as `podcast_url`.
//
// Run `node .github/scripts/sync-podcast-feed.js --dry-run` to preview.
| 15 | + |
| 16 | +import fetch from 'node-fetch'; |
| 17 | +import fs from 'fs'; |
| 18 | +import path from 'path'; |
| 19 | + |
// RSS feed that is fetched and scanned for <item> elements.
const FEED_URL = 'https://feed.podbean.com/powershellpodcast/feed.xml';
// Directory (relative to the working directory) that holds the episode files.
const PODCAST_DIR = path.join('content', 'podcast');
// Show host; always written as `author` and as the first entry of `authors`.
const HOST = 'Andrew Pla';
// Prepended to every feed title; the trailing space is intentional.
const TITLE_PREFIX = 'The PowerShell Podcast ';
// When true (`--dry-run` on the command line) nothing is written to disk.
const DRY_RUN = process.argv.includes('--dry-run');
| 25 | + |
// Recurring boilerplate links lifted/dropped from episode bodies. A body line
// (typically a "Resource Links" bullet) is dropped if it contains any of these.
// Keep this list tight — episode-specific links must survive. youtu.be is
// dropped because the id is lifted into the `youtube` frontmatter field.
// None of the patterns carries the `g` flag, so `.test()` stays stateless.
const BOILERPLATE = [
  /andrewpla\.tech/i,
  /discord\.gg\/pdq/i,
  /powershellsummit\.org/i,
  /youtu\.be\//i,
];
| 36 | + |
| 37 | +// --- XML helpers ----------------------------------------------------------- |
| 38 | + |
// Named references understood by decodeEntities. &nbsp; maps to a plain space.
const NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

// Decode HTML/XML character references in `str` and return the result.
//
// Handles hex (`&#x41;`) and decimal (`&#65;`) numeric references plus the
// names in NAMED_ENTITIES. Everything is decoded in a single pass, so the
// output of one replacement is never decoded again (`&amp;lt;` -> `&lt;`).
// Unknown names and numeric references above U+10FFFF are left untouched
// rather than throwing.
function decodeEntities(str) {
  return str.replace(/&(#x[0-9a-fA-F]+|#\d+|[A-Za-z]+);/g, (raw, ref) => {
    if (ref[0] !== '#') return NAMED_ENTITIES.get(ref) ?? raw;
    const isHex = ref[1] === 'x';
    const code = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws a RangeError outside the Unicode range.
    return code <= 0x10ffff ? String.fromCodePoint(code) : raw;
  });
}
| 50 | + |
// Read the text of a single child tag from an <item> block, unwrapping CDATA.
//
// item: XML text to search.
// name: tag name (may include a namespace prefix such as `itunes:episode`);
//       it is regex-escaped, so names containing `.` etc. match literally.
// Returns the first match's content: the raw CDATA payload when the content is
// one CDATA section, otherwise the entity-decoded text. Returns '' when the
// tag is absent.
function tag(item, name) {
  const safeName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const m = item.match(new RegExp(`<${safeName}(?:\\s[^>]*)?>([\\s\\S]*?)</${safeName}>`, 'i'));
  if (!m) return '';
  const inner = m[1].trim();
  const cdata = inner.match(/^<!\[CDATA\[([\s\S]*?)\]\]>$/);
  if (cdata) return cdata[1];
  return decodeEntities(inner);
}
| 60 | + |
// Read one attribute value from the first `<tagName ...>` element in `item`.
//
// The value must be closed by the same quote character that opened it, so a
// double-quoted value may contain an apostrophe and vice versa. The attribute
// name must be preceded by whitespace, so asking for `url` does not pick up
// `data-url`. The value is returned as written (no entity decoding).
// Returns '' when the tag or attribute is not found.
function attr(item, tagName, attrName) {
  const m = item.match(
    new RegExp(`<${tagName}\\b[^>]*?\\s${attrName}\\s*=\\s*(["'])((?:(?!\\1)[^<])*)\\1`, 'i')
  );
  return m ? m[2] : '';
}
| 65 | + |
| 66 | +// --- Slug + date ----------------------------------------------------------- |
| 67 | + |
// Mirrors Hugo's default slugify closely enough to match the existing modern
// filenames: lowercase, drop apostrophes (straight and curly), collapse runs
// of other non-alphanumeric characters to single hyphens, trim hyphens.
// Returns '' when `s` contains no ASCII letters or digits.
function slugify(s) {
  return s
    .toLowerCase()
    .replace(/['\u2018\u2019]/g, '')
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '');
}
| 78 | + |
// Stringify `n` and left-pad it with zeros to at least two characters.
function pad2(n) {
  return String(n).padStart(2, '0');
}
| 82 | + |
// pubDate -> { date: 'YYYY-MM-DD', iso: 'YYYY-MM-DDTHH:MM:SS+00:00', y, m }
//
// All parts are expressed in UTC. `y` is the numeric year, `m` the zero-padded
// month string. Throws an Error when `pubDate` cannot be parsed, instead of
// returning 'NaN-NaN-NaN' strings that would leak into filenames.
function parseDate(pubDate) {
  const d = new Date(pubDate);
  if (Number.isNaN(d.getTime())) {
    throw new Error(`Unparseable pubDate: ${JSON.stringify(pubDate)}`);
  }
  // toISOString() is always UTC: 'YYYY-MM-DDTHH:MM:SS.sssZ'.
  const stamp = d.toISOString().slice(0, 19);
  return {
    date: stamp.slice(0, 10),
    iso: `${stamp}+00:00`,
    y: d.getUTCFullYear(),
    m: stamp.slice(5, 7),
  };
}
| 92 | + |
| 93 | +// --- Episode number from enclosure filename -------------------------------- |
| 94 | + |
// The repo's episode-number convention is the number embedded in the Podbean
// filename (`..._episode_NNN_...`), NOT itunes:episode (which runs one ahead).
// Most filenames delimit the number (`episode_220_Morten`), but specials glue a
// random suffix onto it (`episode_2298xv9d`). For the ambiguous case, take the
// longest digit prefix that lies within `tolerance` of itunes:episode.
//
// enclosureUrl: URL whose text is searched for `episode_NNN` / `episode-NNN`.
// itunesEp:     itunes:episode as a number, or a falsy value when unknown.
// tolerance:    maximum accepted distance from itunesEp (default 2).
// Returns the episode number, or null when the URL has no `episode_<digits>`.
// When no prefix is within tolerance (or itunesEp is unknown) the whole digit
// run is returned.
function resolveEpisode(enclosureUrl, itunesEp, tolerance = 2) {
  const delimited = enclosureUrl.match(/episode[_-](\d+)[_-]/i);
  if (delimited) return Number.parseInt(delimited[1], 10);

  const glued = enclosureUrl.match(/episode[_-](\d+)/i);
  if (!glued) return null;

  const digits = glued[1];
  const whole = Number.parseInt(digits, 10);
  if (!itunesEp) return whole;

  // Longest prefix first, so `2298` is tried before `229`.
  for (let len = digits.length; len >= 1; len--) {
    const candidate = Number.parseInt(digits.slice(0, len), 10);
    if (Math.abs(candidate - itunesEp) <= tolerance) return candidate;
  }
  return whole;
}
| 116 | + |
| 117 | +// --- HTML body -> markdown ------------------------------------------------- |
| 118 | + |
// Convert an HTML fragment to simple markdown using ordered regex rewrites.
//
// Supported: <ul>/<ol>/<li> (every item becomes a `- ` bullet), <a href>,
// <strong>/<b>, <em>/<i>, <br>, and <p>/<div>/<h1>-<h6> as paragraph breaks.
// Any other tag is removed, leaving its text. Entities are decoded only after
// the tags are gone, so encoded markup such as `&lt;b&gt;` survives as text.
// Returns the trimmed markdown with at most one blank line in a row.
function htmlToMarkdown(html) {
  let md = html;

  // Lists
  md = md.replace(/<\/?ul[^>]*>/gi, '\n');
  md = md.replace(/<\/?ol[^>]*>/gi, '\n');
  md = md.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_, text) => `- ${text.trim()}\n`);

  // Links, emphasis
  md = md.replace(/<a\b[^>]*href=["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');
  md = md.replace(/<(strong|b)\b[^>]*>([\s\S]*?)<\/\1>/gi, '**$2**');
  md = md.replace(/<(em|i)\b[^>]*>([\s\S]*?)<\/\1>/gi, '*$2*');

  // Block + break tags. Closing <div>/<hN> also end a paragraph; otherwise the
  // generic tag strip below would fuse adjacent blocks into one line.
  md = md.replace(/<br\s*\/?>/gi, '\n');
  md = md.replace(/<\/(?:p|div|h[1-6])>/gi, '\n\n');
  md = md.replace(/<p[^>]*>/gi, '');

  // Strip anything left, decode, tidy
  md = md.replace(/<\/?[a-zA-Z][^>]*>/g, '');
  md = decodeEntities(md);

  md = md
    .split('\n')
    // Normalise non-breaking spaces, then drop leading/trailing whitespace.
    .map((line) => line.replace(/\u00a0/g, ' ').trim())
    .join('\n');
  md = md.replace(/\n{3,}/g, '\n\n');
  // Tighten lists: drop blank lines between consecutive bullets.
  md = md.replace(/(^- .*)\n\n(?=- )/gm, '$1\n');
  return md.trim();
}
| 150 | + |
// Remove every line of `md` that matches a BOILERPLATE pattern, then drop a
// trailing "Resource Links" header (and blank lines) left empty by that.
//
// The header test ignores markdown emphasis/heading markers, so
// `**Resource Links:**` and `## Resource Links` are recognised as well as the
// bare text. A header that still has bullets under it is kept.
// Returns the remaining markdown, trimmed, with blank-line runs collapsed.
function stripBoilerplate(md) {
  const kept = md
    .split('\n')
    .filter((line) => !BOILERPLATE.some((re) => re.test(line)));

  // Drop a trailing "Resource Links" header left empty after stripping.
  while (kept.length > 0) {
    const last = kept[kept.length - 1].replace(/[*_#]/g, '').trim();
    if (last !== '' && !/^resource links?:?$/i.test(last)) break;
    kept.pop();
  }
  return kept.join('\n').replace(/\n{3,}/g, '\n\n').trim();
}
| 164 | + |
// --- Conservative guest extraction -----------------------------------------
// Ported from scripts/update-podcast-authors.py. High-confidence only; a missed
// guest degrades to no byline, a wrong guest would be a bad byline (ADR 0003).

// Leading words that mark a phrase as something other than a personal name.
const NOT_A_PERSON = /^(?:The|A|An|In|This|For|With|From|On|PowerShell|Microsoft|Windows|Azure|DevOps|Cloud|GitHub|AWS|Episode|Podcast|Session|Roundtable|Community|Tonight|Join|Listen|Watch|Learn|Get|Set|Bar|Summit|Meet|Introducing|Featuring|Welcome|Just)\b/i;

// Substrings that disqualify a candidate wherever they occur (case-insensitive).
const NAME_BLOCKLIST = ['http', 'www.', 'episode', 'podcast', 'powershell', 'microsoft', 'summit'];

// Heuristic: does `name` look like a person's full name?
//
// Requires at least 4 characters, at least two whitespace-separated words, at
// least two words starting with an uppercase letter (A-Z or U+00C0-U+024F), no
// NOT_A_PERSON leading word, no all-caps word, and no blocklisted substring.
// Returns a boolean.
function looksLikePerson(name) {
  if (!name || name.length < 4) return false;
  if (NOT_A_PERSON.test(name)) return false;

  const words = name.split(/\s+/);
  if (words.length < 2) return false;

  const capitalised = words.filter((w) => /^[A-Z\u00C0-\u024F]/.test(w));
  if (capitalised.length < 2) return false;

  // Reject emphasis words (ALSO, PLUS) but allow initials like "B."
  const isShouted = (w) => w.length > 1 && w === w.toUpperCase() && !w.endsWith('.');
  if (words.some(isShouted)) return false;

  const low = name.toLowerCase();
  return !NAME_BLOCKLIST.some((b) => low.includes(b));
}
| 182 | + |
// Normalise a raw name candidate to plain text.
//
// In order: unwrap markdown links to their label, drop HTML tags and markdown
// emphasis characters, trim, remove one surrounding quote on each side, strip
// trailing punctuation, and strip a leading "MVP"/"MVPs"/"Dr."/"Prof." prefix.
// Trimming happens before the anchored steps so that surrounding whitespace
// cannot shield a prefix or trailing punctuation from removal.
function cleanName(raw) {
  return raw
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    .replace(/<[^>]+>/g, '')
    .replace(/[*_`]/g, '')
    .trim()
    .replace(/^["']|["']$/g, '')
    .replace(/[.,;:!?]+$/g, '')
    .replace(/^(?:MVPs?\s+|Dr\.\s+|Prof\.\s+)/i, '')
    .trim();
}
| 193 | + |
// "...with Name", "...with Name1 and Name2"
//
// Takes the text after the first "with" in `title`, cuts it at a subtitle
// separator (a spaced hyphen, an en/em dash, or "!"), splits on "and"/"&", and
// keeps up to three leading capitalised words per part. A part needs at least
// two such words and must pass cleanName + looksLikePerson.
// A hyphen inside a word is NOT a separator, so "Mary Smith-Jones" stays whole
// instead of being truncated to the wrong name "Mary Smith".
// Returns the de-duplicated names in title order ([] when none qualify).
function guestsFromTitle(title) {
  const m = title.match(/\bwith\s+(.+)$/i);
  if (!m) return [];

  const tail = m[1]
    .replace(/^(?:MVPs?\s+|special\s+guest\s+host\s+)/i, '')
    .replace(/\s+-(?:\s.*)?$|\s*[\u2013\u2014].*$/, '')
    .replace(/!.*$/, '');

  const out = [];
  for (const rawPart of tail.split(/\s+(?:and|&)\s+/i)) {
    const part = rawPart.replace(/[,:(\[].*$/, '').trim();

    const words = [];
    for (const w of part.split(/\s+/).slice(0, 3)) {
      const word = w.replace(/[!?.]+$/, '');
      // Stop at the first word that is not capitalised.
      if (!word || !/^[A-Z\u00C0-\u024F]/.test(word)) break;
      words.push(word);
    }
    if (words.length < 2) continue;

    const name = cleanName(words.join(' '));
    if (looksLikePerson(name) && !out.includes(name)) out.push(name);
  }
  return out;
}
| 218 | + |
// "Guest Bio:" / "Bio:" header followed by "Name is/works/..." paragraph.
//
// Finds the header (preferring one that sits on its own line, else any
// `Guest Bio:` / `Bio:` at a word boundary), then inspects the first two
// paragraphs after it. A paragraph contributes a name when it starts with
// 2-4 capitalised words (optional middle initial, optional Dr./Prof. prefix)
// followed by a comma or one of the listed verbs, and the cleaned name passes
// looksLikePerson.
// Returns the de-duplicated names in document order ([] when none qualify).
function guestsFromBio(body) {
  const out = [];
  const hdr =
    body.match(/^(?:Guest Bio|Bio)\s*:?\s*$/im) || body.match(/\b(?:Guest Bio|Bio)\s*:/i);
  if (!hdr) return out;

  // Slice from the matched position; searching for the matched text again
  // could land on an earlier, unrelated occurrence of the same words.
  const after = body.slice(hdr.index + hdr[0].length);

  for (const para of after.split(/\n\s*\n/).slice(0, 2)) {
    const m = para
      .trim()
      .match(/^(?:Dr\.\s+|Prof\.\s+)?([A-Z\u00C0-\u024F][\w'\u00C0-\u024F-]+(?:\s+[A-Z]\.)?(?:\s+[A-Z\u00C0-\u024F][\w'\u00C0-\u024F-]+){1,3})(?=\s+(?:is|was|are|has|works|joined|lives|serves|brings|currently|helps|leads|comes|spent|built|started|founded|created|focuses|specializes|manages|develops|writes|teaches|runs|hosts|holds|received|earned|wrote|published|blogs)\b|\s*,)/);
    if (!m) continue;
    const name = cleanName(m[1]);
    if (looksLikePerson(name) && !out.includes(name)) out.push(name);
  }
  return out;
}
| 236 | + |
// Collect guest names for an episode: names found in the body's bio section
// first, then names found in the title, de-duplicated, with the host removed.
// Returns an array of names (possibly empty).
function extractGuests(title, body) {
  const candidates = [...guestsFromBio(body), ...guestsFromTitle(title)];
  // A Set keeps first-seen order while dropping repeats.
  return [...new Set(candidates)].filter((name) => name !== HOST);
}
| 244 | + |
| 245 | +// --- YAML ------------------------------------------------------------------ |
| 246 | + |
// Plain scalar unless the value would break YAML (matches the existing files,
// which leave titles with parens/apostrophes/& unquoted).
//
// The value is double-quoted (with `\`, `"`, newline, CR and tab escaped) when
// it is empty, has leading/trailing whitespace or control characters, contains
// ": " or "# ", ends in ":" or "#", contains " #" (YAML would read the rest
// as a comment, e.g. "Episode #100"), starts with an indicator character or
// with "- " / "? ", or would be read back as a boolean, null or number.
// Returns the text to place after `key: `.
function yamlScalar(v) {
  const s = String(v);
  const needsQuotes =
    s === '' ||
    s !== s.trim() ||
    /[\r\n\t]/.test(s) ||
    /[:#]\s/.test(s) ||
    /[:#]$/.test(s) ||
    /\s#/.test(s) ||
    /^[\s>|*&!%@`"'\[\]{},]/.test(s) ||
    /^[-?](?:\s|$)/.test(s) ||
    /^(?:true|false|yes|no|on|off|null|~)$/i.test(s) ||
    !Number.isNaN(Number(s));

  if (!needsQuotes) return s;

  const escaped = s
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\n/g, '\\n')
    .replace(/\r/g, '\\r')
    .replace(/\t/g, '\\t');
  return `"${escaped}"`;
}
| 256 | + |
| 257 | +// --- Existing-file index --------------------------------------------------- |
| 258 | + |
// Scan PODCAST_DIR and collect the idempotency keys of existing episodes.
//
// Only `.md` files other than `_index.md` that mention `mcdn.podbean.com` are
// considered (modern Podbean episodes). From each, the frontmatter-style lines
// `podcast_url:`, `guid:` and `episode:` are read when present.
// Returns { urls: Set<string>, guids: Set<string>, episodes: Set<number> }.
// Throws whatever fs throws if the directory or a file cannot be read.
function buildIndex() {
  const idx = { urls: new Set(), guids: new Set(), episodes: new Set() };

  for (const file of fs.readdirSync(PODCAST_DIR)) {
    if (!file.endsWith('.md') || file === '_index.md') continue;

    const text = fs.readFileSync(path.join(PODCAST_DIR, file), 'utf-8');
    if (!text.includes('mcdn.podbean.com')) continue; // modern (Podbean) episodes only

    // Values may or may not be wrapped in double quotes.
    const url = text.match(/^podcast_url:\s*"?([^"\r\n]+)"?/m);
    if (url) idx.urls.add(url[1].trim());

    const guid = text.match(/^guid:\s*"?([^"\r\n]+)"?/m);
    if (guid) idx.guids.add(guid[1].trim());

    const ep = text.match(/^episode:\s*(\d+)/m);
    if (ep) idx.episodes.add(Number.parseInt(ep[1], 10));
  }
  return idx;
}
| 274 | + |
| 275 | +// --- Build one episode file ------------------------------------------------ |
| 276 | + |
// Build the filename and file content for one feed <item>.
//
// item: XML text of a single <item> element.
// Returns null when the item cannot produce a usable file: no enclosure URL,
// no title, or a pubDate that does not parse (which would otherwise yield a
// `NaN-NaN-NaN-...` filename). Otherwise returns
//   { filename, content, episode, enclosure, guid, title }
// where `content` is YAML frontmatter followed by the markdown body, and
// `episode` is a number or null.
function buildEpisode(item) {
  const enclosure = attr(item, 'enclosure', 'url').trim();
  if (!enclosure) return null;

  const rawTitle = tag(item, 'title').trim();
  const guidRaw = tag(item, 'guid').trim();
  const pubDate = tag(item, 'pubDate').trim();
  if (!rawTitle || Number.isNaN(new Date(pubDate).getTime())) return null;

  const itunesEp = Number.parseInt(tag(item, 'itunes:episode'), 10) || null;
  const contentHtml = tag(item, 'content:encoded') || tag(item, 'description');

  const episode = resolveEpisode(enclosure, itunesEp);
  const { date, iso, y, m } = parseDate(pubDate);
  const prefixSlug = TITLE_PREFIX.toLowerCase().trim().replace(/\s+/g, '-');
  const slug = `${prefixSlug}-${slugify(rawTitle)}`;
  const filename = `${date}-${slug}.md`;

  // Lift the video id from the raw HTML; stripBoilerplate removes the
  // youtu.be line from the body afterwards.
  const ytMatch = contentHtml.match(/youtu\.be\/([A-Za-z0-9_-]{6,})/);
  const youtube = ytMatch ? ytMatch[1] : null;

  const body = stripBoilerplate(htmlToMarkdown(contentHtml));
  const guests = extractGuests(rawTitle, body);
  const authors = [HOST, ...guests];

  const fm = [
    `title: ${yamlScalar(TITLE_PREFIX + rawTitle)}`,
    `author: ${yamlScalar(HOST)}`,
    'authors:',
    ...authors.map((a) => ` - ${yamlScalar(a)}`),
    `date: "${iso}"`,
    `podcast_url: "${enclosure}"`,
  ];
  if (episode != null) fm.push(`episode: ${episode}`);
  if (youtube) fm.push(`youtube: ${yamlScalar(youtube)}`);
  if (guidRaw) fm.push(`guid: ${yamlScalar(guidRaw)}`);
  fm.push('aliases:');
  fm.push(` - /${y}/${m}/${slug}/`);

  const content = `---\n${fm.join('\n')}\n---\n\n${body}\n`;
  return { filename, content, episode, enclosure, guid: guidRaw, title: rawTitle };
}
| 315 | + |
| 316 | +// --- Main ------------------------------------------------------------------ |
| 317 | + |
// Fetch the feed, work out which items have no matching file yet, and write
// one new file per missing episode (or only list them under --dry-run).
//
// An item is skipped when its enclosure URL, guid or episode number is already
// indexed, when its target file exists, or when an earlier item in this run
// already claimed the same filename. Files are created with the exclusive
// `wx` flag, so an existing file is never overwritten.
// Throws when the HTTP request fails, when the response contains no <item>
// elements (treated as a broken feed rather than "nothing to do"), or when a
// filesystem call fails.
async function main() {
  console.log(`Fetching ${FEED_URL} ...`);
  const res = await fetch(FEED_URL);
  if (!res.ok) throw new Error(`Feed fetch failed: ${res.status} ${res.statusText}`);
  const xml = await res.text();

  const items = xml.match(/<item\b[^>]*>[\s\S]*?<\/item>/g) || [];
  console.log(`Feed items: ${items.length}`);
  if (items.length === 0) {
    throw new Error('Feed contained no <item> elements; the response is not a usable feed.');
  }

  const idx = buildIndex();
  console.log(`Existing modern episodes indexed: urls=${idx.urls.size} guids=${idx.guids.size} episodes=${idx.episodes.size}`);

  let skipped = 0;
  const writes = [];
  const claimedFilenames = new Set();

  for (const item of items) {
    const ep = buildEpisode(item);
    if (!ep) { skipped++; continue; }

    const exists =
      idx.urls.has(ep.enclosure) ||
      (ep.guid && idx.guids.has(ep.guid)) ||
      (ep.episode != null && idx.episodes.has(ep.episode));
    if (exists) { skipped++; continue; }

    const dest = path.join(PODCAST_DIR, ep.filename);
    if (claimedFilenames.has(ep.filename) || fs.existsSync(dest)) { skipped++; continue; }

    writes.push(ep);
    // Reserve keys so a duplicate item in the same run can't double-write.
    claimedFilenames.add(ep.filename);
    idx.urls.add(ep.enclosure);
    if (ep.guid) idx.guids.add(ep.guid);
    if (ep.episode != null) idx.episodes.add(ep.episode);
  }

  writes.sort((a, b) => a.filename.localeCompare(b.filename));
  for (const ep of writes) {
    console.log(`${DRY_RUN ? '[dry-run] ' : ''}+ ${ep.filename} (episode ${ep.episode ?? '?'})`);
    if (!DRY_RUN) {
      fs.writeFileSync(path.join(PODCAST_DIR, ep.filename), ep.content, {
        encoding: 'utf-8',
        flag: 'wx',
      });
    }
  }
  const added = writes.length;

  console.log(`\n${DRY_RUN ? 'Would add' : 'Added'}: ${added} Skipped (already present): ${skipped}`);
}
| 363 | + |
// Entry point: run the sync; on any failure log it and exit with status 1.
main().catch((err) => {
  console.error('Sync failed:', err);
  process.exit(1);
});
0 commit comments