Skip to content

Commit 9fcc7d7

Browse files
HeyItsGilbert and claude committed
feat(podcast): add automated RSS feed sync + backfill eps 221-234
Add an add-only sync that generates episode files from the Podbean feed, plus a weekly scheduled Action that auto-commits new episodes (mirroring discourse-sync.yml). The first run backfills the gap between the repo (ep 220) and the feed (ep 234). The feed turned out to carry the full archive rather than a 10-item window, so idempotency is keyed on the enclosure URL (every existing file has it) and the episode number is parsed from the Podbean filename, since itunes:episode runs one ahead of the repo convention. This makes the separate WS3 scrape unnecessary; ADR 0003 and the plan are updated to record the corrected design. Guest extraction stays conservative (high-confidence only) because the Action commits without review. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01Qv5XTRCp9s9cH6SikDkrnS
1 parent da51c9f commit 9fcc7d7

18 files changed

Lines changed: 939 additions & 8 deletions

File tree

Lines changed: 367 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,367 @@
1+
// .github/scripts/sync-podcast-feed.js
2+
// Incremental sync of The PowerShell Podcast (Podbean) into content/podcast/.
3+
//
4+
// Design: see docs/adr/0003-incremental-podcast-sync.md.
5+
// The feed currently carries the full archive, but this script is strictly
6+
// ADD-ONLY: it generates an episode file only when no existing file matches,
7+
// and never edits or deletes. That makes a full pass over the feed safe and
8+
// lets the first run backfill the whole gap between the repo and the feed.
9+
//
10+
// Idempotency key (in priority order): enclosure URL, then RSS guid, then
11+
// episode number. The enclosure URL is the universal key — every existing
12+
// modern file carries it as `podcast_url`.
13+
//
14+
// Run `node .github/scripts/sync-podcast-feed.js --dry-run` to preview.
15+
16+
import fetch from 'node-fetch';
17+
import fs from 'fs';
18+
import path from 'path';
19+
20+
// RSS feed that is fetched and synced into the repo.
const FEED_URL = 'https://feed.podbean.com/powershellpodcast/feed.xml';
// Directory holding the episode markdown files (relative path).
const PODCAST_DIR = path.join('content', 'podcast');
// Host name: written as `author` and as the first `authors` entry.
const HOST = 'Andrew Pla';
// Prepended to every feed title; the trailing space is intentional.
const TITLE_PREFIX = 'The PowerShell Podcast ';
// True when `--dry-run` is on the command line: log only, write nothing.
const DRY_RUN = process.argv.includes('--dry-run');
25+
26+
// Recurring boilerplate links lifted/dropped from episode bodies. A body line
// (typically a "Resource Links" bullet) is dropped if it contains any of these.
// Keep this list tight — episode-specific links must survive. youtu.be is
// dropped because the id is lifted into the `youtube` frontmatter field.
// None of the patterns uses the `g` flag, so `.test()` stays stateless.
const BOILERPLATE = [
  /andrewpla\.tech/i,
  /discord\.gg\/pdq/i,
  /powershellsummit\.org/i,
  /youtu\.be\//i,
];
36+
37+
// --- XML helpers -----------------------------------------------------------
38+
39+
// Decode the handful of XML/HTML entities the feed uses: hex and decimal
// numeric references plus &apos; &quot; &lt; &gt; &nbsp; &amp;.
//
// Everything is decoded in ONE pass so the output of one replacement is never
// re-scanned (chained replaces turned `&#38;lt;` into `<` instead of `&lt;`).
// Numeric references above U+10FFFF are left untouched: String.fromCodePoint
// throws a RangeError for them, and one malformed entity must not abort the
// whole sync.
function decodeEntities(str) {
  const named = { apos: "'", quot: '"', lt: '<', gt: '>', nbsp: ' ', amp: '&' };
  const entity = /&(?:#x([0-9a-fA-F]+)|#(\d+)|(apos|quot|lt|gt|nbsp|amp));/g;

  return str.replace(entity, (whole, hex, dec, name) => {
    if (name !== undefined) return named[name];
    const code = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
    return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
  });
}
50+
51+
// Read the text of a single child tag from an <item> block.
//   item - raw XML of one <item>
//   name - tag name, e.g. 'title' or 'itunes:episode' (matched case-insensitively)
// Returns '' when the tag is absent. A value that is exactly one CDATA section
// is returned verbatim (no entity decoding); anything else is entity-decoded.
function tag(item, name) {
  // Escape regex metacharacters so the name only ever matches literally.
  const safeName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const found = item.match(
    new RegExp(`<${safeName}(?:\\s[^>]*)?>([\\s\\S]*?)</${safeName}>`, 'i')
  );
  if (!found) return '';

  const inner = found[1].trim();
  const cdata = inner.match(/^<!\[CDATA\[([\s\S]*?)\]\]>$/);
  return cdata ? cdata[1] : decodeEntities(inner);
}
60+
61+
// Read one attribute value from the first `<tagName ...>` in an <item> block.
// Returns '' when the tag or attribute is missing. The value is returned raw
// (entities are NOT decoded).
//
// The attribute must be preceded by whitespace, so `data-url=` can no longer
// satisfy a lookup for `url`, and the closing quote must be the same character
// as the opening one, so a value such as "it's.png" is not truncated.
function attr(item, tagName, attrName) {
  const pattern = new RegExp(
    `<${tagName}\\b[^>]*?\\s${attrName}\\s*=\\s*(["'])(.*?)\\1`,
    'i'
  );
  const found = item.match(pattern);
  return found ? found[2] : '';
}
65+
66+
// --- Slug + date -----------------------------------------------------------
67+
68+
// Mirrors Hugo's default slugify closely enough to match the existing modern
// filenames: lowercase, drop ASCII apostrophes, collapse every run of other
// non-[a-z0-9] characters into a single hyphen, then trim leading/trailing
// hyphens. Returns '' for input with no ASCII letters or digits.
function slugify(s) {
  const lowered = s.toLowerCase();
  const withoutApostrophes = lowered.replace(/[']/g, '');
  const hyphenated = withoutApostrophes.replace(/[^a-z0-9]+/g, '-');
  return hyphenated.replace(/^-+|-+$/g, '');
}
78+
79+
// Left-pad a value with zeros to at least two characters (3 -> '03').
// Values that are already two or more characters long are returned unchanged.
function pad2(n) {
  const text = String(n);
  return text.padStart(2, '0');
}
82+
83+
// pubDate -> { date: 'YYYY-MM-DD', iso: 'YYYY-MM-DDTHH:MM:SS+00:00', y, m }
// All fields are computed in UTC. `y` is the numeric year; `m` is the
// zero-padded month string. No validation happens here: for a string that
// `Date` cannot parse the getters return NaN, so `y` is NaN and the strings
// contain "NaN" — callers that care must check.
function parseDate(pubDate) {
  const d = new Date(pubDate);
  const y = d.getUTCFullYear();
  const m = pad2(d.getUTCMonth() + 1);
  const date = `${y}-${m}-${pad2(d.getUTCDate())}`;
  const time = [d.getUTCHours(), d.getUTCMinutes(), d.getUTCSeconds()]
    .map((part) => pad2(part))
    .join(':');
  return { date, iso: `${date}T${time}+00:00`, y, m };
}
92+
93+
// --- Episode number from enclosure filename --------------------------------
94+
95+
// The repo's episode-number convention is the number embedded in the Podbean
// filename (`..._episode_NNN_...`), NOT itunes:episode (which runs one ahead).
// Most filenames delimit the number (`episode_220_Morten`), but specials glue a
// random suffix onto it (`episode_2298xv9d`). For the ambiguous case, take the
// longest digit-prefix that lies within 2 of itunes:episode.
//
//   enclosureUrl - enclosure URL containing the filename
//   itunesEp     - itunes:episode as a number, or null/0 when unavailable
// Returns the episode number, or null when the URL has no `episode_<digits>`.
function resolveEpisode(enclosureUrl, itunesEp) {
  // Unambiguous: digits followed by a delimiter.
  const delimited = enclosureUrl.match(/episode[_-](\d+)[_-]/i);
  if (delimited) return parseInt(delimited[1], 10);

  const glued = enclosureUrl.match(/episode[_-](\d+)/i);
  if (!glued) return null;

  const digits = glued[1];
  const wholeRun = parseInt(digits, 10);
  // Without a hint there is nothing to disambiguate against.
  if (!itunesEp) return wholeRun;

  // Try progressively shorter prefixes, longest first.
  for (let len = digits.length; len >= 1; len--) {
    const candidate = parseInt(digits.slice(0, len), 10);
    if (Math.abs(candidate - itunesEp) <= 2) return candidate;
  }
  // No prefix is near the hint: fall back to the full digit run.
  return wholeRun;
}
116+
117+
// --- HTML body -> markdown -------------------------------------------------
118+
119+
// Convert the feed's HTML episode body into markdown with a fixed, ordered
// list of regex rewrites (lists, links, emphasis, breaks/paragraphs), then
// strip any remaining tags, decode entities and tidy whitespace.
// Entities are decoded AFTER tag stripping so that escaped text such as
// `&lt;b&gt;` survives as literal text instead of being removed as a tag.
function htmlToMarkdown(html) {
  const rewrites = [
    // Lists
    [/<\/?ul[^>]*>/gi, '\n'],
    [/<\/?ol[^>]*>/gi, '\n'],
    [/<li[^>]*>([\s\S]*?)<\/li>/gi, (_, text) => `- ${text.trim()}\n`],
    // Links, emphasis
    [/<a\b[^>]*href=["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)'],
    [/<(strong|b)\b[^>]*>([\s\S]*?)<\/\1>/gi, '**$2**'],
    [/<(em|i)\b[^>]*>([\s\S]*?)<\/\1>/gi, '*$2*'],
    // Block + break tags
    [/<br\s*\/?>/gi, '\n'],
    [/<\/p>/gi, '\n\n'],
    [/<p[^>]*>/gi, ''],
    // Strip anything left
    [/<\/?[a-zA-Z][^>]*>/g, ''],
  ];

  let md = rewrites.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    html
  );
  md = decodeEntities(md);

  // Normalise non-breaking spaces (written as an escape so it is visible in
  // the source) and trim each line; trim() also drops trailing spaces/tabs.
  md = md
    .split('\n')
    .map((line) => line.replace(/\u00a0/g, ' ').trim())
    .join('\n');
  md = md.replace(/\n{3,}/g, '\n\n');
  // Tighten lists: drop blank lines between consecutive bullets.
  md = md.replace(/(^- .*)\n\n(?=- )/gm, '$1\n');
  return md.trim();
}
150+
151+
// Remove recurring boilerplate from a markdown body: every line matching any
// BOILERPLATE pattern is dropped, then trailing blank lines and a trailing
// "Resource Links" header left empty by the stripping are removed.
// The header check tolerates markdown emphasis/heading markers, so
// `**Resource Links:**` or `## Resource Links` are recognised as well.
function stripBoilerplate(md) {
  const isBoilerplate = (line) => BOILERPLATE.some((re) => re.test(line));
  const isEmptyHeader = (line) =>
    /^[*_#\s]*resource links?[*_]*:?[*_]*$/i.test(line);

  const kept = md.split('\n').filter((line) => !isBoilerplate(line));

  // Walk back from the end past blank lines and an orphaned header.
  let end = kept.length;
  while (end > 0) {
    const last = kept[end - 1].trim();
    if (last !== '' && !isEmptyHeader(last)) break;
    end--;
  }

  return kept.slice(0, end).join('\n').replace(/\n{3,}/g, '\n\n').trim();
}
164+
165+
// --- Conservative guest extraction -----------------------------------------
166+
// Ported from scripts/update-podcast-authors.py. High-confidence only; a missed
167+
// guest degrades to no byline, a wrong guest would be a bad byline (ADR 0003).
168+
169+
// Leading words that mark a phrase as not being a personal name
// (articles, prepositions, product names, verbs such as "Join"/"Watch").
const NOT_A_PERSON = /^(?:The|A|An|In|This|For|With|From|On|PowerShell|Microsoft|Windows|Azure|DevOps|Cloud|GitHub|AWS|Episode|Podcast|Session|Roundtable|Community|Tonight|Join|Listen|Watch|Learn|Get|Set|Bar|Summit|Meet|Introducing|Featuring|Welcome|Just)\b/i;

// Heuristic: does `name` look like a person's full name?
// Requires at least 4 characters, at least two whitespace-separated words of
// which at least two start with an uppercase A-Z or a letter in U+00C0-U+024F,
// no NOT_A_PERSON lead word, no all-caps word, and none of the blocked
// substrings. Returns a boolean; falsy input returns false.
function looksLikePerson(name) {
  if (!name || name.length < 4) return false;
  if (NOT_A_PERSON.test(name)) return false;

  const words = name.split(/\s+/);
  if (words.length < 2) return false;

  const capitalised = words.filter((w) => /^[A-Z\u00C0-\u024F]/.test(w));
  if (capitalised.length < 2) return false;

  // Reject emphasis words (ALSO, PLUS) but allow initials like "B."
  const isShouted = (w) => w.length > 1 && w === w.toUpperCase() && !w.endsWith('.');
  if (words.some(isShouted)) return false;

  const lowered = name.toLowerCase();
  const blocked = ['http', 'www.', 'episode', 'podcast', 'powershell', 'microsoft', 'summit'];
  return !blocked.some((fragment) => lowered.includes(fragment));
}
182+
183+
// Normalise a raw name candidate: unwrap markdown links to their text, drop
// HTML tags and markdown emphasis characters, then strip one surrounding
// quote, trailing punctuation and a leading MVP/Dr./Prof. title.
// The value is trimmed BEFORE the anchored strips as well as after them;
// otherwise surrounding whitespace keeps `^`/`$` from matching and the
// quote/punctuation/title removal is silently skipped.
function cleanName(raw) {
  return raw
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    .replace(/<[^>]+>/g, '')
    .replace(/[*_`]/g, '')
    .trim()
    .replace(/^["']|["']$/g, '')
    .replace(/[.,;:!?]+$/g, '')
    .replace(/^(?:MVPs?\s+|Dr\.\s+|Prof\.\s+)/i, '')
    .trim();
}
193+
194+
// "...with Name", "...with Name1 and Name2"
195+
// Extract guest names from a title of the form "... with Name" or
// "... with Name1 and Name2". Only the text after the first "with" is
// considered; anything from the first hyphen or "!" onward is discarded.
// Each "and"/"&"-separated part contributes at most its first three
// consecutive capitalised words, and needs at least two of them.
// Returns a de-duplicated array (possibly empty) of names that pass
// looksLikePerson.
function guestsFromTitle(title) {
  const withClause = title.match(/\bwith\s+(.+)$/i);
  if (!withClause) return [];

  const tail = withClause[1]
    .replace(/^(?:MVPs?\s+|special\s+guest\s+host\s+)/i, '')
    .replace(/\s*[-].*$/, '')
    .replace(/!.*$/, '');

  const guests = [];
  for (const rawPart of tail.split(/\s+(?:and|&)\s+/i)) {
    // Cut trailing qualifiers such as ", MVP" or "(Part 2)".
    const part = rawPart.replace(/[,:(\[].*$/, '').trim();

    const words = [];
    for (const word of part.split(/\s+/).slice(0, 3)) {
      const bare = word.replace(/[!?.]+$/, '');
      // Stop at the first word that is empty or not capitalised.
      if (!bare || !/^[A-Z\u00C0-\u024F]/.test(bare)) break;
      words.push(bare);
    }
    if (words.length < 2) continue;

    const name = cleanName(words.join(' '));
    if (looksLikePerson(name) && !guests.includes(name)) guests.push(name);
  }
  return guests;
}
218+
219+
// "Guest Bio:" / "Bio:" header followed by "Name is/works/..." paragraph.
220+
// Extract guest names from a "Guest Bio" / "Bio" section of the body.
// Looks for a header line (or, failing that, an inline "Bio:"), then checks
// the first two blank-line-separated paragraphs after it for a leading
// capitalised name followed by a bio verb ("is", "works", ...) or a comma.
// Returns a de-duplicated array (possibly empty) of names that pass
// looksLikePerson.
function guestsFromBio(body) {
  const header =
    body.match(/^(?:Guest Bio|Bio)\s*:?\s*$/im) || body.match(/(?:Guest Bio|Bio)\s*:/i);
  if (!header) return [];

  // Slice from the position of the actual match. Searching again with
  // indexOf(header[0]) could land on an earlier occurrence of the same text.
  const afterHeader = body.slice(header.index + header[0].length);

  const leadingName = /^(?:Dr\.\s+|Prof\.\s+)?([A-Z\u00C0-\u024F][\w'\u00C0-\u024F-]+(?:\s+[A-Z]\.)?(?:\s+[A-Z\u00C0-\u024F][\w'\u00C0-\u024F-]+){1,3})(?=\s+(?:is|was|are|has|works|joined|lives|serves|brings|currently|helps|leads|comes|spent|built|started|founded|created|focuses|specializes|manages|develops|writes|teaches|runs|hosts|holds|received|earned|wrote|published|blogs)\b|\s*,)/;

  const guests = [];
  for (const para of afterHeader.split(/\n\s*\n/).slice(0, 2)) {
    const found = para.trim().match(leadingName);
    if (!found) continue;
    const name = cleanName(found[1]);
    if (looksLikePerson(name) && !guests.includes(name)) guests.push(name);
  }
  return guests;
}
236+
237+
// Combine both guest heuristics into one ordered, de-duplicated list:
// names from the bio come first, then names from the title. The host is
// excluded.
function extractGuests(title, body) {
  const candidates = [...guestsFromBio(body), ...guestsFromTitle(title)];
  // A Set keeps first-seen order, so the bio-before-title order is preserved.
  return [...new Set(candidates)].filter((guest) => guest !== HOST);
}
244+
245+
// --- YAML ------------------------------------------------------------------
246+
247+
// Plain scalar unless the value would break YAML (matches the existing files,
// which leave titles with parens/apostrophes/& unquoted). Otherwise a
// double-quoted scalar with backslash, quote and line-break escapes.
//
// Quoted cases: empty string; ": " / "# " or a trailing ":" / "#"; a "#" at
// the start or after whitespace (YAML would read the rest as a comment and
// silently truncate e.g. "Episode #5"); a leading indicator character; a
// leading "- " or "? "; trailing whitespace; line breaks or tabs; and string
// values YAML would resolve to null/boolean/number ("true", "42").
// Non-string inputs (e.g. a real number) are never quoted for looking numeric.
function yamlScalar(v) {
  const s = String(v);
  const resolvesToNonString =
    typeof v === 'string' &&
    (/^(?:~|null|true|false|yes|no|on|off)$/i.test(s) || !Number.isNaN(Number(s)));

  const needsQuotes =
    s === '' ||
    /[:#]\s/.test(s) ||
    /[:#]$/.test(s) ||
    /\s#/.test(s) ||
    /^[\s>|*&!%@`"'\[\]{},#]/.test(s) ||
    /^[-?](?:\s|$)/.test(s) ||
    /\s$/.test(s) ||
    /[\n\r\t]/.test(s) ||
    resolvesToNonString;

  if (!needsQuotes) return s;

  const escaped = s
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\n/g, '\\n')
    .replace(/\r/g, '\\r')
    .replace(/\t/g, '\\t');
  return `"${escaped}"`;
}
256+
257+
// --- Existing-file index ---------------------------------------------------
258+
259+
// Scan PODCAST_DIR and collect the idempotency keys of existing episodes.
// Only `.md` files other than `_index.md` that mention `mcdn.podbean.com`
// (modern Podbean episodes) are indexed.
// Returns { urls, guids, episodes }: Sets of `podcast_url` strings, `guid`
// strings and `episode` numbers read from line-anchored `key: value` matches.
// Throws whatever fs throws if the directory or a file cannot be read.
function buildIndex() {
  const idx = { urls: new Set(), guids: new Set(), episodes: new Set() };

  // The patterns below already exclude double quotes from the capture; this
  // additionally unwraps a single-quoted YAML value so it still matches the
  // raw feed value. NOTE(review): no single-quoted file is known to exist.
  const unquote = (raw) => raw.trim().replace(/^'(.*)'$/, '$1');

  const episodeFiles = fs
    .readdirSync(PODCAST_DIR)
    .filter((file) => file.endsWith('.md') && file !== '_index.md');

  for (const file of episodeFiles) {
    const text = fs.readFileSync(path.join(PODCAST_DIR, file), 'utf-8');
    if (!text.includes('mcdn.podbean.com')) continue; // modern (Podbean) episodes only

    const url = text.match(/^podcast_url:\s*"?([^"\r\n]+)"?/m);
    if (url) idx.urls.add(unquote(url[1]));

    const guid = text.match(/^guid:\s*"?([^"\r\n]+)"?/m);
    if (guid) idx.guids.add(unquote(guid[1]));

    const ep = text.match(/^episode:\s*(\d+)/m);
    if (ep) idx.episodes.add(parseInt(ep[1], 10));
  }
  return idx;
}
274+
275+
// --- Build one episode file ------------------------------------------------
276+
277+
// Turn one feed <item> into an episode file description.
// Returns { filename, content, episode, enclosure, guid, title }, or null when
// the item cannot be used: it has no enclosure URL, or its pubDate cannot be
// parsed. Without the date check such an item would be written as a
// `NaN-NaN-NaN-...md` file with a NaN date and alias.
function buildEpisode(item) {
  const enclosure = attr(item, 'enclosure', 'url').trim();
  if (!enclosure) return null;

  const rawTitle = tag(item, 'title').trim();
  const guidRaw = tag(item, 'guid').trim();
  const pubDate = tag(item, 'pubDate').trim();
  const itunesEp = parseInt(tag(item, 'itunes:episode'), 10) || null;
  const contentHtml = tag(item, 'content:encoded') || tag(item, 'description');

  const { date, iso, y, m } = parseDate(pubDate);
  if (Number.isNaN(y)) {
    console.warn(`Skipping "${rawTitle}": unparseable pubDate "${pubDate}"`);
    return null;
  }

  const episode = resolveEpisode(enclosure, itunesEp);
  const prefixSlug = TITLE_PREFIX.toLowerCase().trim().replace(/\s+/g, '-');
  const slug = `${prefixSlug}-${slugify(rawTitle)}`;
  const filename = `${date}-${slug}.md`;

  // The YouTube id is lifted into frontmatter; the link itself is removed
  // from the body by stripBoilerplate.
  const ytMatch = contentHtml.match(/youtu\.be\/([A-Za-z0-9_-]{6,})/);
  const youtube = ytMatch ? ytMatch[1] : null;

  const body = stripBoilerplate(htmlToMarkdown(contentHtml));
  const authors = [HOST, ...extractGuests(rawTitle, body)];

  const fm = [
    `title: ${yamlScalar(TITLE_PREFIX + rawTitle)}`,
    `author: ${yamlScalar(HOST)}`,
    'authors:',
    ...authors.map((a) => ` - ${yamlScalar(a)}`),
    `date: "${iso}"`,
    `podcast_url: "${enclosure}"`,
    ...(episode != null ? [`episode: ${episode}`] : []),
    ...(youtube ? [`youtube: ${youtube}`] : []),
    ...(guidRaw ? [`guid: ${yamlScalar(guidRaw)}`] : []),
    'aliases:',
    ` - /${y}/${m}/${slug}/`,
  ];

  const content = `---\n${fm.join('\n')}\n---\n\n${body}\n`;
  return { filename, content, episode, enclosure, guid: guidRaw, title: rawTitle };
}
315+
316+
// --- Main ------------------------------------------------------------------
317+
318+
// Fetch the feed, work out which items have no matching file yet, and write
// one markdown file per new episode (or only log them with --dry-run).
// Add-only: an item is skipped when its enclosure URL, guid or episode number
// is already indexed, or when the destination filename already exists.
// Throws if the feed request fails; file-system errors propagate.
async function main() {
  console.log(`Fetching ${FEED_URL} ...`);
  const res = await fetch(FEED_URL);
  if (!res.ok) throw new Error(`Feed fetch failed: ${res.status} ${res.statusText}`);
  const xml = await res.text();

  // Accept `<item ...>` with attributes as well as a bare `<item>`.
  const items = xml.match(/<item(?:\s[^>]*)?>[\s\S]*?<\/item>/g) || [];
  console.log(`Feed items: ${items.length}`);
  if (items.length === 0) {
    console.warn('Warning: feed contained no <item> elements; nothing to sync.');
  }

  const idx = buildIndex();
  console.log(`Existing modern episodes indexed: urls=${idx.urls.size} guids=${idx.guids.size} episodes=${idx.episodes.size}`);

  let skipped = 0;
  const writes = [];

  for (const item of items) {
    const ep = buildEpisode(item);
    if (!ep) { skipped++; continue; }

    const alreadyIndexed =
      idx.urls.has(ep.enclosure) ||
      (ep.guid && idx.guids.has(ep.guid)) ||
      (ep.episode != null && idx.episodes.has(ep.episode));
    if (alreadyIndexed || fs.existsSync(path.join(PODCAST_DIR, ep.filename))) {
      skipped++;
      continue;
    }

    writes.push(ep);
    // Reserve keys so a duplicate item in the same run can't double-write.
    idx.urls.add(ep.enclosure);
    if (ep.guid) idx.guids.add(ep.guid);
    if (ep.episode != null) idx.episodes.add(ep.episode);
  }

  // Filenames start with the date, so this sort orders the writes oldest first.
  writes.sort((a, b) => a.filename.localeCompare(b.filename));

  let added = 0;
  for (const ep of writes) {
    console.log(`${DRY_RUN ? '[dry-run] ' : ''}+ ${ep.filename} (episode ${ep.episode ?? '?'})`);
    if (!DRY_RUN) fs.writeFileSync(path.join(PODCAST_DIR, ep.filename), ep.content, 'utf-8');
    added++;
  }

  console.log(`\n${DRY_RUN ? 'Would add' : 'Added'}: ${added} Skipped (already present): ${skipped}`);
}
363+
364+
// Entry point: run the sync. Any rejection is logged and the process exits
// with status 1.
main().catch((err) => {
  console.error('Sync failed:', err);
  process.exit(1);
});

0 commit comments

Comments
 (0)