Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions scripts/extract-news-metadata.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
*/

import { readFileSync, writeFileSync, readdirSync, mkdirSync } from 'fs';
import { join, dirname } from 'path';
import { join, dirname, relative } from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
Expand Down Expand Up @@ -56,10 +56,22 @@ interface JsonLdArticle {
url?: string;
}

/**
 * Recursively collect the paths of news article HTML files under `dir`.
 *
 * A file is included when its name ends with `.html` and does not start
 * with `index`. Sub-directories are descended into regardless of their name.
 *
 * Directory entries are visited in ascending name order (plain code-unit
 * comparison, not locale-aware) so the returned list is identical on every
 * platform and filesystem; `readdirSync` itself gives no ordering guarantee.
 *
 * @param dir - Directory to scan.
 * @returns Article paths, each built by joining `dir` with the entry names
 *   leading to the file, in deterministic depth-first order.
 * @throws Whatever `readdirSync` throws if a directory cannot be read.
 */
function collectNewsFiles(dir: string): string[] {
  const entries = readdirSync(dir, { withFileTypes: true });
  entries.sort((a: { name: string }, b: { name: string }): number => {
    if (a.name < b.name) return -1;
    return a.name > b.name ? 1 : 0;
  });

  const result: string[] = [];
  for (const entry of entries) {
    const fullPath = join(dir, entry.name);
    if (entry.isDirectory()) {
      // Append one by one: spreading a very large array into push() can
      // exceed the engine's argument-count limit.
      for (const nested of collectNewsFiles(fullPath)) {
        result.push(nested);
      }
    } else if (entry.name.endsWith('.html') && !entry.name.startsWith('index')) {
      result.push(fullPath);
    }
  }
  return result;
}

function extractMetadata(): void {
const newsDir = join(ROOT, 'news');
const files = readdirSync(newsDir)
.filter((f: string) => f.endsWith('.html') && !f.startsWith('index'));
const allFilePaths = collectNewsFiles(newsDir);
const files = allFilePaths.map((fp) => relative(newsDir, fp));

Comment on lines +73 to 75
Copy link

Copilot AI Mar 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

relative(newsDir, fp) will return platform-specific separators (e.g., \ on Windows). Since file and derived slug are persisted into data/news-articles.json and used to represent URL-like paths, this can produce backslash-containing slugs and inconsistent outputs across OSes. Normalize the relative paths to POSIX-style (/) before storing them (and before computing slug).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 338170c. Added sep to the path named imports and changed line 74 to .split(sep).join('/') so file and derived slug always use forward-slash separators in news-articles.json, regardless of OS.

const articles: ArticleMetadata[] = [];

Expand Down
36 changes: 28 additions & 8 deletions scripts/generate-news-indexes/helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -311,29 +311,49 @@ export function extractTags(content: string): string[] {
return tags.slice(0, 4); // Max 4 tags for display
}

/**
 * Collect all article HTML file paths recursively from a directory.
 * Supports date-based subdirectory structure: news/{year}/{month}/article.html
 *
 * Only regular files whose name ends with `.html` and does not start with
 * `index` are returned. Entries are visited in ascending name order (plain
 * code-unit comparison) so the result does not depend on the order in which
 * the filesystem happens to list a directory.
 *
 * @param dir - Directory to scan.
 * @returns Article paths, each built by joining `dir` with the entry names
 *   leading to the file, in deterministic depth-first order.
 * @throws Whatever `fs.readdirSync` throws if a directory cannot be read.
 */
function collectArticleFiles(dir: string): string[] {
  const entries = fs.readdirSync(dir, { withFileTypes: true });
  entries.sort((a: { name: string }, b: { name: string }): number => {
    if (a.name < b.name) return -1;
    return a.name > b.name ? 1 : 0;
  });

  const result: string[] = [];
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      // Append one by one: spreading a very large array into push() can
      // exceed the engine's argument-count limit.
      for (const nested of collectArticleFiles(fullPath)) {
        result.push(nested);
      }
    } else if (entry.isFile() && entry.name.endsWith('.html') && !entry.name.startsWith('index')) {
      result.push(fullPath);
    }
  }
  return result;
}

/**
* Scan news directory and group articles by language.
* Supports date-based subdirectory structure: news/{year}/{month}/article.html
*/
export function scanNewsArticles(): Record<string, NewsArticleMetadata[]> {
console.log('\nπŸ“° Scanning for articles...');

const files: string[] = fs.readdirSync(NEWS_DIR)
.filter((file) => file.endsWith('.html'))
.filter((file) => !file.startsWith('index')); // Exclude index files
const filePaths: string[] = collectArticleFiles(NEWS_DIR);

console.log(` Found ${files.length} article files`);
console.log(` Found ${filePaths.length} article files`);

// Initialize buckets for all 14 supported languages
const articlesByLang: Record<string, NewsArticleMetadata[]> = Object.fromEntries(
Object.keys(LANGUAGES).map((lang) => [lang, []]),
);

files.forEach((file) => {
const filePath: string = path.join(NEWS_DIR, file);
filePaths.forEach((filePath) => {
const metadata: NewsArticleMetadata | null = parseArticleMetadata(filePath);

if (metadata && articlesByLang[metadata.lang]) {
articlesByLang[metadata.lang]!.push(metadata);
if (metadata) {
// Set slug to relative path from NEWS_DIR (e.g., "2026/02/2026-02-13-article-en.html")
metadata.slug = path.relative(NEWS_DIR, filePath).split(path.sep).join('/');

if (articlesByLang[metadata.lang]) {
articlesByLang[metadata.lang]!.push(metadata);
}
}
});

Expand Down
65 changes: 38 additions & 27 deletions scripts/generate-sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ function getFileModTime(filePath: string): string {

/**
* Get news articles with metadata.
* Supports date-based subdirectory structure: news/{year}/{month}/article.html
*/
function getNewsArticles(): ArticleGroup[] {
console.log('πŸ“° Scanning news directory...');
Expand All @@ -144,39 +145,49 @@ function getNewsArticles(): ArticleGroup[] {
return [];
}

const files = fs
.readdirSync(NEWS_DIR)
.filter((file) => file.endsWith('.html') && file !== 'index.html' && !file.startsWith('index_'));

console.log(` Found ${files.length} news articles`);

// Group articles by base slug (without language suffix)
const articles = new Map<string, ArticleGroup>();

files.forEach((file) => {
const match = file.match(/^(.+?)-(en|sv|da|no|fi|de|fr|es|nl|ar|he|ja|ko|zh)\.html$/);
if (match) {
const baseSlug = match[1]!;
const lang = match[2]!;
const filePath = path.join(NEWS_DIR, file);
const fileModTime = getFileModTime(filePath);

if (!articles.has(baseSlug)) {
articles.set(baseSlug, {
baseSlug,
languages: [],
lastmod: fileModTime,
});
} else {
const article = articles.get(baseSlug)!;
if (!article.lastmod || new Date(fileModTime) > new Date(article.lastmod)) {
article.lastmod = fileModTime;
function scanDir(dir: string): void {
const entries = fs.readdirSync(dir, { withFileTypes: true });
for (const entry of entries) {
if (entry.isDirectory()) {
scanDir(path.join(dir, entry.name));
Comment on lines +151 to +155
Copy link

Copilot AI Mar 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The recursive scan inserts into articles in filesystem traversal order (readdirSync), and Array.from(articles.values()) preserves that order. With recursive traversal this can make sitemap output order non-deterministic across platforms/filesystems, causing noisy diffs. Consider sorting entries (e.g., by entry.name) before iterating and/or sorting the final ArticleGroup[] by baseSlug before emitting URLs.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 338170c. readdirSync entries are now sorted by name before iterating in scanDir(), and Array.from(articles.values()) is sorted by baseSlug before being returned — making sitemap output fully deterministic across platforms and filesystems.

} else if (entry.isFile() && entry.name !== 'index.html' && !entry.name.startsWith('index_') && entry.name.endsWith('.html')) {
const file = entry.name;
const match = file.match(/^(.+?)-(en|sv|da|no|fi|de|fr|es|nl|ar|he|ja|ko|zh)\.html$/);
if (match) {
const baseSlug = match[1]!;
const lang = match[2]!;
const filePath = path.join(dir, file);
const fileModTime = getFileModTime(filePath);

// Include subdirectory prefix in baseSlug (e.g., "2026/02/2026-02-13-article")
const relDir = path.relative(NEWS_DIR, dir).split(path.sep).join('/');
const fullBaseSlug = relDir ? `${relDir}/${baseSlug}` : baseSlug;

if (!articles.has(fullBaseSlug)) {
articles.set(fullBaseSlug, {
baseSlug: fullBaseSlug,
languages: [],
lastmod: fileModTime,
});
} else {
const article = articles.get(fullBaseSlug)!;
if (!article.lastmod || new Date(fileModTime) > new Date(article.lastmod)) {
article.lastmod = fileModTime;
}
}

articles.get(fullBaseSlug)!.languages.push(lang);
}
}

articles.get(baseSlug)!.languages.push(lang);
}
});
}

scanDir(NEWS_DIR);

console.log(` Found ${articles.size} news articles`);
Copy link

Copilot AI Mar 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This log line now reports articles.size, which is the number of language-groups (base slugs), not the number of article files. Since the output label still says "news articles", it can be misleading during troubleshooting; consider renaming it to "article groups" or logging both group count and total file count.

Suggested change
console.log(` Found ${articles.size} news articles`);
console.log(` Found ${articles.size} news article groups`);

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in bcb38c2. The log line now reads Found ${articles.size} news article groups to make clear that the count is language-group (base-slug) count, not individual HTML file count.


return Array.from(articles.values());
}
Expand Down
4 changes: 2 additions & 2 deletions tests/sitemap-generation.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,12 @@ describe('Sitemap Generation', () => {
});

it('should include articles in multiple languages', () => {
// Check for language-specific news articles (year-agnostic)
// Check for language-specific news articles (flat or date-based directory structure)
const languages: readonly string[] = ['en', 'sv', 'da', 'no', 'fi', 'de', 'fr', 'es', 'nl', 'ar', 'he', 'ja', 'ko', 'zh'];
const foundLanguages = new Set<string>();

languages.forEach(lang => {
const pattern = new RegExp(`news/\\d{4}-\\d{2}-\\d{2}-.+-${lang}\\.html`);
const pattern = new RegExp(`news/(?:\\d{4}/\\d{2}/)?\\d{4}-\\d{2}-\\d{2}-.+-${lang}\\.html`);
if (pattern.test(sitemapContent)) {
Comment on lines +121 to 127
Copy link

Copilot AI Mar 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The language-specific pattern was broadened for flat + date-based layouts, but the earlier "should include news articles" regex still only matches /news/YYYY-MM-DD-... and will fail once any articles are actually placed under news/YYYY/MM/. Please broaden that earlier assertion in the same way so tests reflect the intended dual-layout support.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in bcb38c2. The "should include news articles" URL regex now accepts both flat (news/YYYY-MM-DD-*) and date-based (news/YYYY/MM/YYYY-MM-DD-*) paths, matching the pattern already used in the language-variant test below it.

foundLanguages.add(lang);
}
Expand Down
23 changes: 16 additions & 7 deletions vite.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,32 @@
import { defineConfig } from 'vite';
import sri from 'vite-plugin-sri-gen';
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

/**
* Auto-discover news article HTML files from the news/ directory.
* Auto-discover news article HTML files from the news/ directory recursively.
* Supports date-based subdirectory structure: news/{year}/{month}/article.html
* This prevents new articles from being excluded from the Vite build
* (and thus missing from S3 deployment).
*/
function discoverNewsArticles() {
const newsDir = new URL('./news', import.meta.url);
const newsDir = fileURLToPath(new URL('./news', import.meta.url));
const entries = {};
if (fs.existsSync(newsDir)) {
for (const file of fs.readdirSync(newsDir)) {
if (file.endsWith('.html') && !file.startsWith('index')) {
const name = file.replace('.html', '');
entries[`news/${name}`] = `./news/${file}`;

function scanDir(dir, relPrefix) {
if (!fs.existsSync(dir)) return;
for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
if (entry.isDirectory()) {
scanDir(path.join(dir, entry.name), relPrefix + entry.name + '/');
} else if (entry.name.endsWith('.html') && !entry.name.startsWith('index')) {
const name = relPrefix + entry.name.replace('.html', '');
entries[`news/${name}`] = `./news/${relPrefix}${entry.name}`;
}
}
}

scanDir(newsDir, '');
return entries;
}

Expand Down
Loading