forrtproject · LukasWallrich · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
@@ -269,6 +269,14 @@ jobs:
           cd bibtex_to_apa
           node bibtex_to_apa.js -o '../content/glossary/apa_lookup.json'
 
+      #========================================
+      # Regenerate the glossary reference list page from apa_lookup.json so the
+      # public "List of References" stays in sync with the dynamic glossary.
+      #========================================
+      - name: Build glossary reference list
+        continue-on-error: true  # Continue even if this step fails
+        run: python3 content/glossary/_build_references.py
+
       #========================================
       # Process and generate glossary files
       #========================================

@@ -41,11 +41,21 @@ function bibtexToApaJson(bibtexContent, includeUrl = true) {
 
   for (const entry of cite.data) {
     const key = entry.id || entry['citation-key'];
-    let ref = new Cite(entry).format('bibliography', {
-      format: 'text',
+    // Render as HTML so journal titles / volumes keep their APA italics, then peel
+    // off citation-js's wrapping <div class="csl-bib-body"><div class="csl-entry">…
+    // so each value is a clean inline HTML fragment.
+    const html = new Cite(entry).format('bibliography', {
+      format: 'html',
       template: 'apa',
       lang: 'en-US'
-    }).trim();
+    });
+    const m = html.match(/class="csl-entry"[^>]*>([\s\S]*?)<\/div>/);
+    let ref = (m ? m[1] : html).trim();
+    // citation-js escapes every "&" as &#38; (in author lists AND inside URLs).
+    // Decode to a literal "&" so it survives Hugo's markdownify cleanly: authors
+    // re-escape to &amp;, and URLs with query strings (e.g. ...&oldid=) link
+    // correctly instead of double-escaping to &amp;#38;.
+    ref = ref.replace(/&#38;/g, '&');
 
     if (includeUrl) {
       const url = extractUrl(entry);

@@ -0,0 +1,74 @@
+"""Generate the glossary reference list page from apa_lookup.json.
+
+The dynamic glossary resolves per-term references from apa_lookup.json (built by
+bibtex_to_apa/bibtex_to_apa.js from the "Glossary BibTex" Google Doc). This script
+renders the same data as the public "List of References" page so the two stay in
+sync. Run it after apa_lookup.json is regenerated:
+
+    python3 content/glossary/_build_references.py
+"""
+
+import json
+import os
+import re
+
+# Input and output paths are both anchored on this script's own directory:
+# apa_lookup.json sits next to the script (input for main) and the generated
+# page goes to references/index.md below it (output of main).
+script_dir = os.path.dirname(os.path.abspath(__file__))
+apa_path = os.path.join(script_dir, "apa_lookup.json")
+out_path = os.path.join(script_dir, "references", "index.md")
+
+# An http(s) URL: a run of characters that are neither whitespace nor angle
+# brackets, whose last character is additionally not one of . , ; ) so that
+# trailing sentence punctuation stays outside the match. The single capture
+# group spans the whole URL.
+URL_RE = re.compile(r"(https?://[^\s<>]+[^\s<>.,;)])")
+
+# Everything written before the first entry: front matter, the intro text with
+# an alert shortcode, and the opening tag of the wrapper <div> (styled as a
+# hanging indent). main() appends the entries and the matching closing </div>.
+HEADER = """---
+title: List of References
+---
+
+You can find the list of all references that were used to create the Glossary.
+
+{{< alert info >}}
+
+We are currently working on a better way to display and cross-link the references with the terms they are used for.
+
+{{< /alert >}}
+
+<div class="csl-bib-body" style="line-height: 2; margin-left: 2em; text-indent:-2em;">
+"""
+
+
+def sort_key(citation):
+    """Return a case-insensitive bibliography sort key for an HTML citation fragment.
+
+    HTML tags are removed before leading non-word characters are stripped, so an
+    entry that opens with markup (e.g. an italicised title, ``<i>Title</i>``)
+    sorts under its first visible letter instead of under the tag name ("i").
+    """
+    # Drop tags first: stripping only leading non-word characters would remove
+    # the "<" but leave "i>Title..." as the start of the key.
+    visible = re.sub(r"<[^>]+>", "", citation)
+    return re.sub(r"^[^\w]+", "", visible).casefold()
+
+
+def render(citation):
+    """Return *citation* with every http(s) URL wrapped in an ``<a>`` element.
+
+    Each match of URL_RE becomes ``<a href="URL">URL</a>``: the matched text is
+    reused verbatim as both link target and link label, and the rest of the
+    fragment passes through untouched. No HTML escaping is performed here.
+
+    NOTE(review): the output is meant for a Markdown page rendered by goldmark
+    with unsafe=true; whether a literal "&" inside the href is escaped
+    downstream should be confirmed against the rendered page.
+    """
+    # \g<1> is the pattern's only capture group, which spans the whole URL.
+    return URL_RE.sub(r'<a href="\g<1>">\g<1></a>', citation)
+
+
+def main():
+    """Write the "List of References" page from apa_lookup.json.
+
+    Reads the key -> citation mapping at apa_path, collapses identical citation
+    strings, sorts them with sort_key, and writes HEADER followed by one
+    ``csl-entry`` div per citation (linkified by render) to out_path.
+    """
+    with open(apa_path, encoding="utf-8") as f:
+        apa = json.load(f)
+
+    # Dedupe identical citation strings (distinct keys can share a reference).
+    # The citation itself is the tie-breaker: iteration order of a set of
+    # strings varies between interpreter runs (hash randomization), so without
+    # it two entries with equal sort keys could swap places from run to run
+    # and make the generated page differ for no real reason.
+    citations = sorted(set(apa.values()), key=lambda c: (sort_key(c), c))
+
+    lines = [HEADER]
+    lines.extend(f'  <div class="csl-entry">{render(c)}</div>' for c in citations)
+    lines.append("</div>\n")
+
+    # Create references/ if it does not exist yet; exist_ok keeps reruns quiet.
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    with open(out_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+    print(f"Wrote {len(citations)} references to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
@@ -47,12 +47,16 @@
     print("Warning: apa_lookup.json not found. References will not be formatted.")
     apa_lookup = {}
 
-def process_references(references_text, apa_lookup, missing_refs_log=None):
+def process_references(references_text, apa_lookup, missing_refs_log=None, context=""):
     """Convert citation keys to APA format using the lookup"""
     if not references_text:
         return []
 
-    citation_pattern = r'\\?\[@([^\\]+)\\?\]'
+    # Capture the key lazily up to the closing (optionally backslash-escaped) bracket,
+    # so keys with escaped characters survive — notably escaped underscores in pandoc
+    # citekeys (e.g. \[@R\_Core\_Team2020\]), which the old [^\\]+ class truncated at the
+    # first backslash. The backslashes are unescaped out of each captured key below.
+    citation_pattern = r'\\?\[@(.+?)\\?\]'
     matches = re.findall(citation_pattern, references_text)
 
     formatted_refs = []
@@ -61,6 +65,9 @@ def process_references(references_text, apa_lookup, missing_refs_log=None):
         key = match.strip()
         original_key = key  # Keep for logging
 
+        # Unescape backslash escapes (e.g. \_ -> _) so the key matches apa_lookup
+        key = re.sub(r'\\(.)', r'\1', key)
+
         # Remove markdown formatting
         key = re.sub(r'^\*+|\*+$', '', key)  # Remove leading/trailing asterisks
         key = re.sub(r'^_+|_+$', '', key)    # Remove leading/trailing underscores
@@ -84,7 +91,37 @@ def process_references(references_text, apa_lookup, missing_refs_log=None):
                 if missing_refs_log is not None:
                     missing_refs_log.add(original_key)
                 print(f"Warning: Missing reference key '{original_key}' (cleaned: '{key}') - skipping")
-
+
+    # A bare URL (or markdown link) is a valid reference type — keep any that remain
+    # in the residual rather than dropping them, unless the URL is already part of a
+    # resolved citation. Bare URLs are wrapped as markdown links so Hugo renders them.
+    residual = re.sub(citation_pattern, '', references_text)
+    md_links = re.findall(r'\[[^\]]*\]\(https?://[^)]+\)', residual)
+    leftover = residual
+    for link in md_links:
+        leftover = leftover.replace(link, ' ', 1)
+    bare_urls = re.findall(r'(?:https?://|www\.)[^\s,;)\]]+', leftover)
+
+    def _url_of(link):
+        m = re.search(r'\((https?://[^)]+)\)', link)
+        return (m.group(1) if m else link).rstrip('.,;)')
+
+    for link in md_links:
+        if not any(_url_of(link) in ref for ref in formatted_refs):
+            formatted_refs.append(link)
+    for url in bare_urls:
+        url = url.rstrip('.,;)')
+        if not any(url in ref for ref in formatted_refs):
+            formatted_refs.append(f'[{url}]({url})')
+
+    # Whatever is left once citekeys and URLs are removed is genuine free-text that the
+    # pipeline can't resolve — surface it so it can be turned into a citekey.
+    for url in bare_urls:
+        leftover = leftover.replace(url, ' ', 1)
+    if re.search(r'[A-Za-z0-9@]', leftover):
+        where = f" in {context}" if context else ""
+        print(f"Warning: unresolved free-text reference{where} (needs a citekey): {leftover.strip(' ,;.')!r}")
+
     return list(dict.fromkeys(formatted_refs))
 
 def fix_bare_urls_in_parens(text):
@@ -197,9 +234,12 @@ def clean_filename(title, max_length=200):
             else:
                 title = en_title
 
-        # Process references
-        raw_references = safe_get(row, "Reference")
-        processed_references = process_references(raw_references, apa_lookup, missing_refs)
+        # Process references: always combine the generic shared column with any
+        # language-specific column (e.g. AR_refs, CN_refs), generic first. Both use
+        # the same [@citekey] format resolved via apa_lookup; dedupe across them.
+        processed_references = process_references(safe_get(row, "Reference"), apa_lookup, missing_refs, context=f"{title} (Reference)")
+        lang_refs = process_references(safe_get(row, f"{language_code}_refs"), apa_lookup, missing_refs, context=f"{title} ({language_code}_refs)")
+        processed_references = list(dict.fromkeys(processed_references + lang_refs))
 
         # Build entry
         definition = safe_get(row, f"{language_code}_definition" if language_code == "EN" else f"{language_code}_def")