Skip to content
Open
  •  
  •  
  •  
8 changes: 8 additions & 0 deletions .github/workflows/data-processing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,14 @@ jobs:
cd bibtex_to_apa
node bibtex_to_apa.js -o '../content/glossary/apa_lookup.json'

#========================================
# Regenerate the glossary reference list page from apa_lookup.json so the
# public "List of References" stays in sync with the dynamic glossary.
#========================================
- name: Build glossary reference list
continue-on-error: true # Continue even if this step fails
run: python3 content/glossary/_build_references.py

#========================================
# Process and generate glossary files
#========================================
Expand Down
16 changes: 13 additions & 3 deletions bibtex_to_apa/bibtex_to_apa.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,21 @@ function bibtexToApaJson(bibtexContent, includeUrl = true) {

for (const entry of cite.data) {
const key = entry.id || entry['citation-key'];
let ref = new Cite(entry).format('bibliography', {
format: 'text',
// Render as HTML so journal titles / volumes keep their APA italics, then peel
// off citation-js's wrapping <div class="csl-bib-body"><div class="csl-entry">…
// so each value is a clean inline HTML fragment.
const html = new Cite(entry).format('bibliography', {
format: 'html',
template: 'apa',
lang: 'en-US'
}).trim();
});
const m = html.match(/class="csl-entry"[^>]*>([\s\S]*?)<\/div>/);
let ref = (m ? m[1] : html).trim();
// citation-js escapes every "&" as &#38; (in author lists AND inside URLs).
// Decode to a literal "&" so it survives Hugo's markdownify cleanly: authors
// re-escape to &amp;, and URLs with query strings (e.g. ...&oldid=) link
// correctly instead of double-escaping to &amp;#38;.
ref = ref.replace(/&#38;/g, '&');

if (includeUrl) {
const url = extractUrl(entry);
Expand Down
74 changes: 74 additions & 0 deletions content/glossary/_build_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Generate the glossary reference list page from apa_lookup.json.

The dynamic glossary resolves per-term references from apa_lookup.json (built by
bibtex_to_apa/bibtex_to_apa.js from the "Glossary BibTex" Google Doc). This script
renders the same data as the public "List of References" page so the two stay in
sync. Run it after apa_lookup.json is regenerated:

python3 content/glossary/_build_references.py
"""

import json
import os
import re

# Input and output locations are resolved relative to this script's own
# directory, so the result does not depend on the current working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
apa_path = os.path.join(script_dir, "apa_lookup.json")
out_path = os.path.join(script_dir, "references", "index.md")

# Matches an http(s) URL running up to the next whitespace or angle bracket.
# The final character may not be one of . , ; ) so that trailing sentence
# punctuation stays outside the match. Group 1 is the whole URL.
URL_RE = re.compile(r"(https?://[^\s<>]+[^\s<>.,;)])")

# Fixed top of the generated page: front matter, intro text, an alert
# shortcode and the opening container <div>. main() appends one entry per
# citation after it and then the closing </div>.
HEADER = """---
title: List of References
---

You can find the list of all references that were used to create the Glossary.

{{< alert info >}}

We are currently working on a better way to display and cross-link the references with the terms they are used for.

{{< /alert >}}

<div class="csl-bib-body" style="line-height: 2; margin-left: 2em; text-indent:-2em;">
"""


def sort_key(citation):
    """Return the bibliography sort key for a citation HTML fragment.

    The key is the citation text with any leading markup and punctuation
    removed, case-folded so ordering is case-insensitive.

    Leading HTML tags are removed as whole units. Stripping only non-word
    characters would turn ``<i>Title</i>`` into ``i>Title</i>`` and file
    every entry that starts with an italic title under "i".

    Args:
        citation: One citation string (an inline HTML fragment).

    Returns:
        The case-folded string to sort by.
    """
    # Try a complete tag first; fall back to a single non-word character so
    # a stray "<" with no closing ">" is still skipped.
    return re.sub(r"^(?:<[^>]*>|[^\w])+", "", citation).casefold()


def render(citation):
    """Wrap every URL found in a citation fragment in an anchor tag.

    Each match of ``URL_RE`` is replaced by ``<a href="URL">URL</a>``; the
    rest of the fragment is returned untouched. No HTML escaping is done
    here.

    NOTE(review): the original author relies on the Markdown renderer
    (goldmark, unsafe=true) to escape a literal "&" in both the link text and
    the href. That depends on site configuration not visible here; confirm.
    """
    # \1 inserts the matched URL verbatim into both the href and the text.
    return URL_RE.sub(r'<a href="\1">\1</a>', citation)


def main():
    """Build references/index.md from apa_lookup.json.

    Reads the key-to-citation mapping, drops duplicate citation strings,
    orders them with ``sort_key``, renders each as a ``csl-entry`` div after
    ``HEADER``, and writes the page to ``out_path`` (creating the directory
    if needed).
    """
    with open(apa_path, encoding="utf-8") as src:
        lookup = json.load(src)

    # Different keys may map to the same citation text; list each text once.
    unique_citations = set(lookup.values())
    ordered = sorted(unique_citations, key=sort_key)

    entries = [f' <div class="csl-entry">{render(entry)}</div>' for entry in ordered]
    page = "\n".join([HEADER, *entries, "</div>\n"])

    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as dst:
        dst.write(page)

    print(f"Wrote {len(ordered)} references to {out_path}")


# Run the generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()
52 changes: 46 additions & 6 deletions content/glossary/_create_glossaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,16 @@
print("Warning: apa_lookup.json not found. References will not be formatted.")
apa_lookup = {}

def process_references(references_text, apa_lookup, missing_refs_log=None):
def process_references(references_text, apa_lookup, missing_refs_log=None, context=""):
"""Convert citation keys to APA format using the lookup"""
if not references_text:
return []

citation_pattern = r'\\?\[@([^\\]+)\\?\]'
# Capture the key lazily up to the closing (optionally backslash-escaped) bracket,
# so keys with escaped characters survive — notably escaped underscores in pandoc
# citekeys (e.g. \[@R\_Core\_Team2020\]), which the old [^\\]+ class truncated at the
# first backslash. The backslashes are unescaped out of each captured key below.
citation_pattern = r'\\?\[@(.+?)\\?\]'
matches = re.findall(citation_pattern, references_text)

formatted_refs = []
Expand All @@ -61,6 +65,9 @@ def process_references(references_text, apa_lookup, missing_refs_log=None):
key = match.strip()
original_key = key # Keep for logging

# Unescape backslash escapes (e.g. \_ -> _) so the key matches apa_lookup
key = re.sub(r'\\(.)', r'\1', key)

# Remove markdown formatting
key = re.sub(r'^\*+|\*+$', '', key) # Remove leading/trailing asterisks
key = re.sub(r'^_+|_+$', '', key) # Remove leading/trailing underscores
Expand All @@ -84,7 +91,37 @@ def process_references(references_text, apa_lookup, missing_refs_log=None):
if missing_refs_log is not None:
missing_refs_log.add(original_key)
print(f"Warning: Missing reference key '{original_key}' (cleaned: '{key}') - skipping")


# A bare URL (or markdown link) is a valid reference type — keep any that remain
# in the residual rather than dropping them, unless the URL is already part of a
# resolved citation. Bare URLs are wrapped as markdown links so Hugo renders them.
residual = re.sub(citation_pattern, '', references_text)
md_links = re.findall(r'\[[^\]]*\]\(https?://[^)]+\)', residual)
leftover = residual
for link in md_links:
leftover = leftover.replace(link, ' ', 1)
bare_urls = re.findall(r'(?:https?://|www\.)[^\s,;)\]]+', leftover)

def _url_of(link):
m = re.search(r'\((https?://[^)]+)\)', link)
return (m.group(1) if m else link).rstrip('.,;)')

for link in md_links:
if not any(_url_of(link) in ref for ref in formatted_refs):
formatted_refs.append(link)
for url in bare_urls:
url = url.rstrip('.,;)')
if not any(url in ref for ref in formatted_refs):
formatted_refs.append(f'[{url}]({url})')

# Whatever is left once citekeys and URLs are removed is genuine free-text that the
# pipeline can't resolve — surface it so it can be turned into a citekey.
for url in bare_urls:
leftover = leftover.replace(url, ' ', 1)
if re.search(r'[A-Za-z0-9@]', leftover):
where = f" in {context}" if context else ""
print(f"Warning: unresolved free-text reference{where} (needs a citekey): {leftover.strip(' ,;.')!r}")

return list(dict.fromkeys(formatted_refs))

def fix_bare_urls_in_parens(text):
Expand Down Expand Up @@ -197,9 +234,12 @@ def clean_filename(title, max_length=200):
else:
title = en_title

# Process references
raw_references = safe_get(row, "Reference")
processed_references = process_references(raw_references, apa_lookup, missing_refs)
# Process references: always combine the generic shared column with any
# language-specific column (e.g. AR_refs, CN_refs), generic first. Both use
# the same [@citekey] format resolved via apa_lookup; dedupe across them.
processed_references = process_references(safe_get(row, "Reference"), apa_lookup, missing_refs, context=f"{title} (Reference)")
lang_refs = process_references(safe_get(row, f"{language_code}_refs"), apa_lookup, missing_refs, context=f"{title} ({language_code}_refs)")
processed_references = list(dict.fromkeys(processed_references + lang_refs))

# Build entry
definition = safe_get(row, f"{language_code}_definition" if language_code == "EN" else f"{language_code}_def")
Expand Down
Loading
Loading