Skip to content

Commit 8bd6bfe

Browse files
committed
update
1 parent 84d9812 commit 8bd6bfe

File tree

1 file changed

+115
-89
lines changed

1 file changed

+115
-89
lines changed

scripts/update_pubs.py

Lines changed: 115 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,18 @@
4545

4646

4747

48-
def extract_doi(text: str) -> str:
49-
if not text:
50-
return ""
51-
m = DOI_RE.search(text)
52-
return m.group(1) if m else ""
48+
from typing import List
49+
def extract_doi(list_text: "Union[str, List[str]]") -> str:
    """Return the first DOI found in the candidate text(s), or "" if none.

    Args:
        list_text: Candidate strings, searched in order. A single bare
            string is also accepted (the pre-refactor call style) and is
            treated as a one-element list. ``None`` or an empty value
            yields "".

    Returns:
        Group 1 of the first ``DOI_RE`` match (presumably the DOI itself;
        ``DOI_RE`` is defined elsewhere in this module), or "" when no
        candidate matches.
    """
    if not list_text:
        # Covers None, "" and an empty list without touching the regex.
        return ""

    if isinstance(list_text, str):
        # Iterating a bare string would scan it one character at a time and
        # never match, so wrap it instead.
        list_text = [list_text]

    for text in list_text:
        if not text:
            # Skip empty / None candidates instead of failing on them.
            continue
        m = DOI_RE.search(text)
        if m:
            return m.group(1)

    return ""
5360

5461

5562
def make_bib_key(authors_bibtex: str, year: str, title: str) -> str:
@@ -139,48 +146,67 @@ def bibtex_to_fields(bibtex_str: str) -> dict:
139146
out[str(k).lower().strip()] = str(v).strip()
140147
return out
141148

142-
143-
144-
def pick_title(pub: dict) -> str:
149+
def pick_basics(pub: dict) -> tuple:
    """Collect the basic metadata of one publication record in a single pass.

    Args:
        pub: Publication dict. Fields are read from ``pub["bib"]``
            (``title``, ``author``, ``pub_year``/``year``, ``url``,
            ``citation``) and from ``pub["pub_url"]``. A missing or falsy
            ``bib`` is treated as an empty dict.

    Returns:
        A ``(title, authors, venue, year, link)`` tuple. ``year`` is the
        stripped string form of ``pub_year`` (falling back to ``year``);
        the other fields are passed through ``normalize_ws``. Missing
        values are fed in as "".
    """
    bib = pub.get("bib", {}) or {}

    title = normalize_ws(bib.get("title") or "")
    authors = normalize_ws(bib.get("author") or "")

    # `or ""` already rules out None, so a plain str() conversion suffices.
    year = str(bib.get("pub_year") or bib.get("year") or "").strip()

    # Prefer the publication-level URL over the one stored in the bib record.
    link = normalize_ws(pub.get("pub_url") or bib.get("url") or "")

    # Venue is derived from the free-text citation string.
    cit = normalize_ws(bib.get("citation") or "")
    venue = cit
    if cit:
        # The text before the first comma is taken as the venue name.
        head = cit.split(",", 1)[0].strip()
        if head and head.lower() != "unknown":
            venue = head
        else:
            # No usable head: keep the whole citation minus a trailing
            # (optionally parenthesised) 19xx/20xx year.
            venue = re.sub(r"\s*\(?\b(19|20)\d{2}\b\)?\s*$", "", cit).strip()

    return title, authors, venue, year, link
179174

175+
# def pick_title(pub: dict) -> str:
176+
# bib = pub.get("bib", {}) or {}
177+
# title = normalize_ws(bib.get("title") or "")
178+
# # author = normalize_ws(bib.get("author") or "")
179+
# # year = bib.get("pub_year") or bib.get("year") or ""
180+
# # year = str(y).strip() if y is not None else ""
181+
# return title
182+
183+
184+
# def pick_authors(pub: dict) -> str:
185+
# bib = pub.get("bib", {}) or {}
186+
# return normalize_ws(bib.get("author") or "")
187+
188+
189+
# def pick_year(pub: dict) -> str:
190+
# bib = pub.get("bib", {}) or {}
191+
# y = bib.get("pub_year") or bib.get("year") or ""
192+
# return str(y).strip() if y is not None else ""
193+
194+
195+
# def pick_venue(pub: dict) -> str:
196+
197+
# bib = pub.get("bib", {}) or {}
198+
# cit = normalize_ws(bib.get("citation") or "")
199+
200+
# if cit:
201+
# head = cit.split(",", 1)[0].strip()
202+
# if head and head.lower() != "unknown":
203+
# cit = head
204+
205+
# else:
206+
# cit = re.sub(r"\s*\(?\b(19|20)\d{2}\b\)?\s*$", "", cit).strip()
207+
208+
# return cit
180209

181-
def pick_link(pub: dict) -> str:
182-
bib = pub.get("bib", {}) or {}
183-
return normalize_ws(pub.get("pub_url") or bib.get("url") or "")
184210

185211
# Springer
186212
def springer_bibtex_by_doi(doi: str) -> str:
@@ -504,14 +530,11 @@ def crossref_search_best(title: str, year: str, venue: str, rows: int = 5) -> di
504530

505531
return best_item or {}
506532

507-
def acm_dl_bibtex_by_doi(doi: str) -> str:
533+
def acm_bibtex_by_doi(doi: str) -> str:
508534
"""
509535
Best-effort attempt to download BibTeX from ACM DL for 10.1145/* DOIs.
510536
This endpoint sometimes changes / may require access; keep it best-effort.
511537
"""
512-
if not doi.startswith("10.1145/"):
513-
return ""
514-
# Common ACM export endpoint pattern:
515538
url = "https://dl.acm.org/action/downloadCitation"
516539
params = {"doi": doi, "format": "bibtex"}
517540
try:
@@ -613,24 +636,23 @@ def main(scholar_id:str,
613636
except Exception:
614637
p_full = p
615638

616-
title = pick_title(p_full)
639+
title, authors, venue, year, link = pick_basics(p_full)
640+
617641
if not title:
642+
# Too little information to actually do something
618643
continue
619644

620-
year = pick_year(p_full)
621645
try:
622-
y_int = int(year)
646+
year = int(year)
623647
except Exception:
624648
continue
625-
if y_int not in allowed_years:
649+
650+
if year not in allowed_years:
651+
# Not within the year window
626652
break
627653

628654
print(f"Doing for {idx}")
629655

630-
authors = pick_authors(p_full)
631-
venue = pick_venue(p_full)
632-
link = pick_link(p_full)
633-
634656
# Pull scholar bibtex (still useful as fallback, DOI extraction, etc.)
635657
scholar_bibtex = ""
636658
scholar_fields = {}
@@ -685,47 +707,51 @@ def main(scholar_id:str,
685707
time.sleep(1.0)
686708
continue
687709

688-
# Not arXiv -> prefer Crossref (with validation), else scholarly bibtex, else minimal entry.
689-
chosen_entry = {}
710+
entry = {}
690711

691712
# Extract a DOI from the link, the Scholar BibTeX, or the Scholar fields; it drives the ACM, Springer, and Crossref lookups below
692-
doi = (
693-
extract_doi(link)
694-
or extract_doi(scholar_bibtex)
695-
or extract_doi(json.dumps(scholar_fields, ensure_ascii=False))
696-
)
697-
698-
acm_bib = acm_dl_bibtex_by_doi(doi) if doi else ""
699-
if acm_bib:
700-
chosen_entry = build_entry_keep_all_fields(
701-
acm_bib,
702-
title_fallback=title,
703-
venue_fallback=venue,
704-
year_fallback=year,
705-
link_fallback=link,
706-
abstract_fallback="", # ACM BibTeX sometimes includes abstract; if it does, keep_all preserves it
707-
)
713+
doi = extract_doi([link, scholar_bibtex,
714+
json.dumps(scholar_fields, ensure_ascii=False)])
715+
# doi = (
716+
# extract_doi(link)
717+
# or extract_doi(scholar_bibtex)
718+
# or extract_doi(json.dumps(scholar_fields, ensure_ascii=False))
719+
# )
720+
721+
if doi and doi.startswith("10.1145/"):
722+
acm_bib = acm_bibtex_by_doi(doi)
723+
if acm_bib:
724+
entry = build_entry_keep_all_fields(
725+
acm_bib,
726+
title_fallback=title,
727+
venue_fallback=venue,
728+
year_fallback=year,
729+
link_fallback=link,
730+
abstract_fallback="",
731+
)
708732

709-
if not chosen_entry and doi and doi.startswith("10.1007/"):
733+
if not entry and doi and doi.startswith("10.1007/"):
710734
sp_bib = springer_bibtex_by_doi(doi)
711735
if sp_bib:
712-
chosen_entry = build_entry_keep_all_fields(
736+
entry = build_entry_keep_all_fields(
713737
sp_bib,
714738
title_fallback=title,
715739
venue_fallback=venue,
716740
year_fallback=year,
717741
link_fallback=link,
718-
abstract_fallback="", # Springer bibtex usually won’t include abstract
742+
abstract_fallback="",
719743
)
720744

721745
crossref_bib = ""
722746
crossref_msg = {}
723747

724-
if not chosen_entry and doi:
748+
749+
# CROSSREF by DOI
750+
if not entry and doi:
725751
try:
726-
# Validate year & venue/title using the Crossref message too (stronger)
727752
crossref_msg = crossref_lookup_by_doi(doi)
728-
# Basic checks
753+
754+
# To verify crossref entry
729755
cr_title = ""
730756
if isinstance(crossref_msg.get("title"), list) and crossref_msg["title"]:
731757
cr_title = crossref_msg["title"][0]
@@ -753,7 +779,7 @@ def main(scholar_id:str,
753779
crossref_bib = ""
754780
crossref_msg = {}
755781

756-
# 2) If no DOI path worked, search Crossref by title and validate
782+
# CROSSREF by title year
757783
if not crossref_bib:
758784
try:
759785
best = crossref_search_best(title=title, year=year, venue=venue, rows=5)
@@ -768,48 +794,48 @@ def main(scholar_id:str,
768794
except Exception:
769795
crossref_bib = ""
770796

771-
# 3) Build entry from Crossref bibtex if available
772797
if crossref_bib:
773798
cr_abs = strip_html_tags(crossref_msg.get("abstract") or "")
774-
chosen_entry = build_entry_keep_all_fields(
799+
entry = build_entry_keep_all_fields(
775800
crossref_bib,
776801
title_fallback=title,
777802
venue_fallback=venue,
778803
year_fallback=year,
779804
link_fallback=link,
780805
abstract_fallback=cr_abs,
781806
)
782-
# If Crossref JSON has abstract, optionally include it (nice-to-have)
783-
cr_abs = strip_html_tags(crossref_msg.get("abstract") or "")
807+
808+
cr_abs = strip_html_tags(crossref_msg.get("abstract") or "") # Keep abstract if available
784809
if cr_abs:
785-
chosen_entry["abstract"] = cr_abs
810+
entry["abstract"] = cr_abs
786811

787-
# 4) Otherwise, fall back to scholarly BibTeX -> parse into entry
788-
if not chosen_entry and scholar_bibtex:
789-
chosen_entry = build_entry_keep_all_fields(
812+
# Scholarly bibtex if they have that
813+
if not entry and scholar_bibtex:
814+
entry = build_entry_keep_all_fields(
790815
scholar_bibtex,
791816
title_fallback=title,
792817
venue_fallback=venue,
793818
year_fallback=year,
794819
link_fallback=link,
795-
abstract_fallback="", # scholar bibtex usually won't have it
820+
abstract_fallback="",
796821
)
797822

798-
# 5) Absolute last resort: minimal entry
799-
if not chosen_entry:
800-
chosen_entry = {
823+
# Minimum
824+
if not entry:
825+
authors = normalize_authors_to_bibtex(authors)
826+
entry = {
801827
"ENTRYTYPE": "misc",
802-
"ID": make_bib_key(normalize_authors_to_bibtex(authors), year, title),
828+
"ID": make_bib_key(authors, year, title),
803829
"title": title,
804-
"author": normalize_authors_to_bibtex(authors),
830+
"author": authors,
805831
"year": year,
806832
}
807833
if venue:
808-
chosen_entry["howpublished"] = venue
834+
entry["howpublished"] = venue
809835
if link:
810-
chosen_entry["url"] = link
836+
entry["url"] = link
811837

812-
entries.append(chosen_entry)
838+
entries.append(entry)
813839
time.sleep(1.0)
814840

815841
write_to_bibtex(entries, outpath,

0 commit comments

Comments
 (0)