4545
4646
4747
from typing import Iterable, List, Union


def extract_doi(list_text: Union[str, Iterable[str]]) -> str:
    """Return the first DOI found in the given text(s), or "" if none.

    Args:
        list_text: Either a single string or an iterable of candidate
            strings, searched in order. Empty / ``None`` candidates are
            skipped. A bare string is accepted so that call sites written
            against the earlier single-string signature keep working
            instead of silently returning "" (a string would otherwise be
            iterated character by character and never match).

    Returns:
        Capture group 1 of the first ``DOI_RE`` match, or "" when no
        candidate matches.
    """
    if not list_text:
        return ""

    # Normalise the single-string call style to a one-element sequence.
    candidates = [list_text] if isinstance(list_text, str) else list_text

    for text in candidates:
        if not text:
            continue
        m = DOI_RE.search(text)
        if m:
            return m.group(1)

    return ""
5360
5461
5562def make_bib_key (authors_bibtex : str , year : str , title : str ) -> str :
@@ -139,48 +146,67 @@ def bibtex_to_fields(bibtex_str: str) -> dict:
139146 out [str (k ).lower ().strip ()] = str (v ).strip ()
140147 return out
141148
142-
143-
144- def pick_title (pub : dict ) -> str :
def pick_basics(pub: dict) -> tuple:
    """Collect the basic metadata of one publication record in one pass.

    Args:
        pub: Publication mapping. Its ``"bib"`` entry (treated as empty
            when missing or falsy) supplies title, author, year, url and
            citation; ``pub["pub_url"]`` is preferred over ``bib["url"]``
            for the link.
            NOTE(review): presumably a ``scholarly`` publication dict —
            confirm against the caller.

    Returns:
        A 5-tuple ``(title, authors, venue, year, link)``. ``year`` is a
        stripped string; the other values are whatever ``normalize_ws``
        returns for the corresponding field ("" is passed in when the
        field is absent).
    """
    bib = pub.get("bib", {}) or {}

    title = normalize_ws(bib.get("title") or "")
    authors = normalize_ws(bib.get("author") or "")

    # The trailing ``or ""`` guarantees a non-None value, so no extra
    # None check is needed before converting to text.
    year = str(bib.get("pub_year") or bib.get("year") or "").strip()

    link = normalize_ws(pub.get("pub_url") or bib.get("url") or "")

    # The venue is derived from the free-text citation field.
    cit = normalize_ws(bib.get("citation") or "")
    if not cit:
        venue = cit
    else:
        # Prefer the text before the first comma.
        head = cit.split(",", 1)[0].strip()
        if head and head.lower() != "unknown":
            venue = head
        else:
            # Otherwise use the whole citation minus a trailing
            # (optionally parenthesised) 19xx/20xx year.
            venue = re.sub(r"\s*\(?\b(19|20)\d{2}\b\)?\s*$", "", cit).strip()

    return title, authors, venue, year, link
179174
175+ # def pick_title(pub: dict) -> str:
176+ # bib = pub.get("bib", {}) or {}
177+ # title = normalize_ws(bib.get("title") or "")
178+ # # author = normalize_ws(bib.get("author") or "")
179+ # # year = bib.get("pub_year") or bib.get("year") or ""
180+ # # year = str(y).strip() if y is not None else ""
181+ # return title
182+
183+
184+ # def pick_authors(pub: dict) -> str:
185+ # bib = pub.get("bib", {}) or {}
186+ # return normalize_ws(bib.get("author") or "")
187+
188+
189+ # def pick_year(pub: dict) -> str:
190+ # bib = pub.get("bib", {}) or {}
191+ # y = bib.get("pub_year") or bib.get("year") or ""
192+ # return str(y).strip() if y is not None else ""
193+
194+
195+ # def pick_venue(pub: dict) -> str:
196+
197+ # bib = pub.get("bib", {}) or {}
198+ # cit = normalize_ws(bib.get("citation") or "")
199+
200+ # if cit:
201+ # head = cit.split(",", 1)[0].strip()
202+ # if head and head.lower() != "unknown":
203+ # cit = head
204+
205+ # else:
206+ # cit = re.sub(r"\s*\(?\b(19|20)\d{2}\b\)?\s*$", "", cit).strip()
207+
208+ # return cit
180209
181- def pick_link (pub : dict ) -> str :
182- bib = pub .get ("bib" , {}) or {}
183- return normalize_ws (pub .get ("pub_url" ) or bib .get ("url" ) or "" )
184210
185211# Springer
186212def springer_bibtex_by_doi (doi : str ) -> str :
@@ -504,14 +530,11 @@ def crossref_search_best(title: str, year: str, venue: str, rows: int = 5) -> di
504530
505531 return best_item or {}
506532
507- def acm_dl_bibtex_by_doi (doi : str ) -> str :
533+ def acm_bibtex_by_doi (doi : str ) -> str :
508534 """
509535 Best-effort attempt to download BibTeX from ACM DL for 10.1145/* DOIs.
510536 This endpoint sometimes changes / may require access; keep it best-effort.
511537 """
512- if not doi .startswith ("10.1145/" ):
513- return ""
514- # Common ACM export endpoint pattern:
515538 url = "https://dl.acm.org/action/downloadCitation"
516539 params = {"doi" : doi , "format" : "bibtex" }
517540 try :
@@ -613,24 +636,23 @@ def main(scholar_id:str,
613636 except Exception :
614637 p_full = p
615638
616- title = pick_title (p_full )
639+ title , authors , venue , year , link = pick_basics (p_full )
640+
617641 if not title :
642+ # Too little information to actually do something
618643 continue
619644
620- year = pick_year (p_full )
621645 try :
622- y_int = int (year )
646+ year = int (year )
623647 except Exception :
624648 continue
625- if y_int not in allowed_years :
649+
650+ if year not in allowed_years :
651+ # Not within the year window
626652 break
627653
628654 print (f"Doing for { idx } " )
629655
630- authors = pick_authors (p_full )
631- venue = pick_venue (p_full )
632- link = pick_link (p_full )
633-
634656 # Pull scholar bibtex (still useful as fallback, DOI extraction, etc.)
635657 scholar_bibtex = ""
636658 scholar_fields = {}
@@ -685,47 +707,51 @@ def main(scholar_id:str,
685707 time .sleep (1.0 )
686708 continue
687709
688- # Not arXiv -> prefer Crossref (with validation), else scholarly bibtex, else minimal entry.
689- chosen_entry = {}
710+ entry = {}
690711
691712 # 1) Try Crossref via DOI if we can extract it
692- doi = (
693- extract_doi (link )
694- or extract_doi (scholar_bibtex )
695- or extract_doi (json .dumps (scholar_fields , ensure_ascii = False ))
696- )
697-
698- acm_bib = acm_dl_bibtex_by_doi (doi ) if doi else ""
699- if acm_bib :
700- chosen_entry = build_entry_keep_all_fields (
701- acm_bib ,
702- title_fallback = title ,
703- venue_fallback = venue ,
704- year_fallback = year ,
705- link_fallback = link ,
706- abstract_fallback = "" , # ACM BibTeX sometimes includes abstract; if it does, keep_all preserves it
707- )
713+ doi = extract_doi ([link , scholar_bibtex ,
714+ json .dumps (scholar_fields , ensure_ascii = False )])
715+ # doi = (
716+ # extract_doi(link)
717+ # or extract_doi(scholar_bibtex)
718+ # or extract_doi(json.dumps(scholar_fields, ensure_ascii=False))
719+ # )
720+
721+ if doi and doi .startswith ("10.1145/" ):
722+ acm_bib = acm_bibtex_by_doi (doi )
723+ if acm_bib :
724+ entry = build_entry_keep_all_fields (
725+ acm_bib ,
726+ title_fallback = title ,
727+ venue_fallback = venue ,
728+ year_fallback = year ,
729+ link_fallback = link ,
730+ abstract_fallback = "" ,
731+ )
708732
709- if not chosen_entry and doi and doi .startswith ("10.1007/" ):
733+ if not entry and doi and doi .startswith ("10.1007/" ):
710734 sp_bib = springer_bibtex_by_doi (doi )
711735 if sp_bib :
712- chosen_entry = build_entry_keep_all_fields (
736+ entry = build_entry_keep_all_fields (
713737 sp_bib ,
714738 title_fallback = title ,
715739 venue_fallback = venue ,
716740 year_fallback = year ,
717741 link_fallback = link ,
718- abstract_fallback = "" , # Springer bibtex usually won’t include abstract
742+ abstract_fallback = "" ,
719743 )
720744
721745 crossref_bib = ""
722746 crossref_msg = {}
723747
724- if not chosen_entry and doi :
748+
749+ # CROSSREF by DOI
750+ if not entry and doi :
725751 try :
726- # Validate year & venue/title using the Crossref message too (stronger)
727752 crossref_msg = crossref_lookup_by_doi (doi )
728- # Basic checks
753+
754+ # To verify crossref entry
729755 cr_title = ""
730756 if isinstance (crossref_msg .get ("title" ), list ) and crossref_msg ["title" ]:
731757 cr_title = crossref_msg ["title" ][0 ]
@@ -753,7 +779,7 @@ def main(scholar_id:str,
753779 crossref_bib = ""
754780 crossref_msg = {}
755781
756- # 2) If no DOI path worked, search Crossref by title and validate
782+ # CROSSREF by title year
757783 if not crossref_bib :
758784 try :
759785 best = crossref_search_best (title = title , year = year , venue = venue , rows = 5 )
@@ -768,48 +794,48 @@ def main(scholar_id:str,
768794 except Exception :
769795 crossref_bib = ""
770796
771- # 3) Build entry from Crossref bibtex if available
772797 if crossref_bib :
773798 cr_abs = strip_html_tags (crossref_msg .get ("abstract" ) or "" )
774- chosen_entry = build_entry_keep_all_fields (
799+ entry = build_entry_keep_all_fields (
775800 crossref_bib ,
776801 title_fallback = title ,
777802 venue_fallback = venue ,
778803 year_fallback = year ,
779804 link_fallback = link ,
780805 abstract_fallback = cr_abs ,
781806 )
782- # If Crossref JSON has abstract, optionally include it (nice-to-have)
783- cr_abs = strip_html_tags (crossref_msg .get ("abstract" ) or "" )
807+
808+ cr_abs = strip_html_tags (crossref_msg .get ("abstract" ) or "" ) # Keep abstract if available
784809 if cr_abs :
785- chosen_entry ["abstract" ] = cr_abs
810+ entry ["abstract" ] = cr_abs
786811
787- # 4) Otherwise, fall back to scholarly BibTeX -> parse into entry
788- if not chosen_entry and scholar_bibtex :
789- chosen_entry = build_entry_keep_all_fields (
812+ # Scholarly bibtex if they have that
813+ if not entry and scholar_bibtex :
814+ entry = build_entry_keep_all_fields (
790815 scholar_bibtex ,
791816 title_fallback = title ,
792817 venue_fallback = venue ,
793818 year_fallback = year ,
794819 link_fallback = link ,
795- abstract_fallback = "" , # scholar bibtex usually won't have it
820+ abstract_fallback = "" ,
796821 )
797822
798- # 5) Absolute last resort: minimal entry
799- if not chosen_entry :
800- chosen_entry = {
823+ # Minimum
824+ if not entry :
825+ authors = normalize_authors_to_bibtex (authors )
826+ entry = {
801827 "ENTRYTYPE" : "misc" ,
802- "ID" : make_bib_key (normalize_authors_to_bibtex ( authors ) , year , title ),
828+ "ID" : make_bib_key (authors , year , title ),
803829 "title" : title ,
804- "author" : normalize_authors_to_bibtex ( authors ) ,
830+ "author" : authors ,
805831 "year" : year ,
806832 }
807833 if venue :
808- chosen_entry ["howpublished" ] = venue
834+ entry ["howpublished" ] = venue
809835 if link :
810- chosen_entry ["url" ] = link
836+ entry ["url" ] = link
811837
812- entries .append (chosen_entry )
838+ entries .append (entry )
813839 time .sleep (1.0 )
814840
815841 write_to_bibtex (entries , outpath ,
0 commit comments