|
26 | 26 | from tqdm import tqdm |
27 | 27 |
|
28 | 28 | from scholarly import scholarly |
29 | | - |
30 | | - |
31 | 29 | import bibtexparser |
32 | 30 | from bibtexparser.bwriter import BibTexWriter |
| 31 | + |
| 32 | + |
33 | 33 | # ----------------------------- |
34 | 34 | # Config |
35 | 35 | # ----------------------------- |
|
66 | 66 | def now_year() -> int: |
67 | 67 | return datetime.now(timezone.utc).year |
68 | 68 |
|
| 69 | +def clean_crossref_text(s: str) -> str: |
| 70 | + """ |
| 71 | + Convert HTML entities (&lt; etc.) to characters, strip tags (<p>), |
| 72 | + and normalize whitespace. |
| 73 | + """ |
| 74 | + if not s: |
| 75 | + return "" |
| 76 | + s = html.unescape(s) # &lt;p&gt; -> <p> |
| 77 | + s = re.sub(r"<[^>]+>", " ", s) # remove tags like <p> |
| 78 | + return normalize_ws(s) |
| 79 | + |
69 | 80 |
|
70 | 81 | def normalize_ws(s: str) -> str: |
71 | 82 | return re.sub(r"\s+", " ", (s or "")).strip() |
@@ -111,6 +122,7 @@ def token_set_ratio(a: str, b: str) -> float: |
111 | 122 | def strip_html_tags(s: str) -> str: |
112 | 123 | if not s: |
113 | 124 | return "" |
| 125 | + # Crossref abstracts are sometimes JATS-ish; strip tags crudely. |
114 | 126 | s = re.sub(r"<[^>]+>", " ", s) |
115 | 127 | s = html.unescape(s) |
116 | 128 | return normalize_ws(s) |
@@ -193,7 +205,6 @@ def parse_first_bibtex_entry(bibtex_str: str) -> dict: |
193 | 205 | out[kk.lower()] = vv # keep everything else |
194 | 206 | return out |
195 | 207 |
|
196 | | - |
197 | 208 | def prefer_doi_key(entry: dict) -> None: |
198 | 209 | """If doi exists, use it as BibTeX key (ID) like @...{10.1145/...,...}.""" |
199 | 210 | doi = (entry.get("doi") or "").strip() |
@@ -319,15 +330,15 @@ def springer_bibtex_by_doi(doi: str) -> str: |
319 | 330 | def get_bibtex_with_fallback(p_full: dict, title: str) -> str: |
320 | 331 | # 1) Try directly |
321 | 332 | try: |
322 | | - s = scholarly.bibtex(p_full) # Get bibtext directly |
| 333 | + s = scholarly.bibtex(p_full) |
323 | 334 | if s: |
324 | 335 | return s |
325 | 336 | except Exception: |
326 | 337 | pass |
327 | 338 |
|
328 | 339 | # 2) Fallback: search by title |
329 | 340 | try: |
330 | | - q = scholarly.search_pubs(title) # Search by title |
| 341 | + q = scholarly.search_pubs(title) |
331 | 342 | pub2 = next(q, None) |
332 | 343 | if not pub2: |
333 | 344 | return "" |
@@ -465,6 +476,10 @@ def build_entry_keep_all_fields( |
465 | 476 | "url": link_fallback, |
466 | 477 | }, overwrite=False) |
467 | 478 |
|
| 479 | + # Killing html elements |
| 480 | + if entry.get("title"): |
| 481 | + entry["title"] = clean_crossref_text(entry["title"]) |
| 482 | + |
468 | 483 | # Venue: only patch if missing in BOTH journal/booktitle |
469 | 484 | if not (entry.get("journal") or entry.get("booktitle")): |
470 | 485 | # choose booktitle for inproceedings, otherwise journal |
@@ -720,6 +735,8 @@ def main(): |
720 | 735 | if y_int not in allowed_years: |
721 | 736 | break |
722 | 737 |
|
| 738 | + print(f"Doing for {idx}") |
| 739 | + |
723 | 740 | authors = pick_authors(p_full) |
724 | 741 | venue = pick_venue(p_full) |
725 | 742 | link = pick_link(p_full) |
@@ -853,6 +870,7 @@ def main(): |
853 | 870 | best_doi = (best.get("DOI") or "").strip() |
854 | 871 | if best_doi: |
855 | 872 | crossref_bib = crossref_bibtex_transform(best_doi) |
| 873 | + # also fetch message if you want abstract etc. |
856 | 874 | try: |
857 | 875 | crossref_msg = crossref_lookup_by_doi(best_doi) |
858 | 876 | except Exception: |
|
0 commit comments