66
77KEYWORD_SIMILARITY_THRESHOLD = 85
88
9+ OA_STATE_PRIORITY = {
10+ "1" : 0 , # yes -- best rank; the lowest number wins the comparison
11+ "0" : 1 , # no
12+ "2" : 2 , # unknown -- worst rank among the recognized states
13+ }
14+
915def enrich_anchor_using_duplicates (df , dupind , subject_strategy = STRATEGY_MERGE ):
1016 """
1117 Enriches anchor elements using data from duplicates in their groups.
@@ -18,6 +24,7 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
1824 - subject_orig: processed according to subject_strategy
1925 - subject: processed according to subject_strategy
2026 - paper_abstract: replaced with the longest description
27+ - oa_state: replaced with the highest priority status (yes > no > unknown)
2128
2229 Args:
2330 df: DataFrame with metadata, containing the column is_anchor
@@ -32,8 +39,9 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
3239 has_subject_orig = 'subject_orig' in df .columns
3340 has_subject = 'subject' in df .columns
3441 has_paper_abstract = 'paper_abstract' in df .columns
42+ has_oa_state = 'oa_state' in df .columns
3543
36- is_all_columns_are_missing = not has_subject_orig and not has_subject and not has_paper_abstract
44+ is_all_columns_are_missing = not has_subject_orig and not has_subject and not has_paper_abstract and not has_oa_state
3745 if is_all_columns_are_missing :
3846 return df
3947
@@ -69,6 +77,9 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
6977 best_paper_abstract = None
7078 best_paper_abstract_length = 0
7179
80+ best_oa_state = None
81+ best_oa_state_priority = float ('inf' )
82+
7283 for element_idx in idx :
7384 if has_subject_orig :
7485 subject_orig_value = group_data .loc [element_idx , 'subject_orig' ]
@@ -105,6 +116,16 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
105116 best_paper_abstract_length = abstract_length
106117 best_paper_abstract = paper_abstract_value
107118
119+ if has_oa_state :
120+ oa_state_value = group_data .loc [element_idx , 'oa_state' ]
121+ is_not_empty = not pd .isna (oa_state_value )
122+ if is_not_empty :
123+ oa_state_str = str (oa_state_value )
124+ priority = OA_STATE_PRIORITY .get (oa_state_str , float ('inf' ))
125+ if priority < best_oa_state_priority :
126+ best_oa_state_priority = priority
127+ best_oa_state = oa_state_value
128+
108129 if has_subject_orig :
109130 if is_use_merge_strategy :
110131 if all_subject_orig_keywords :
@@ -137,6 +158,12 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
137158 if pd .isna (current ) or str (current ) != str (best_paper_abstract ):
138159 df .loc [anchor_idx , 'paper_abstract' ] = best_paper_abstract
139160
161+ is_better_oa_state_presented = best_oa_state is not None
162+ if is_better_oa_state_presented :
163+ current = df .loc [anchor_idx , 'oa_state' ]
164+ if pd .isna (current ) or str (current ) != str (best_oa_state ):
165+ df .loc [anchor_idx , 'oa_state' ] = best_oa_state
166+
140167 return df
141168
142169def deduplicate_keywords (keywords , similarity_threshold = KEYWORD_SIMILARITY_THRESHOLD ):
0 commit comments