Skip to content

Commit 8fb46f0

Browse files
committed
feat: finding best OA status
1 parent f4ae820 commit 8fb46f0

1 file changed

Lines changed: 28 additions & 1 deletion

File tree

server/workers/common/common/enrichment.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@
66

77
KEYWORD_SIMILARITY_THRESHOLD = 85
88

9+
OA_STATE_PRIORITY = {
10+
"1": 0, # yes
11+
"0": 1, # no
12+
"2": 2, # unknown
13+
}
14+
915
def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
1016
"""
1117
Enriches anchor elements using data from duplicates in their groups.
@@ -18,6 +24,7 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
1824
- subject_orig: processed according to subject_strategy
1925
- subject: processed according to subject_strategy
2026
- paper_abstract: replaced with the longest description
27+
- oa_state: replaced with the highest-priority non-missing status found in the group (yes > no > unknown)
2128
2229
Args:
2330
df: DataFrame with metadata, containing the column is_anchor
@@ -32,8 +39,9 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
3239
has_subject_orig = 'subject_orig' in df.columns
3340
has_subject = 'subject' in df.columns
3441
has_paper_abstract = 'paper_abstract' in df.columns
42+
has_oa_state = 'oa_state' in df.columns
3543

36-
is_all_columns_are_missing = not has_subject_orig and not has_subject and not has_paper_abstract
44+
is_all_columns_are_missing = not has_subject_orig and not has_subject and not has_paper_abstract and not has_oa_state
3745
if is_all_columns_are_missing:
3846
return df
3947

@@ -69,6 +77,9 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
6977
best_paper_abstract = None
7078
best_paper_abstract_length = 0
7179

80+
best_oa_state = None
81+
best_oa_state_priority = float('inf')
82+
7283
for element_idx in idx:
7384
if has_subject_orig:
7485
subject_orig_value = group_data.loc[element_idx, 'subject_orig']
@@ -105,6 +116,16 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
105116
best_paper_abstract_length = abstract_length
106117
best_paper_abstract = paper_abstract_value
107118

119+
if has_oa_state:
120+
oa_state_value = group_data.loc[element_idx, 'oa_state']
121+
is_not_empty = not pd.isna(oa_state_value)
122+
if is_not_empty:
123+
oa_state_str = str(oa_state_value)
124+
priority = OA_STATE_PRIORITY.get(oa_state_str, float('inf'))
125+
if priority < best_oa_state_priority:
126+
best_oa_state_priority = priority
127+
best_oa_state = oa_state_value
128+
108129
if has_subject_orig:
109130
if is_use_merge_strategy:
110131
if all_subject_orig_keywords:
@@ -137,6 +158,12 @@ def enrich_anchor_using_duplicates(df, dupind, subject_strategy=STRATEGY_MERGE):
137158
if pd.isna(current) or str(current) != str(best_paper_abstract):
138159
df.loc[anchor_idx, 'paper_abstract'] = best_paper_abstract
139160

161+
is_better_oa_state_presented = best_oa_state is not None
162+
if is_better_oa_state_presented:
163+
current = df.loc[anchor_idx, 'oa_state']
164+
if pd.isna(current) or str(current) != str(best_oa_state):
165+
df.loc[anchor_idx, 'oa_state'] = best_oa_state
166+
140167
return df
141168

142169
def deduplicate_keywords(keywords, similarity_threshold=KEYWORD_SIMILARITY_THRESHOLD):

0 commit comments

Comments
 (0)