@@ -72,16 +72,13 @@ def remove_textual_duplicates_from_different_sources(df, dupind):
7272 if len (idx ) > 1 :
7373 tmp = df .loc [idx ]
7474 df .loc [tmp .index , "is_duplicate" ] = True
75- df .loc [tmp .index , "is_latest " ] = False
75+ df .loc [tmp .index , "is_anchor " ] = False
7676 publisher_dois = list (filter (None , tmp .publisher_doi .unique ().tolist ()))
7777 if len (publisher_dois ) > 0 :
7878 # keep entry with doi
79- df .loc [idx , "keep" ] = False
80- df .loc [tmp [tmp .publisher_doi != "" ].index , "is_latest" ] = True
81- df .loc [tmp [tmp .publisher_doi != "" ].index , "keep" ] = True
79+ df .loc [tmp [tmp .publisher_doi != "" ].index , "is_anchor" ] = True
8280 else :
83- df .loc [tmp .sort_values (["doi" , "year" ], ascending = [False , False ]).head (1 ).index , "is_latest" ] = True
84- df .loc [tmp .sort_values (["doi" , "year" ], ascending = [False , False ]).head (1 ).index , "keep" ] = True
81+ df .loc [tmp .sort_values (["doi" , "year" ], ascending = [False , False ]).head (1 ).index , "is_anchor" ] = True
8582 return df
8683
8784def mark_latest_doi (df , dupind ):
@@ -91,29 +88,24 @@ def mark_latest_doi(df, dupind):
9188 for udoi in list (filter (None , tmp .unversioned_doi .unique ().tolist ())):
9289 tmp2 = tmp [tmp .unversioned_doi == udoi ]
9390 if len (tmp2 ) > 0 :
94- df .loc [tmp2 .index , "is_latest" ] = False
95- df .loc [tmp2 .index , "keep" ] = False
91+ df .loc [tmp2 .index , "is_anchor" ] = False
9692 versions = tmp2 .id
9793 latest = tmp2 .sort_values ("doi_version" , ascending = False ).head (1 ).id
9894 v = [{"versions" : versions .values .tolist (), "latest" : latest .values .tolist ()}]* len (tmp2 )
9995 df .loc [versions .index , "versions" ] = v
100- df .loc [latest .index , "is_latest" ] = True
101- df .loc [latest .index , "keep" ] = True
96+ df .loc [latest .index , "is_anchor" ] = True
10297 return df
10398
def prioritize_OA_and_latest(df, dupind):
    """Choose an anchor row per duplicate group, preferring open access, then recency.

    For each group in ``dupind`` that has more than one label present in
    ``df.index``, ``is_anchor`` is set to ``False`` for all of those rows and
    then to ``True`` for the top row after sorting by ``year`` descending.
    Rows with ``oa_state == "1"`` are the candidates when any exist;
    otherwise every row of the group is a candidate.

    Args:
        df: DataFrame providing ``oa_state`` and ``year`` columns; the
            ``is_anchor`` column is written in place.
        dupind: Object exposing ``.items()`` whose values are collections of
            index labels forming one duplicate group each.

    Returns:
        The same ``df`` object that was passed in.
    """
    for _, members in dupind.items():
        # Only labels that actually exist in df can be addressed via .loc.
        present = df.index.intersection(members)
        if len(present) <= 1:
            # Nothing to prioritise for empty or single-row groups.
            continue
        group = df.loc[present]
        df.loc[present, "is_anchor"] = False
        # Filter once and reuse; fall back to the whole group without OA rows.
        open_access = group[group.oa_state == "1"]
        candidates = open_access if len(open_access) > 0 else group
        anchor = candidates.sort_values("year", ascending=False).head(1).index
        df.loc[anchor, "is_anchor"] = True
    return df
118110
119111def mark_duplicates (metadata ):
0 commit comments