Skip to content

Commit 6fd83b8

Browse files
committed
refactor: use is_anchor flag instead of keep and is_latest
1 parent 1419b03 commit 6fd83b8

3 files changed

Lines changed: 14 additions & 25 deletions

File tree

server/workers/base/src/base.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -235,11 +235,10 @@ def handle_contentproviders(self, request_id, params):
235235

236236
def filter_duplicates(df):
237237
df.drop_duplicates("id", inplace=True, keep="first")
238-
df["is_latest"] = True
238+
df["is_anchor"] = False
239239
df["doi_duplicate"] = False
240240
df["has_relations"] = False
241241
df["link_duplicate"] = False
242-
df["keep"] = False
243242
df["duplicates"] = df.apply(
244243
lambda x: ",".join([x["id"], x["duplicates"]])
245244
if len(x["duplicates"].split(",")) >= 1
@@ -266,17 +265,16 @@ def filter_duplicates(df):
266265
non_datasets = df.loc[df.index.difference(pure_datasets.index)]
267266
non_datasets = prioritize_OA_and_latest(non_datasets, dupind)
268267
pure_datasets = mark_latest_doi(pure_datasets, dupind)
269-
filtered_non_datasets = non_datasets[non_datasets.is_latest == True]
268+
filtered_non_datasets = non_datasets[non_datasets.is_anchor == True]
270269
filtered_datasets = pure_datasets[
271-
(pure_datasets.keep == True) | (pure_datasets.is_duplicate == False)
270+
(pure_datasets.is_anchor == True) | (pure_datasets.is_duplicate == False)
272271
]
273272
filtered = pd.concat([filtered_non_datasets, filtered_datasets])
274273
filtered.sort_index(inplace=True)
275274
for c in [
276275
"doi_duplicate",
277276
"link_duplicate",
278-
"is_latest",
279-
"keep",
277+
"is_anchor",
280278
"duplicates",
281279
"doi_version",
282280
"unversioned_doi",

server/workers/base/tests/unit/test_base.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,7 @@ def test_filter_duplicates():
193193
# Add extra columns that filter_duplicates is supposed to drop.
194194
df["doi_duplicate"] = False
195195
df["link_duplicate"] = False
196-
df["is_latest"] = True
197-
df["keep"] = False
196+
df["is_anchor"] = False
198197
df["doi_version"] = ["v1", "v1", "v2"]
199198
df["unversioned_doi"] = ["doi1", "doi1", "doi2"]
200199
df["publisher_doi"] = ["pub1", "pub1", "pub2"]
@@ -203,7 +202,7 @@ def test_filter_duplicates():
203202
filtered = filter_duplicates(df.copy())
204203
# Verify that the dropped columns are not present.
205204
for col in [
206-
"doi_duplicate", "link_duplicate", "is_latest", "keep",
205+
"doi_duplicate", "link_duplicate", "is_anchor",
207206
"doi_version", "unversioned_doi", "publisher_doi", "has_relations"
208207
]:
209208
assert col not in filtered.columns

server/workers/common/common/deduplication.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,13 @@ def remove_textual_duplicates_from_different_sources(df, dupind):
7272
if len(idx) > 1:
7373
tmp = df.loc[idx]
7474
df.loc[tmp.index, "is_duplicate"] = True
75-
df.loc[tmp.index, "is_latest"] = False
75+
df.loc[tmp.index, "is_anchor"] = False
7676
publisher_dois = list(filter(None, tmp.publisher_doi.unique().tolist()))
7777
if len(publisher_dois) > 0:
7878
# keep entry with doi
79-
df.loc[idx, "keep"] = False
80-
df.loc[tmp[tmp.publisher_doi!=""].index, "is_latest"] = True
81-
df.loc[tmp[tmp.publisher_doi!=""].index, "keep"] = True
79+
df.loc[tmp[tmp.publisher_doi!=""].index, "is_anchor"] = True
8280
else:
83-
df.loc[tmp.sort_values(["doi", "year"], ascending=[False, False]).head(1).index, "is_latest"] = True
84-
df.loc[tmp.sort_values(["doi", "year"], ascending=[False, False]).head(1).index, "keep"] = True
81+
df.loc[tmp.sort_values(["doi", "year"], ascending=[False, False]).head(1).index, "is_anchor"] = True
8582
return df
8683

8784
def mark_latest_doi(df, dupind):
@@ -91,29 +88,24 @@ def mark_latest_doi(df, dupind):
9188
for udoi in list(filter(None, tmp.unversioned_doi.unique().tolist())):
9289
tmp2 = tmp[tmp.unversioned_doi == udoi]
9390
if len(tmp2) > 0:
94-
df.loc[tmp2.index, "is_latest"] = False
95-
df.loc[tmp2.index, "keep"] = False
91+
df.loc[tmp2.index, "is_anchor"] = False
9692
versions = tmp2.id
9793
latest = tmp2.sort_values("doi_version", ascending=False).head(1).id
9894
v = [{"versions": versions.values.tolist(), "latest": latest.values.tolist()}]*len(tmp2)
9995
df.loc[versions.index, "versions"] = v
100-
df.loc[latest.index, "is_latest"] = True
101-
df.loc[latest.index, "keep"] = True
96+
df.loc[latest.index, "is_anchor"] = True
10297
return df
10398

10499
def prioritize_OA_and_latest(df, dupind):
105100
for _, idx in dupind.items():
106101
idx = df.index.intersection(idx)
107102
if len(idx) > 1:
108103
tmp = df.loc[idx]
109-
df.loc[idx, "keep"] = False
110-
df.loc[idx, "is_latest"] = False
104+
df.loc[idx, "is_anchor"] = False
111105
if len(tmp[tmp.oa_state=="1"]) > 0:
112-
df.loc[tmp[tmp.oa_state=="1"].sort_values("year", ascending=False).head(1).index, "keep"] = True
113-
df.loc[tmp[tmp.oa_state=="1"].sort_values("year", ascending=False).head(1).index, "is_latest"] = True
106+
df.loc[tmp[tmp.oa_state=="1"].sort_values("year", ascending=False).head(1).index, "is_anchor"] = True
114107
else:
115-
df.loc[tmp.sort_values("year", ascending=False).head(1).index, "keep"] = True
116-
df.loc[tmp.sort_values("year", ascending=False).head(1).index, "is_latest"] = True
108+
df.loc[tmp.sort_values("year", ascending=False).head(1).index, "is_anchor"] = True
117109
return df
118110

119111
def mark_duplicates(metadata):

0 commit comments

Comments (0)