Skip to content

Commit 78ee23c

Browse files
committed
refactor: apply DOI and provider prioritization only for ORCID
1 parent 8b3d651 commit 78ee23c

2 files changed

Lines changed: 13 additions & 3 deletions

File tree

server/workers/base/src/base.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import subprocess
44
import pandas as pd
5+
import logging
56
from common.r_wrapper import RWrapper
67
from common.deduplication import (
78
find_version_in_doi,
@@ -29,6 +30,7 @@
2930
import sys
3031
from common.rate_limiter import RateLimiter
3132

33+
logger = logging.getLogger(__name__)
3234

3335
class BaseClient(RWrapper):
3436
def __init__(self, *args):
@@ -50,14 +52,18 @@ def next_item(self):
5052
message = json.loads(message.decode("utf-8"))
5153
request_id = message.get("id")
5254
params = self.add_default_params(message.get("params"))
55+
original_service = params.get("original_service")
5356
params["service"] = "base"
57+
if original_service:
58+
params["original_service"] = original_service
5459
endpoint = message.get("endpoint")
5560
self.logger.debug(f"Request ID: {request_id}, Params: {params}, Endpoint: {endpoint}")
5661
return request_id, params, endpoint
5762

5863
def execute_search(self, params):
5964
q = params.get("q")
6065
service = params.get("service")
66+
original_service = params.get("original_service", service)
6167
data = {}
6268
data["params"] = params
6369
cmd = [self.command, self.runner, self.wd, q, service]
@@ -80,7 +86,7 @@ def execute_search(self, params):
8086
else:
8187
metadata = pd.DataFrame(raw_metadata)
8288
metadata = self.sanitize_metadata(metadata)
83-
metadata = filter_duplicates(metadata)
89+
metadata = filter_duplicates(metadata, original_service)
8490
metadata = pd.concat(
8591
[metadata, parse_annotations_for_all(metadata, "subject_orig")],
8692
axis=1,
@@ -234,7 +240,7 @@ def handle_contentproviders(self, request_id, params):
234240
pattern_annotations = re.compile(r"([A-Za-z]+:[\w'\- ]+);?")
235241

236242

237-
def filter_duplicates(df):
243+
def filter_duplicates(df, service):
238244
df.drop_duplicates("id", inplace=True, keep="first")
239245
df["is_anchor"] = False
240246
df["doi_duplicate"] = False
@@ -262,10 +268,13 @@ def filter_duplicates(df):
262268
df = remove_textual_duplicates_from_different_sources(df, dupind)
263269
df = add_false_negatives(df)
264270
df = mark_latest_doi(df, dupind)
271+
265272
pure_datasets = df[df.typenorm == "7"]
266273
non_datasets = df.loc[df.index.difference(pure_datasets.index)]
274+
267275
non_datasets = prioritize_OA_and_latest(non_datasets, dupind)
268-
non_datasets = prioritize_doi_and_provider(non_datasets, dupind)
276+
if service == "orcid":
277+
non_datasets = prioritize_doi_and_provider(non_datasets, dupind)
269278
pure_datasets = mark_latest_doi(pure_datasets, dupind)
270279

271280
pure_datasets_condition_mask = (pure_datasets.is_anchor == True) | (pure_datasets.is_duplicate == False)

server/workers/orcid/src/orcid_service.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ def request_base_metadata(self, dois: List[str], params: Dict[str, str]) -> pd.D
177177
'today': '2024-10-21',
178178
'unique_id': 'abf2625e2d84eb4367fb443e2cb6f4a1',
179179
'service': 'base',
180+
'original_service': 'orcid',
180181
'embed': 'false',
181182
'vis_id': 'abf2625e2d84eb4367fb443e2cb6f4a1',
182183
'limit': 120,

0 commit comments

Comments
 (0)