22import json
33import subprocess
44import pandas as pd
5+ import logging
56from common .r_wrapper import RWrapper
67from common .deduplication import (
78 find_version_in_doi ,
2930import sys
3031from common .rate_limiter import RateLimiter
3132
33+ logger = logging .getLogger (__name__ )
3234
3335class BaseClient (RWrapper ):
3436 def __init__ (self , * args ):
@@ -50,14 +52,18 @@ def next_item(self):
5052 message = json .loads (message .decode ("utf-8" ))
5153 request_id = message .get ("id" )
5254 params = self .add_default_params (message .get ("params" ))
55+ original_service = params .get ("original_service" )
5356 params ["service" ] = "base"
57+ if original_service :
58+ params ["original_service" ] = original_service
5459 endpoint = message .get ("endpoint" )
5560 self .logger .debug (f"Request ID: { request_id } , Params: { params } , Endpoint: { endpoint } " )
5661 return request_id , params , endpoint
5762
5863 def execute_search (self , params ):
5964 q = params .get ("q" )
6065 service = params .get ("service" )
66+ original_service = params .get ("original_service" , service )
6167 data = {}
6268 data ["params" ] = params
6369 cmd = [self .command , self .runner , self .wd , q , service ]
@@ -80,7 +86,7 @@ def execute_search(self, params):
8086 else :
8187 metadata = pd .DataFrame (raw_metadata )
8288 metadata = self .sanitize_metadata (metadata )
83- metadata = filter_duplicates (metadata )
89+ metadata = filter_duplicates (metadata , original_service )
8490 metadata = pd .concat (
8591 [metadata , parse_annotations_for_all (metadata , "subject_orig" )],
8692 axis = 1 ,
@@ -234,7 +240,7 @@ def handle_contentproviders(self, request_id, params):
234240pattern_annotations = re .compile (r"([A-Za-z]+:[\w'\- ]+);?" )
235241
236242
237- def filter_duplicates (df ):
243+ def filter_duplicates (df , service ):
238244 df .drop_duplicates ("id" , inplace = True , keep = "first" )
239245 df ["is_anchor" ] = False
240246 df ["doi_duplicate" ] = False
@@ -262,10 +268,13 @@ def filter_duplicates(df):
262268 df = remove_textual_duplicates_from_different_sources (df , dupind )
263269 df = add_false_negatives (df )
264270 df = mark_latest_doi (df , dupind )
271+
265272 pure_datasets = df [df .typenorm == "7" ]
266273 non_datasets = df .loc [df .index .difference (pure_datasets .index )]
274+
267275 non_datasets = prioritize_OA_and_latest (non_datasets , dupind )
268- non_datasets = prioritize_doi_and_provider (non_datasets , dupind )
276+ if service == "orcid" :
277+ non_datasets = prioritize_doi_and_provider (non_datasets , dupind )
269278 pure_datasets = mark_latest_doi (pure_datasets , dupind )
270279
271280 pure_datasets_condition_mask = (pure_datasets .is_anchor == True ) | (pure_datasets .is_duplicate == False )
0 commit comments