diff --git a/biii-import/biseEU_LD_export.py b/biii-import/biseEU_LD_export.py index b274745..5870110 100644 --- a/biii-import/biseEU_LD_export.py +++ b/biii-import/biseEU_LD_export.py @@ -412,7 +412,7 @@ def rdfize_bioschema_tool(json_entry): for item in entry["field_has_function"]: if "target_uuid" in item.keys(): - if not "featureList" in out.keys(): + if "featureList" not in out.keys(): out["featureList"] = [{"@id": item["target_uuid"]}] else: out["featureList"].append({"@id": item["target_uuid"]}) @@ -424,7 +424,7 @@ def rdfize_bioschema_tool(json_entry): for item in entry["field_has_topic"]: if "target_uuid" in item.keys(): - if not "hasTopic" in out.keys(): + if "hasTopic" not in out.keys(): out["hasTopic"] = [{"@id": item["target_uuid"]}] # print(f"Added first topic {item['target_uuid']}") else: @@ -432,7 +432,7 @@ def rdfize_bioschema_tool(json_entry): # print(f"Added another topic {item['target_uuid']}") for item in entry["field_has_reference_publication"]: - if not "citation" in out.keys(): + if "citation" not in out.keys(): out["citation"] = [] if item["uri"]: out["citation"].append({"@id": item["uri"].strip()}) @@ -440,13 +440,13 @@ def rdfize_bioschema_tool(json_entry): out["citation"].append(item["title"]) for item in entry["field_has_license"]: - if not "license" in out.keys(): + if "license" not in out.keys(): out["license"] = [] if item["value"]: out["license"].append(item["value"]) for item in entry["field_has_author"]: - if not "publisher" in out.keys(): + if "publisher" not in out.keys(): out["publisher"] = [] if item["value"]: out["publisher"].append(item["value"]) @@ -462,12 +462,12 @@ def rdfize_bioschema_tool(json_entry): out["dateModified"] = str(date.isoformat()) for item in entry["field_is_dependent_of"]: - if not "softwareRequirements" in out.keys(): + if "softwareRequirements" not in out.keys(): out["softwareRequirements"] = [] if item["target_id"]: - out["softwareRequirements"].append({ - "@id": "http://biii.eu/node/" + str(item["target_id"]).strip() - }) + out["softwareRequirements"].append( + {"@id": "http://biii.eu/node/" + str(item["target_id"]).strip()} + ) out.update(ctx) @@ -536,13 +536,13 @@ def rdfize(json_entry): entry["hasTitle"] = entry["title"][0]["value"] for item in entry["field_image"]: - if not "hasIllustration" in entry.keys(): + if "hasIllustration" not in entry.keys(): entry["hasIllustration"] = [item["url"]] else: entry["hasIllustration"].append(item["url"]) for item in entry["field_has_author"]: - if not "hasAuthor" in entry.keys(): + if "hasAuthor" not in entry.keys(): entry["hasAuthor"] = [] if item["value"]: entry["hasAuthor"].append(item["value"]) @@ -553,7 +553,7 @@ def rdfize(json_entry): for item in entry["field_has_function"]: # print(item) if "target_uuid" in item.keys(): - if not "hasFunction" in entry.keys(): + if "hasFunction" not in entry.keys(): entry["hasFunction"] = [{"@id": item["target_uuid"]}] else: entry["hasFunction"].append({"@id": item["target_uuid"]}) @@ -561,7 +561,7 @@ def rdfize(json_entry): for item in entry["field_has_topic"]: # print(item) if "target_uuid" in item.keys(): - if not "hasTopic" in entry.keys(): + if "hasTopic" not in entry.keys(): entry["hasTopic"] = [{"@id": item["target_uuid"]}] else: entry["hasTopic"].append({"@id": item["target_uuid"]}) @@ -569,44 +569,44 @@ def rdfize(json_entry): for item in entry["field_is_dependent_of"]: # print(item) if "target_id" in item.keys(): - if not "requires" in entry.keys(): + if "requires" not in entry.keys(): entry["requires"] = [ {"@id": 
"http://biii.eu/node/" + str(item["target_id"])} ] else: - entry["requires"].append({ - "@id": "http://biii.eu/node/" + str(item["target_id"]) - }) + entry["requires"].append( + {"@id": "http://biii.eu/node/" + str(item["target_id"])} + ) for item in entry["field_has_reference_publication"]: - if not "citation" in entry.keys(): + if "citation" not in entry.keys(): entry["citation"] = [] if item["uri"]: - entry["citation"].append({ - "@id": urllib.parse.quote(item["uri"], safe=":/") - }) + entry["citation"].append( + {"@id": urllib.parse.quote(item["uri"], safe=":/")} + ) if item["title"]: entry["citation"].append(item["title"]) for item in entry["field_has_location"]: - if not "location" in entry.keys(): + if "location" not in entry.keys(): entry["location"] = [] if item["uri"]: - entry["location"].append({ - "@id": urllib.parse.quote(item["uri"], safe=":/") - }) + entry["location"].append( + {"@id": urllib.parse.quote(item["uri"], safe=":/")} + ) if item["title"]: entry["location"].append(item["title"]) for item in entry["field_has_license"]: - if not "license" in entry.keys(): + if "license" not in entry.keys(): entry["license"] = [] if item["value"]: entry["license"].append(item["value"]) for item in entry["field_license_openness"]: if "target_id" in item.keys(): - if not "openess" in entry.keys(): + if "openess" not in entry.keys(): entry["openess"] = [ { "@id": "http://biii.eu/taxonomy/term/" @@ -614,14 +614,17 @@ def rdfize(json_entry): } ] else: - entry["openess"].append({ - "@id": "http://biii.eu/taxonomy/term/" + str(item["target_id"]) - }) + entry["openess"].append( + { + "@id": "http://biii.eu/taxonomy/term/" + + str(item["target_id"]) + } + ) for item in entry["field_has_implementation"]: # print(item) if "target_id" in item.keys(): - if not "hasImplementation" in entry.keys(): + if "hasImplementation" not in entry.keys(): entry["hasImplementation"] = [ { "@id": "http://biii.eu/taxonomy/term/" @@ -629,13 +632,16 @@ def rdfize(json_entry): } ] else: - entry["hasImplementation"].append({ - "@id": "http://biii.eu/taxonomy/term/" + str(item["target_id"]) - }) + entry["hasImplementation"].append( + { + "@id": "http://biii.eu/taxonomy/term/" + + str(item["target_id"]) + } + ) for item in entry["field_type"]: if "target_id" in item.keys(): - if not "hasType" in entry.keys(): + if "hasType" not in entry.keys(): entry["hasType"] = [ { "@id": "http://biii.eu/taxonomy/term/" @@ -643,13 +649,16 @@ def rdfize(json_entry): } ] else: - entry["hasType"].append({ - "@id": "http://biii.eu/taxonomy/term/" + str(item["target_id"]) - }) + entry["hasType"].append( + { + "@id": "http://biii.eu/taxonomy/term/" + + str(item["target_id"]) + } + ) for item in entry["field_has_programming_language"]: if "target_id" in item.keys(): - if not "hasProgrammingLanguage" in entry.keys(): + if "hasProgrammingLanguage" not in entry.keys(): entry["hasProgrammingLanguage"] = [ { "@id": "http://biii.eu/taxonomy/term/" @@ -657,13 +666,16 @@ def rdfize(json_entry): } ] else: - entry["hasProgrammingLanguage"].append({ - "@id": "http://biii.eu/taxonomy/term/" + str(item["target_id"]) - }) + entry["hasProgrammingLanguage"].append( + { + "@id": "http://biii.eu/taxonomy/term/" + + str(item["target_id"]) + } + ) for item in entry["field_platform"]: if "target_id" in item.keys(): - if not "hasPlatform" in entry.keys(): + if "hasPlatform" not in entry.keys(): entry["hasPlatform"] = [ { "@id": "http://biii.eu/taxonomy/term/" @@ -671,13 +683,16 @@ def rdfize(json_entry): } ] else: - entry["hasPlatform"].append({ - "@id": 
"http://biii.eu/taxonomy/term/" + str(item["target_id"]) - }) + entry["hasPlatform"].append( + { + "@id": "http://biii.eu/taxonomy/term/" + + str(item["target_id"]) + } + ) for item in entry["field_supported_image_dimension"]: if "target_id" in item.keys(): - if not "hasSupportedImageDimension" in entry.keys(): + if "hasSupportedImageDimension" not in entry.keys(): entry["hasSupportedImageDimension"] = [ { "@id": "http://biii.eu/taxonomy/term/" @@ -685,58 +700,61 @@ def rdfize(json_entry): } ] else: - entry["hasSupportedImageDimension"].append({ - "@id": "http://biii.eu/taxonomy/term/" + str(item["target_id"]) - }) + entry["hasSupportedImageDimension"].append( + { + "@id": "http://biii.eu/taxonomy/term/" + + str(item["target_id"]) + } + ) for item in entry["field_is_covered_by_training_mat"]: if "target_id" in item.keys(): - if not "hasTrainingMaterial" in entry.keys(): + if "hasTrainingMaterial" not in entry.keys(): entry["hasTrainingMaterial"] = [ {"@id": "http://biii.eu/node/" + str(item["target_id"])} ] else: - entry["hasTrainingMaterial"].append({ - "@id": "http://biii.eu/node/" + str(item["target_id"]) - }) + entry["hasTrainingMaterial"].append( + {"@id": "http://biii.eu/node/" + str(item["target_id"])} + ) for item in entry["field_has_documentation"]: - if not "uri" in entry.keys(): + if "uri" not in entry.keys(): entry["hasDocumentation"] = [] if item["uri"]: - entry["hasDocumentation"].append({ - "@id": urllib.parse.quote(item["uri"], safe=":/") - }) + entry["hasDocumentation"].append( + {"@id": urllib.parse.quote(item["uri"], safe=":/")} + ) if item["title"]: entry["hasDocumentation"].append(item["title"]) for item in entry["field_has_comparison"]: - if not "uri" in entry.keys(): + if "uri" not in entry.keys(): entry["hasComparison"] = [] if item["uri"]: - entry["hasComparison"].append({ - "@id": urllib.parse.quote(item["uri"], safe=":/") - }) + entry["hasComparison"].append( + {"@id": urllib.parse.quote(item["uri"], safe=":/")} + ) if item["title"]: entry["hasComparison"].append(item["title"]) for item in entry["field_has_usage_example"]: - if not "uri" in entry.keys(): + if "uri" not in entry.keys(): entry["hasUsageExample"] = [] if item["uri"]: - entry["hasUsageExample"].append({ - "@id": urllib.parse.quote(item["uri"], safe=":/") - }) + entry["hasUsageExample"].append( + {"@id": urllib.parse.quote(item["uri"], safe=":/")} + ) if item["title"]: entry["hasUsageExample"].append(item["title"]) for item in entry["field_has_doi"]: - if not "uri" in entry.keys(): + if "uri" not in entry.keys(): entry["hasDOI"] = [] if item["uri"]: - entry["hasDOI"].append({ - "@id": urllib.parse.quote(item["uri"], safe=":/") - }) + entry["hasDOI"].append( + {"@id": urllib.parse.quote(item["uri"], safe=":/")} + ) if item["title"]: entry["hasDOI"].append(item["title"]) @@ -796,7 +814,7 @@ def get_software_list(connection): except urllib3.exceptions.HTTPError as e: print("Connection error") print(e) - except json.decoder.JSONDecodeError as e: + except json.decoder.JSONDecodeError: print(f"Contents are not propertly formatted in {req.geturl()}") print("Response contents:", req.data) return None diff --git a/bioconda-import/bioconda_importer.py b/bioconda-import/bioconda_importer.py index 0c3b132..d2aa19f 100644 --- a/bioconda-import/bioconda_importer.py +++ b/bioconda-import/bioconda_importer.py @@ -6,6 +6,7 @@ from pathlib import Path import jinja2 + def clean(content_path): import_directory = os.path.join(content_path, "imports", "bioconda") os.makedirs(import_directory, exist_ok=True) @@ -14,6 +15,7 
@@ def clean(content_path): for data_file in Path(content_path).glob("data/*/bioconda_*.yaml"): os.remove(data_file) + def parse_bioconda(directory): """ Get bioconda content data into memory. @@ -24,11 +26,19 @@ def parse_bioconda(directory): class SilentUndefined(jinja2.Undefined): def __str__(self): return "" + __repr__ = __str__ - __bool__ = lambda self: False + + def __bool__(self): + return False + __getattr__ = __getitem__ = lambda self, *a, **kw: self - __iter__ = lambda self: iter(()) - __call__ = lambda self, *a, **kw: self + + def __iter__(self): + return iter(()) + + def __call__(self, *a, **kw): + return self # load custom Undefined class in custom environment env = jinja2.Environment(undefined=SilentUndefined) @@ -51,30 +61,42 @@ def __str__(self): return data + def merge(conda, content_path): - bioconda_import_path = os.path.join(content_path, 'imports', 'bioconda') - biotools_data_path = os.path.join(content_path, 'data') + bioconda_import_path = os.path.join(content_path, "imports", "bioconda") + biotools_data_path = os.path.join(content_path, "data") for name, data in conda.items(): try: - package_name = data['package']['name'] - import_file_path = os.path.join(bioconda_import_path, f"bioconda_{package_name}.yaml") + package_name = data["package"]["name"] + import_file_path = os.path.join( + bioconda_import_path, f"bioconda_{package_name}.yaml" + ) with open(import_file_path, "w") as out: yaml.dump(data, out) - extra = data.get('extra') # safely returns None if 'extra' not in data - if not extra or 'identifiers' not in extra: + extra = data.get("extra") # safely returns None if 'extra' not in data + if not extra or "identifiers" not in extra: continue - biotools_ids = [ident.split(':')[1].lower() for ident in data['extra']['identifiers'] if ident.startswith('biotools:')] + biotools_ids = [ + ident.split(":")[1].lower() + for ident in data["extra"]["identifiers"] + if ident.startswith("biotools:") + ] for biotools_id in biotools_ids: - biotools_file_path = os.path.join(biotools_data_path, biotools_id, f"bioconda_{package_name}.yaml") + biotools_file_path = os.path.join( + biotools_data_path, biotools_id, f"bioconda_{package_name}.yaml" + ) try: with open(biotools_file_path, "w") as out: yaml.dump(data, out) except FileNotFoundError: print(f"Error trying to create the file {biotools_file_path}") except (KeyError, TypeError) as e: - print(f"Error processing {name}: missing or invalid package structure ({type(e).__name__}: {e})") + print( + f"Error processing {name}: missing or invalid package structure ({type(e).__name__}: {e})" + ) continue + class readable_dir(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): prospective_dir = values @@ -89,8 +111,11 @@ def __call__(self, parser, namespace, values, option_string=None): "readable_dir:{0} is not a readable dir".format(prospective_dir) ) + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="bioconda import script", fromfile_prefix_chars="@") + parser = argparse.ArgumentParser( + description="bioconda import script", fromfile_prefix_chars="@" + ) parser.add_argument( "biotools", help="path to RSEc content dir, e.g. 
content/", diff --git a/bioconductor-import/import.py b/bioconductor-import/import.py index 851549d..13b6568 100644 --- a/bioconductor-import/import.py +++ b/bioconductor-import/import.py @@ -7,12 +7,15 @@ import yaml # Set up logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger() # Bioconductor URL format BIOCONDUCTOR_BASE_URL = "https://bioconductor.org/packages/json/{}/bioc/packages.json" + def get_bioconductor_version(): """ Query Bioconductor to get the latest version from the config.yaml file. @@ -23,26 +26,27 @@ def get_bioconductor_version(): config_response = requests.get(config_url) config_response.raise_for_status() config = yaml.safe_load(config_response.text) - + # Extract the current release version - version = config.get('release_version') + version = config.get("release_version") if not version: logger.error("Release version not found in the config file.") return None - + logger.info(f"Detected latest Bioconductor version: {version}") return version except requests.RequestException as e: logger.error(f"Error fetching Bioconductor config: {e}") return None + def clean(): import_directory = os.path.join("imports", "bioconductor") os.makedirs(import_directory, exist_ok=True) # Get a list of all package files to be removed package_files = glob.glob(r"imports/bioconductor/*.bioconductor.json") - + # Count and remove files removed_count = len(package_files) for package_file in package_files: @@ -51,6 +55,7 @@ def clean(): # Log the number of files removed logger.info(f"Cleaned up {removed_count} previous package files.") + def retrieve(version, filters=None): """ Go through Bioconductor entries using its API for the provided version @@ -58,12 +63,14 @@ def retrieve(version, filters=None): that have 'Software' in the 'biocViews' key. """ if version is None: - logger.error("Unable to retrieve data because the Bioconductor version is not available.") + logger.error( + "Unable to retrieve data because the Bioconductor version is not available." 
+ ) return logger.info(f"Fetching data for Bioconductor version {version}...") endpoint = BIOCONDUCTOR_BASE_URL.format(version) - + try: packs = requests.get(endpoint).json() except requests.RequestException as e: @@ -71,7 +78,11 @@ def retrieve(version, filters=None): return # Filter packages with 'Software' in the 'biocViews' key - software_packs = [pack for pack in packs.values() if "biocViews" in pack and "Software" in pack["biocViews"]] + software_packs = [ + pack + for pack in packs.values() + if "biocViews" in pack and "Software" in pack["biocViews"] + ] if not software_packs: logger.warning("No packages with 'Software' in 'biocViews' found.") @@ -79,19 +90,29 @@ def retrieve(version, filters=None): logger.info(f"Found {len(software_packs)} packages with 'Software' in 'biocViews'.") total_packs = len(software_packs) - + # Save the packages and log the progress for idx, pack in enumerate(software_packs, start=1): - package_name = pack['Package'].lower() - path = os.path.join("imports", "bioconductor", f"{package_name}.bioconductor.json") - + package_name = pack["Package"].lower() + path = os.path.join( + "imports", "bioconductor", f"{package_name}.bioconductor.json" + ) + try: with open(path, "w") as write_file: - json.dump(pack, write_file, sort_keys=True, indent=4, separators=(",", ": ")) + json.dump( + pack, write_file, sort_keys=True, indent=4, separators=(",", ": ") + ) logger.info(f"Saved {idx}/{total_packs} - {package_name}") try: - citation_html = requests.get(f"https://www.bioconductor.org/packages/release/bioc/citations/{pack['Package']}/citation.html").text - citation_path = os.path.join("imports", "bioconductor", f"{package_name}.bioconductor.citation.html") + citation_html = requests.get( + f"https://www.bioconductor.org/packages/release/bioc/citations/{pack['Package']}/citation.html" + ).text + citation_path = os.path.join( + "imports", + "bioconductor", + f"{package_name}.bioconductor.citation.html", + ) with open(citation_path, "w") as write_file: write_file.write(citation_html) except Exception as e: @@ -99,6 +120,7 @@ def retrieve(version, filters=None): except IOError as e: logger.error(f"Error saving package {package_name}: {e}") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Bioconductor import script") args = parser.parse_args() @@ -111,4 +133,3 @@ def retrieve(version, filters=None): # Retrieve new data based on the latest version retrieve(version) - diff --git a/bioconductor-to-biotools/bioconductor2biotools.py b/bioconductor-to-biotools/bioconductor2biotools.py index 26ff5c9..08e3dbc 100644 --- a/bioconductor-to-biotools/bioconductor2biotools.py +++ b/bioconductor-to-biotools/bioconductor2biotools.py @@ -11,7 +11,7 @@ BT[fa:fa-file bio.tools JSON] BCF[fa:fa-file Bioconductor Citation HTML JSON] EBT[fa:fa-file Existing bio.tools JSON] - PBP[fa:fa-cogs 1. process bioconductor package] + PBP[fa:fa-cogs 1. process bioconductor package] EP[fa:fa-cogs 2. extract publications] UPBT[fa:fa-cogs 3. update with previous data] BF --> PBP @@ -32,50 +32,55 @@ import os from bs4 import BeautifulSoup + def process_authors(author_str): """ Processes the author field, extracting names, roles, and ORCIDs, and filters only relevant authors. 
""" authors = [] - author_entries = re.split(r',(?![^\[]*\])', author_str) - + author_entries = re.split(r",(?![^\[]*\])", author_str) + for entry in author_entries: entry = entry.strip() - - roles_match = re.findall(r'\[([^\]]+)\]', entry) - roles = [role.strip() for group in roles_match for role in group.split(',')] - - orcid_match = re.search(r'\(<(https://orcid\.org/\d{4}-\d{4}-\d{4}-\d{4})>\)', entry) + + roles_match = re.findall(r"\[([^\]]+)\]", entry) + roles = [role.strip() for group in roles_match for role in group.split(",")] + + orcid_match = re.search( + r"\(<(https://orcid\.org/\d{4}-\d{4}-\d{4}-\d{4})>\)", entry + ) orcid = orcid_match.group(1) if orcid_match else None - - name_match = re.match(r'^[^\[\(<]+', entry) + + name_match = re.match(r"^[^\[\(<]+", entry) if name_match: type_role = [] author_entry = {"name": name_match.group(0).strip()} - if 'aut' in roles or 'cre' in roles or 'ctb' in roles: + if "aut" in roles or "cre" in roles or "ctb" in roles: author_entry["typeEntity"] = "Person" - elif 'fnd' in roles: + elif "fnd" in roles: author_entry["typeEntity"] = "Funding agency" - if 'ctb' in roles or 'fnd' in roles: + if "ctb" in roles or "fnd" in roles: type_role.append("Contributor") - if 'aut' in roles: + if "aut" in roles: type_role.append("Developer") - if 'cre' in roles: + if "cre" in roles: type_role.append("Maintainer") if orcid: author_entry["orcid"] = orcid if type_role: author_entry["typeRole"] = type_role authors.append(author_entry) - + return authors + def get_id(data): """ returns the bio.tools ID from the Bioconductor JSON data. """ return f"bioconductor-{data['Package'].lower()}" + def process_bioconductor_package(data): """ Converts a Bioconductor JSON entry into a bio.tools formatted dictionary. @@ -86,8 +91,18 @@ def process_bioconductor_package(data): "collectionID": ["BioConductor"], "credit": process_authors(data.get("Author", "")), "description": data.get("Description", ""), - "documentation": [{"type": ["User manual"], "url": f"http://bioconductor.org/packages/release/bioc/html/{data['Package']}.html"}], - "download": [{"type": "Source code", "url": f"http://bioconductor/packages/release/bioc/src/{data.get('source.ver', '')}"}], + "documentation": [ + { + "type": ["User manual"], + "url": f"http://bioconductor.org/packages/release/bioc/html/{data['Package']}.html", + } + ], + "download": [ + { + "type": "Source code", + "url": f"http://bioconductor/packages/release/bioc/src/{data.get('source.ver', '')}", + } + ], "homepage": f"http://bioconductor.org/packages/release/bioc/html/{data['Package']}.html", "language": ["R"], "license": data.get("License", ""), @@ -95,33 +110,43 @@ def process_bioconductor_package(data): "operatingSystem": ["Linux", "Mac", "Windows"], "owner": "bioconductor_import", "toolType": ["Command-line tool", "Library"], - "version": [data.get("Version", "")] + "version": [data.get("Version", "")], } + def extract_publications(citation_html): """ Extracts publication information from a Bioconductor citation HTML file. """ publications = [] soup = BeautifulSoup(citation_html, "html.parser") - + for link in soup.find_all("a", href=True): href = link["href"].strip() if "doi.org" in href: publications.append({"doi": href.split("doi.org/")[-1]}) - + return publications + def update_with_previous_data(new_data, previous_data): """ Updates the newly generated bio.tools data with select fields from a previous bio.tools JSON file. 
""" - keys_to_copy = ["additionDate", "biotoolsCURIE", "biotoolsID", "collectionID", "editPermission", "function"] + keys_to_copy = [ + "additionDate", + "biotoolsCURIE", + "biotoolsID", + "collectionID", + "editPermission", + "function", + ] for key in keys_to_copy: if key in previous_data: new_data[key] = previous_data[key] return new_data + def batch_process(input_dir, output_dir): """ Process all Bioconductor JSON and citation HTML files in the input directory. @@ -131,41 +156,48 @@ def batch_process(input_dir, output_dir): if filename.endswith(".json") and "bioconductor" in filename: base_name = filename.replace(".json", "") citation_file = os.path.join(input_dir, base_name + ".citation.html") - + with open(os.path.join(input_dir, filename), "r") as infile: data = json.load(infile) - output_file = os.path.join(output_dir, get_id(data) + ".biotools.json") + output_file = os.path.join(output_dir, get_id(data) + ".biotools.json") processed_data = process_bioconductor_package(data) - + if os.path.exists(citation_file): with open(citation_file, "r", encoding="utf-8") as cit_file: - processed_data["publication"] = extract_publications(cit_file.read()) - + processed_data["publication"] = extract_publications( + cit_file.read() + ) + with open(output_file, "w") as outfile: json.dump(processed_data, outfile, indent=4) + def main(): - parser = argparse.ArgumentParser(description="Process Bioconductor JSON and convert it to bio.tools JSON format.") + parser = argparse.ArgumentParser( + description="Process Bioconductor JSON and convert it to bio.tools JSON format." + ) subparsers = parser.add_subparsers(dest="mode", required=True) - + single_parser = subparsers.add_parser("single") single_parser.add_argument("bioconductor_json_file") single_parser.add_argument("bioconductor_citation_file") single_parser.add_argument("biotools_json_file") single_parser.add_argument("--previous-biotools-json-file", required=False) - + batch_parser = subparsers.add_parser("batch") batch_parser.add_argument("input_directory") batch_parser.add_argument("output_directory") - + args = parser.parse_args() - + if args.mode == "single": with open(args.bioconductor_json_file, "r") as infile: data = json.load(infile) processed_data = process_bioconductor_package(data) - with open(args.bioconductor_citation_file, "r", encoding="utf-8") as citation_file: + with open( + args.bioconductor_citation_file, "r", encoding="utf-8" + ) as citation_file: processed_data["publication"] = extract_publications(citation_file.read()) if args.previous_biotools_json_file: with open(args.previous_biotools_json_file, "r") as prevfile: @@ -176,5 +208,6 @@ def main(): elif args.mode == "batch": batch_process(args.input_directory, args.output_directory) + if __name__ == "__main__": main() diff --git a/biocontainers-import/biocontainers-importer.py b/biocontainers-import/biocontainers-importer.py index 5fcc127..ff36912 100644 --- a/biocontainers-import/biocontainers-importer.py +++ b/biocontainers-import/biocontainers-importer.py @@ -9,12 +9,12 @@ def import_biocontainers_annotations(url, content_data_path): r = requests.get(url, stream=True) if r.encoding is None: - r.encoding = 'utf-8' + r.encoding = "utf-8" annotations = yaml.safe_load(r.text) valid_tools = [] for key, value in annotations.items(): - tool_annotation_yaml = f'{content_data_path}/{key}/{key}.biocontainers.yaml' + tool_annotation_yaml = f"{content_data_path}/{key}/{key}.biocontainers.yaml" os.makedirs(os.path.dirname(tool_annotation_yaml), exist_ok=True) valid_tools.append(key) with 
open(tool_annotation_yaml, "w") as f: @@ -37,15 +37,24 @@ class readable_dir(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): prospective_dir = values if not os.path.isdir(prospective_dir): - raise argparse.ArgumentTypeError("readable_dir:{0} is not a valid path".format(prospective_dir)) + raise argparse.ArgumentTypeError( + "readable_dir:{0} is not a valid path".format(prospective_dir) + ) if os.access(prospective_dir, os.R_OK): setattr(namespace, self.dest, prospective_dir) else: - raise argparse.ArgumentTypeError("readable_dir:{0} is not a readable dir".format(prospective_dir)) + raise argparse.ArgumentTypeError( + "readable_dir:{0} is not a readable dir".format(prospective_dir) + ) -parser = argparse.ArgumentParser(description='test', fromfile_prefix_chars="@") -parser.add_argument("biotools", help="path to metadata dir, e.g. content/data/", type=str, action=readable_dir) +parser = argparse.ArgumentParser(description="test", fromfile_prefix_chars="@") +parser.add_argument( + "biotools", + help="path to metadata dir, e.g. content/data/", + type=str, + action=readable_dir, +) parser.add_argument("url", help="url to biocontainers annotations", type=str) args = parser.parse_args() diff --git a/bioschemas-gen/bioconda_bioschemas_dump.py b/bioschemas-gen/bioconda_bioschemas_dump.py index c444874..668a2fc 100644 --- a/bioschemas-gen/bioconda_bioschemas_dump.py +++ b/bioschemas-gen/bioconda_bioschemas_dump.py @@ -28,7 +28,7 @@ def process_tools(): rdf_graph.serialize( format="turtle", - destination="../../content/datasets/bioconda-dump.ttl" + destination="../../content/datasets/bioconda-dump.ttl", # destination=os.path.join(directory, tpe_id + "bioschemas.jsonld") ) diff --git a/bioschemas-gen/bioconda_to_bioschemas.py b/bioschemas-gen/bioconda_to_bioschemas.py index a1836aa..ce458ff 100644 --- a/bioschemas-gen/bioconda_to_bioschemas.py +++ b/bioschemas-gen/bioconda_to_bioschemas.py @@ -3,7 +3,6 @@ import yaml from pathlib import Path from rdflib import Graph -import json def getBiotoolsId(bioconda_data) -> str: diff --git a/bioschemas-gen/biocontainers_bioschemas_dump.py b/bioschemas-gen/biocontainers_bioschemas_dump.py index 9ad0d2c..c2370b5 100644 --- a/bioschemas-gen/biocontainers_bioschemas_dump.py +++ b/bioschemas-gen/biocontainers_bioschemas_dump.py @@ -28,7 +28,7 @@ def process_tools(): rdf_graph.serialize( format="turtle", - destination="../../content/datasets/biocontainers-dump.ttl" + destination="../../content/datasets/biocontainers-dump.ttl", # destination=os.path.join(directory, tpe_id + "bioschemas.jsonld") ) diff --git a/bioschemas-gen/biotools_bioschemas_dump.py b/bioschemas-gen/biotools_bioschemas_dump.py index 33c8b66..81c3fb1 100644 --- a/bioschemas-gen/biotools_bioschemas_dump.py +++ b/bioschemas-gen/biotools_bioschemas_dump.py @@ -11,7 +11,11 @@ def get_bioschemas_files_in_repo(): if len(filename_ext) == 3 and filename_ext[2] == "jsonld": tools.append(data_file) print(f"found {len(tools)} bioschemas descriptors") - with open("../../content/datasets/biotools_bioschemas_files_list.txt", "w", encoding="utf-8") as f: + with open( + "../../content/datasets/biotools_bioschemas_files_list.txt", + "w", + encoding="utf-8", + ) as f: for tool in tools: f.write(f"{tool}\n") return tools @@ -31,7 +35,7 @@ def process_tools(): rdf_graph.serialize( format="turtle", - destination="../../content/datasets/bioschemas-dump.ttl" + destination="../../content/datasets/bioschemas-dump.ttl", # destination=os.path.join(directory, tpe_id + "bioschemas.jsonld") ) 
diff --git a/bioschemas-gen/biotools_to_bioschemas.py b/bioschemas-gen/biotools_to_bioschemas.py index 1ee05df..1d212ba 100644 --- a/bioschemas-gen/biotools_to_bioschemas.py +++ b/bioschemas-gen/biotools_to_bioschemas.py @@ -73,19 +73,21 @@ def rdfize(json_entry): if "typeEntity" in credit.keys() and credit["typeEntity"]: if "Funding agency" in credit["typeEntity"]: sType = "sc:Organization" - if "orcidid" in credit.keys() and credit["orcidid"] != None: - if not "funder" in entry.keys(): + if "orcidid" in credit.keys() and credit["orcidid"] is not None: + if "funder" not in entry.keys(): entry["funder"] = { "@id": credit["orcidid"], "@type": sType, } else: - entry["funder"].append({ - "@id": credit["orcidid"], - "@type": sType, - }) - elif "name" in credit.keys() and credit["name"] != None: - if not "funder" in entry.keys(): + entry["funder"].append( + { + "@id": credit["orcidid"], + "@type": sType, + } + ) + elif "name" in credit.keys() and credit["name"] is not None: + if "funder" not in entry.keys(): entry["funder"] = [credit["name"]] else: entry["funder"].append(credit["name"]) @@ -100,25 +102,30 @@ def rdfize(json_entry): sType = "sc:Person" else: sType = "sc:Organization" - if "orcidid" in credit.keys() and credit["orcidid"] != None: - if not "author" in entry.keys(): + if ( + "orcidid" in credit.keys() + and credit["orcidid"] is not None + ): + if "author" not in entry.keys(): entry["author"] = { "@id": credit["orcidid"], "@type": sType, } else: - entry["author"].append({ - "@id": credit["orcidid"], - "@type": sType, - }) - elif "name" in credit.keys() and credit["name"] != None: - if not "author" in entry.keys(): + entry["author"].append( + { + "@id": credit["orcidid"], + "@type": sType, + } + ) + elif "name" in credit.keys() and credit["name"] is not None: + if "author" not in entry.keys(): entry["author"] = [credit["name"]] else: entry["author"].append(credit["name"]) else: - if "name" in credit.keys() and credit["name"] != None: - if not "author" in entry.keys(): + if "name" in credit.keys() and credit["name"] is not None: + if "author" not in entry.keys(): entry["author"] = [credit["name"]] else: entry["author"].append(credit["name"]) @@ -132,8 +139,11 @@ def rdfize(json_entry): else: sType = "sc:Organization" - if "orcidid" in credit.keys() and credit["orcidid"] != None: - if not "provider" in entry.keys(): + if ( + "orcidid" in credit.keys() + and credit["orcidid"] is not None + ): + if "provider" not in entry.keys(): entry["provider"] = { "@id": credit["orcidid"], "@type": sType, @@ -141,18 +151,20 @@ def rdfize(json_entry): # if 'name' in credit.keys() and credit['name'] != None: # entry['author_person']['name'] = credit['name'] else: - entry["provider"].append({ - "@id": credit["orcidid"], - "@type": sType, - }) - elif "name" in credit.keys() and credit["name"] != None: - if not "provider" in entry.keys(): + entry["provider"].append( + { + "@id": credit["orcidid"], + "@type": sType, + } + ) + elif "name" in credit.keys() and credit["name"] is not None: + if "provider" not in entry.keys(): entry["provider"] = [credit["name"]] else: entry["provider"].append(credit["name"]) else: - if "name" in credit.keys() and credit["name"] != None: - if not "provider" in entry.keys(): + if "name" in credit.keys() and credit["name"] is not None: + if "provider" not in entry.keys(): entry["provider"] = [credit["name"]] else: entry["provider"].append(credit["name"]) @@ -167,25 +179,30 @@ def rdfize(json_entry): else: sType = "sc:Organization" - if "orcidid" in credit.keys() and 
credit["orcidid"] != None: - if not "contributor" in entry.keys(): + if ( + "orcidid" in credit.keys() + and credit["orcidid"] is not None + ): + if "contributor" not in entry.keys(): entry["contributor"] = { "@id": credit["orcidid"], "@type": sType, } else: - entry["contributor"].append({ - "@id": credit["orcidid"], - "@type": sType, - }) - elif "name" in credit.keys() and credit["name"] != None: - if not "contributor" in entry.keys(): + entry["contributor"].append( + { + "@id": credit["orcidid"], + "@type": sType, + } + ) + elif "name" in credit.keys() and credit["name"] is not None: + if "contributor" not in entry.keys(): entry["contributor"] = [credit["name"]] else: entry["contributor"].append(credit["name"]) else: - if "name" in credit.keys() and credit["name"] != None: - if not "contributor" in entry.keys(): + if "name" in credit.keys() and credit["name"] is not None: + if "contributor" not in entry.keys(): entry["contributor"] = [credit["name"]] else: entry["contributor"].append(credit["name"]) @@ -200,25 +217,30 @@ def rdfize(json_entry): else: sType = "sc:Organization" - if "orcidid" in credit.keys() and credit["orcidid"] != None: - if not "primaryContact" in entry.keys(): + if ( + "orcidid" in credit.keys() + and credit["orcidid"] is not None + ): + if "primaryContact" not in entry.keys(): entry["primaryContact"] = { "@id": credit["orcidid"], "@type": sType, } else: - entry["primaryContact"].append({ - "@id": credit["orcidid"], - "@type": sType, - }) - elif "name" in credit.keys() and credit["name"] != None: - if not "primaryContact" in entry.keys(): + entry["primaryContact"].append( + { + "@id": credit["orcidid"], + "@type": sType, + } + ) + elif "name" in credit.keys() and credit["name"] is not None: + if "primaryContact" not in entry.keys(): entry["primaryContact"] = [credit["name"]] else: entry["primaryContact"].append(credit["name"]) else: - if "name" in credit.keys() and credit["name"] != None: - if not "primaryContact" in entry.keys(): + if "name" in credit.keys() and credit["name"] is not None: + if "primaryContact" not in entry.keys(): entry["primaryContact"] = [credit["name"]] else: entry["primaryContact"].append(credit["name"]) @@ -226,20 +248,20 @@ def rdfize(json_entry): if entry.get("publication"): for publication in entry["publication"]: if publication.get("pmid"): - if not "hasPublication" in entry.keys(): + if "hasPublication" not in entry.keys(): # entry['hasPublication'] = [{"@id": 'pubmed:' + publication['pmid']}] entry["hasPublication"] = ["pubmed:" + publication["pmid"]] else: # entry['hasPublication'].append({"@id": 'pubmed:' + publication['pmid']}) entry["hasPublication"].append("pubmed:" + publication["pmid"]) if publication.get("pmcid"): - if not "hasPublication" in entry.keys(): + if "hasPublication" not in entry.keys(): entry["hasPublication"] = ["pmcid:" + publication["pmcid"]] else: entry["hasPublication"].append("pmcid:" + publication["pmcid"]) if publication.get("doi"): if not ("<" in publication["doi"] or ">" in publication["doi"]): - if not "hasPublication" in entry.keys(): + if "hasPublication" not in entry.keys(): entry["hasPublication"] = [ { "@id": "https://doi.org/" + publication["doi"], @@ -247,16 +269,18 @@ def rdfize(json_entry): } ] else: - entry["hasPublication"].append({ - "@id": "https://doi.org/" + publication["doi"], - "@type": "sc:CreativeWork", - }) + entry["hasPublication"].append( + { + "@id": "https://doi.org/" + publication["doi"], + "@type": "sc:CreativeWork", + } + ) if entry.get("function"): for item in entry["function"]: 
if item.get("operation"): for op in item["operation"]: - if not "hasOperation" in entry.keys(): + if "hasOperation" not in entry.keys(): entry["hasOperation"] = [{"@id": op["uri"]}] else: entry["hasOperation"].append({"@id": op["uri"]}) @@ -272,7 +296,7 @@ def rdfize(json_entry): if input.get("format"): for f in input["format"]: input_object["encodingFormat"].append({"@id": f["uri"]}) - if not "hasInputData" in entry.keys(): + if "hasInputData" not in entry.keys(): entry["hasInputData"] = [input_object] else: entry["hasInputData"].append(input_object) @@ -287,24 +311,24 @@ def rdfize(json_entry): } if output.get("format"): for f in output["format"]: - output_object["encodingFormat"].append({ - "@id": f["uri"] - }) - if not "hasOutputData" in entry.keys(): + output_object["encodingFormat"].append( + {"@id": f["uri"]} + ) + if "hasOutputData" not in entry.keys(): entry["hasOutputData"] = [output_object] else: entry["hasOutputData"].append(output_object) if entry.get("topic"): for item in entry["topic"]: - if not "hasTopic" in entry.keys(): + if "hasTopic" not in entry.keys(): entry["hasTopic"] = [{"@id": item["uri"]}] else: entry["hasTopic"].append({"@id": item["uri"]}) if entry.get("cost"): for item in entry["cost"]: - if not "isAccessibleForFree" in entry.keys(): + if "isAccessibleForFree" not in entry.keys(): if "Free" in entry["cost"]: entry["isAccessibleForFree"] = True else: @@ -315,17 +339,17 @@ def rdfize(json_entry): if "type" in item.keys() and item["type"]: item["url"] = item["url"].replace("|", "%7C") if "API" in item["type"]: - if not "hasApiDoc" in entry.keys(): + if "hasApiDoc" not in entry.keys(): entry["hasApiDoc"] = [{"@id": item["url"]}] else: entry["hasApiDoc"].append({"@id": item["url"]}) elif "Terms" in item["type"]: - if not "hasTermsOfUse" in entry.keys(): + if "hasTermsOfUse" not in entry.keys(): entry["hasTermsOfUse"] = [{"@id": item["url"]}] else: entry["hasTermsOfUse"].append({"@id": item["url"]}) else: - if not "hasGenDoc" in entry.keys(): + if "hasGenDoc" not in entry.keys(): entry["hasGenDoc"] = [{"@id": item["url"]}] else: entry["hasGenDoc"].append({"@id": item["url"]}) diff --git a/bioschemas-gen/debian_bioschemas_dump.py b/bioschemas-gen/debian_bioschemas_dump.py index 931f8d1..d766370 100644 --- a/bioschemas-gen/debian_bioschemas_dump.py +++ b/bioschemas-gen/debian_bioschemas_dump.py @@ -28,7 +28,7 @@ def process_tools(): rdf_graph.serialize( format="turtle", - destination="../../content/datasets/debian-dump.ttl" + destination="../../content/datasets/debian-dump.ttl", # destination=os.path.join(directory, tpe_id + "bioschemas.jsonld") ) diff --git a/bioschemas-gen/debian_to_bioschemas.py b/bioschemas-gen/debian_to_bioschemas.py index dc76125..3e3dffe 100644 --- a/bioschemas-gen/debian_to_bioschemas.py +++ b/bioschemas-gen/debian_to_bioschemas.py @@ -9,35 +9,42 @@ def getBiotoolsIdFromDebian(debian_data) -> str: """ Get the bio.tools ID from the debian data. """ - if 'registries' in debian_data.keys(): - for r in debian_data['registries']: - if 'name' in r.keys() and r['name'] == 'bio.tools': - return r['entry'] + if "registries" in debian_data.keys(): + for r in debian_data["registries"]: + if "name" in r.keys() and r["name"] == "bio.tools": + return r["entry"] return None + def getCitationFromDebian(debian_data) -> list: """ Get DOIs from the debian data. 
""" res = [] - if 'bib' in debian_data.keys(): - for entry in debian_data['bib']: - if 'key' in entry.keys() and 'value' in entry.keys(): - res.append(entry['key'] + ":" + entry['value']) + if "bib" in debian_data.keys(): + for entry in debian_data["bib"]: + if "key" in entry.keys() and "value" in entry.keys(): + res.append(entry["key"] + ":" + entry["value"]) return res + def getDescriptionFromDebian(debian_data) -> str: """ Get tool descriptions from the debian data. """ - if 'descr' in debian_data.keys(): - for entry in debian_data['descr']: - if 'language' in entry.keys() and 'description' in entry.keys() and entry['language'] == "en": - return entry['description'] - # elif 'language' in entry.keys() and 'long_description' in entry.keys() and entry['language'] == "en": - # return entry['long_description'] + if "descr" in debian_data.keys(): + for entry in debian_data["descr"]: + if ( + "language" in entry.keys() + and "description" in entry.keys() + and entry["language"] == "en" + ): + return entry["description"] + # elif 'language' in entry.keys() and 'long_description' in entry.keys() and entry['language'] == "en": + # return entry['long_description'] return None + def rdfize(data) -> Graph: prefix = """ @prefix rdf: . @@ -55,46 +62,47 @@ def rdfize(data) -> Graph: description = getDescriptionFromDebian(data) try: - if "package" in data.keys() : - package_uri = f"debianmed:{data["package"]}" - triples += f'{package_uri} rdf:type schema:SoftwareApplication .\n' + if "package" in data.keys(): + package_uri = f"debianmed:{data['package']}" + triples += f"{package_uri} rdf:type schema:SoftwareApplication .\n" triples += f'{package_uri} schema:name "{data["package"]}" .\n' # if "description" in data.keys() : # triples += f'{package_uri} schema:description "{data["description"]}" .\n' - if "license" in data.keys() : + if "license" in data.keys(): triples += f'{package_uri} schema:license "{data["license"]}" .\n' - if "version" in data.keys() : + if "version" in data.keys(): triples += f'{package_uri} schema:softwareVersion "{data["version"]}" .\n' - if biotools_id : + if biotools_id: triples += f'{package_uri} spdx:builtFrom "{biotools_id}" .\n' - if description : + if description: triples += f'{package_uri} schema:description "{description}" .\n' if "homepage" in data.keys(): triples += f'{package_uri} schema:url "{data["homepage"]}" .\n' if "tags" in data.keys(): for kw in data["tags"]: - if 'tag' in kw.keys(): + if "tag" in kw.keys(): triples += f'{package_uri} schema:keywords "{kw["tag"]}" .\n' # process DOIs for doi in dois: triples += f'{package_uri} schema:citation "{doi}" .\n' # process identifiers - if 'registries' in data.keys(): - for e in data['registries']: - if 'name' in e.keys() and "entry" in e.keys(): + if "registries" in data.keys(): + for e in data["registries"]: + if "name" in e.keys() and "entry" in e.keys(): id = f"{e['name'].lower()}:{e['entry']}" triples += f'{package_uri} schema:identifier "{id}" .\n' g = Graph() - g.parse(data=prefix+"\n"+triples, format="turtle") - print(g.serialize(format='turtle')) + g.parse(data=prefix + "\n" + triples, format="turtle") + print(g.serialize(format="turtle")) return g except Exception as e: print("PARSING ERROR for:") - print(prefix+"\n"+triples) - raise(e) + print(prefix + "\n" + triples) + raise (e) + def get_biotools_files_in_repo(): tools = [] @@ -125,15 +133,14 @@ def process_tools_by_id(id="SPROUT"): temp_graph.serialize( format="json-ld", auto_compact=True, - destination=os.path.join( - directory, tpe_id + 
".debian.jsonld" - ), + destination=os.path.join(directory, tpe_id + ".debian.jsonld"), ) temp_graph.serialize( format="turtle", destination=os.path.join(directory, tpe_id + ".debian.ttl"), ) + def clean(): for data_file in glob.glob(r"../../content/data/*/*.debian.jsonld"): print(f"removing file {data_file}") @@ -142,6 +149,7 @@ def clean(): print(f"removing file {data_file}") os.remove(data_file) + def process_tools(): """ Go through all bio.tools entries and produce an RDF graph representation (BioSchemas / JSON-LD). diff --git a/bioschemas-gen/galaxy_bioschemas_dump.py b/bioschemas-gen/galaxy_bioschemas_dump.py index 7098377..5d84575 100644 --- a/bioschemas-gen/galaxy_bioschemas_dump.py +++ b/bioschemas-gen/galaxy_bioschemas_dump.py @@ -28,7 +28,7 @@ def process_tools(): rdf_graph.serialize( format="turtle", - destination="../../content/datasets/galaxy-dump.ttl" + destination="../../content/datasets/galaxy-dump.ttl", # destination=os.path.join(directory, tpe_id + "bioschemas.jsonld") ) diff --git a/bioschemas-gen/galaxy_to_bioschemas.py b/bioschemas-gen/galaxy_to_bioschemas.py index 7d9b157..e28ccaf 100644 --- a/bioschemas-gen/galaxy_to_bioschemas.py +++ b/bioschemas-gen/galaxy_to_bioschemas.py @@ -5,35 +5,37 @@ from rdflib import Graph from rdflib import ConjunctiveGraph -edam_version = 'https://github.com/edamontology/edamontology/raw/main/EDAM_dev.owl' +edam_version = "https://github.com/edamontology/edamontology/raw/main/EDAM_dev.owl" kg = ConjunctiveGraph() -kg.parse(edam_version, format='xml') +kg.parse(edam_version, format="xml") -def getEdamUrisFromLabels(edam_labels) -> list : - """ - Get EDAM URIs from EDAM labels. - """ - res = [] +def getEdamUrisFromLabels(edam_labels) -> list: + """ + Get EDAM URIs from EDAM labels. + """ + + res = [] - for lab in edam_labels: - query=""" + for lab in edam_labels: + query = """ PREFIX edam: PREFIX rdfs: SELECT ?label ?entity WHERE { ?entity rdfs:label '%s' . } - """%(lab) + """ % (lab) - q = kg.query(query) - for r in q: - # uri = r['entity'] - uri = r['entity'].rsplit('/', 1)[-1] - res.append(f'{uri}') + q = kg.query(query) + for r in q: + # uri = r['entity'] + uri = r["entity"].rsplit("/", 1)[-1] + res.append(f"{uri}") + + return res - return res def rdfize(data) -> Graph: prefix = """ @@ -49,27 +51,27 @@ def rdfize(data) -> Graph: triples = "" - name = None # OK - desc = None # OK - url = None # OK - #owner = None # Suite_owner -> author, contributor, primaryContact? - version = None # OK + name = None # OK + desc = None # OK + url = None # OK + # owner = None # Suite_owner -> author, contributor, primaryContact? + version = None # OK - biotools_id = None # OK - #biii_id = None # biii_ID - bioconda_id = None # OK + biotools_id = None # OK + # biii_id = None # biii_ID + bioconda_id = None # OK - edam_operations = [] # OK - edam_topics = [] # OK - keywords = [] # OK - #help = [] # Related_Tutorials -> many many GTN links - #workflows = [] # Related_Workflows : 'link' (ex. many WfHub or usegalaxy refs) + edam_operations = [] # OK + edam_topics = [] # OK + keywords = [] # OK + # help = [] # Related_Tutorials -> many many GTN links + # workflows = [] # Related_Workflows : 'link' (ex. many WfHub or usegalaxy refs) - #biotools_name = None - #biotools_desc = None + # biotools_name = None + # biotools_desc = None - #Tool_ids = [] # see for ex. bedtools suite ++ - #Suite_source + # Tool_ids = [] # see for ex. 
bedtools suite ++ + # Suite_source if "Suite_ID" in data.keys(): name = data["Suite_ID"] @@ -82,11 +84,13 @@ def rdfize(data) -> Graph: if "bio.tool_ID" in data.keys(): biotools_id = "biotools:" + data["bio.tool_ID"] if "Suite_conda_package" in data.keys() and data["Suite_conda_package"]: - bioconda_id = "bioconda:" + data["Suite_conda_package"].strip() # see pharokka package bioconda ID + bioconda_id = ( + "bioconda:" + data["Suite_conda_package"].strip() + ) # see pharokka package bioconda ID if "EDAM_operations" in data.keys(): - #for operation in data["EDAM_operations"]: - #op = getEdamUrisFromSingleLabel(operation) + # for operation in data["EDAM_operations"]: + # op = getEdamUrisFromSingleLabel(operation) ope = getEdamUrisFromLabels(data["EDAM_operations"]) for o in ope: edam_operations.append("edam:" + o) @@ -100,19 +104,19 @@ def rdfize(data) -> Graph: for keyword in data["ToolShed_categories"]: keywords.append(keyword) - #if "bio.tool_description" in data.keys(): - #biotools_desc = data["bio.tool_description"] - #if "bio.tool_name" in data.keys(): - #name = data["bio.tool_name"] + # if "bio.tool_description" in data.keys(): + # biotools_desc = data["bio.tool_description"] + # if "bio.tool_name" in data.keys(): + # name = data["bio.tool_name"] try: if name: - package_uri = f'galaxy:{name}' - triples += f'{package_uri} rdf:type schema:SoftwareApplication .\n' + package_uri = f"galaxy:{name}" + triples += f"{package_uri} rdf:type schema:SoftwareApplication .\n" triples += f'{package_uri} schema:name "{name}" .\n' if desc: - triples += f'''{package_uri} schema:description """{desc}""" .\n''' # see package infernal for ex. of special characters issue + triples += f'''{package_uri} schema:description """{desc}""" .\n''' # see package infernal for ex. 
of special characters issue if url: triples += f'{package_uri} schema:url "{url}" .\n' if version: @@ -125,21 +129,22 @@ def rdfize(data) -> Graph: triples += f'{package_uri} schema:identifier "{bioconda_id}" .\n' for ope in edam_operations: - triples += f'{package_uri} schema:featureList {ope} .\n' + triples += f"{package_uri} schema:featureList {ope} .\n" for top in edam_topics: triples += f'{package_uri} schema:applicationSubCategory "{top}" .\n' for key in keywords: triples += f'{package_uri} schema:keywords "{key}" .\n' g = Graph() - g.parse(data=prefix+"\n"+triples, format="turtle") - print(g.serialize(format='turtle')) + g.parse(data=prefix + "\n" + triples, format="turtle") + print(g.serialize(format="turtle")) return g except Exception as e: print("PARSING ERROR for:") - print(prefix+"\n"+triples) - raise(e) + print(prefix + "\n" + triples) + raise (e) + def get_galaxy_files_in_repo(): tools = [] @@ -157,7 +162,7 @@ def process_tools_by_id(id="SPROUT"): for tool_file in tool_files: if id in tool_file: path = Path(tool_file) - # #tool = yaml.safe_load(path.read_text(encoding="utf-8")) + # #tool = yaml.safe_load(path.read_text(encoding="utf-8")) tool = json.loads(path.read_text(encoding="utf-8")) tool_id = None diff --git a/biotools-import/import.py b/biotools-import/import.py index edf3d4f..2ebea92 100644 --- a/biotools-import/import.py +++ b/biotools-import/import.py @@ -1,6 +1,6 @@ import json +from json import JSONDecodeError import os -import sys import glob import argparse @@ -10,6 +10,7 @@ BIOTOOLS_DOMAIN = "https://bio.tools" SSL_VERIFY = True + def clean(): for data_file in glob.glob(r"data/*/*.biotools.json"): os.remove(data_file) @@ -31,14 +32,14 @@ def retrieve(filters=None): f"{BIOTOOLS_DOMAIN}/api/tool/", params=parameters, headers={"Accept": "application/json"}, - verify=SSL_VERIFY + verify=SSL_VERIFY, ) try: entry = response.json() - except JSONDecodeError as e: - print("Json decode error for " + str(req.data.decode("utf-8"))) + except JSONDecodeError: + print("Json decode error for " + str(response.content.decode("utf-8"))) break - has_next_page = entry["next"] != None + has_next_page = entry["next"] is not None for tool in entry["list"]: tool_id = tool["biotoolsID"] @@ -46,11 +47,20 @@ def retrieve(filters=None): directory = os.path.join("data", tpe_id) if not os.path.isdir(directory): os.mkdir(directory) - with open(os.path.join(directory, tpe_id + ".biotools.json"), "w") as write_file: - drop_false = lambda path, key, value: bool(value) + with open( + os.path.join(directory, tpe_id + ".biotools.json"), "w" + ) as write_file: + + def drop_false(path, key, value): + return bool(value) + tool_cleaned = remap(tool, visit=drop_false) json.dump( - tool_cleaned, write_file, sort_keys=True, indent=4, separators=(",", ": ") + tool_cleaned, + write_file, + sort_keys=True, + indent=4, + separators=(",", ": "), ) nb_tools += 1 print(f"import tool #{nb_tools}: {tool_id} in folder {directory}") diff --git a/debian-med-import/import.py b/debian-med-import/import.py index 95749db..2f5901c 100644 --- a/debian-med-import/import.py +++ b/debian-med-import/import.py @@ -1,6 +1,5 @@ # coding: utf-8 import argparse -import json import logging from pathlib import Path import os @@ -11,6 +10,7 @@ yaml = YAML() + def clean(base_path): import_directory = os.path.join(base_path, "imports", "debian-med") biotools_directory = os.path.join(base_path, "data") @@ -20,19 +20,18 @@ def clean(base_path): for data_file in Path(biotools_directory).glob("*/*.debian.yaml"): os.remove(data_file) + def 
process_data(base_path):
     """Query UDD for debian-med packages and write them to YAML files in `import/debian-med`, plus in `data` if a biotools cross-link exists."""
     import_directory = os.path.join(base_path, "imports", "debian-med")
     biotools_directory = os.path.join(base_path, "data")
     rootLogger = logging.getLogger()
     rootLogger.setLevel(logging.INFO)
-    fileHandler = logging.FileHandler('debian_import.log')
+    fileHandler = logging.FileHandler("debian_import.log")
     rootLogger.addHandler(fileHandler)
     consoleHandler = logging.StreamHandler()
     rootLogger.addHandler(consoleHandler)
-    rootLogger.info(
-        "starting debian med metadata import from UDD..."
-    )
+    rootLogger.info("starting debian med metadata import from UDD...")
     connection = psycopg2.connect(
         user="udd-mirror",
         password="udd-mirror",
@@ -121,9 +120,7 @@ def process_data(base_path):
         package = item["package"]
         release = item["release"]
         description_md5 = item["description_md5"]
-        rootLogger.info(
-            f"processing package {package}"
-        )
+        rootLogger.info(f"processing package {package}")
         query_registries = f"select array_to_json(array_agg(t)) from (select entry, name from registry where source = '{package_source}') t"
         cursor_loop.execute(query_registries)
         registries_data = cursor_loop.fetchone()[0]
@@ -142,15 +139,17 @@ def process_data(base_path):
         if package == package_source:
             if biotools is None:
                 rootLogger.warning(f"package '{package_source}' has no bio.tools ref.")
-                biotools_xref = False
+                biotools_xref = False
             else:
-                biotools_package_directory = os.path.join(biotools_directory, biotools.lower())
+                biotools_package_directory = os.path.join(
+                    biotools_directory, biotools.lower()
+                )
                 p = Path(biotools_package_directory)
                 if not p.is_dir():
                     rootLogger.warning(
                         f"package '{package_source}' has a biotools ref ('{biotools}') but no folder exists."
                     )
-                    biotools_xref = False
+                    biotools_xref = False
         else:
             rootLogger.warning(
                 f"package name '{package}' is different from package source name '{package_source}', skipping."
             )
@@ -185,20 +184,23 @@ def process_data(base_path):
             cursor_loop.execute(query_descr)
             descr_data = cursor_loop.fetchone()[0]
             item["descr"] = descr_data
-            drop_false = lambda path, key, value: bool(value)
+
+            def drop_false(path, key, value):
+                return bool(value)
+
             item = remap(item, visit=drop_false)
             file_path = os.path.join(import_directory, f"{item['package']}.debian.yaml")
             with open(file_path, "w") as fh:
                 yaml.dump(item, fh)
             if biotools_xref:
-                file_path = os.path.join(biotools_package_directory, f"{item['package']}.debian.yaml")
+                file_path = os.path.join(
+                    biotools_package_directory, f"{item['package']}.debian.yaml"
+                )
                 with open(file_path, "w") as fh:
                     yaml.dump(item, fh)
     cursor_loop.close()
     connection.close()
-    rootLogger.info(
-        "finished debian med metadata import from UDD."
-    )
+    rootLogger.info("finished debian med metadata import from UDD.")
 
 
 def get_parser():
diff --git a/galaxytool-import/galaxytool-import.py b/galaxytool-import/galaxytool-import.py
index 6ebc3b2..e5ee60d 100644
--- a/galaxytool-import/galaxytool-import.py
+++ b/galaxytool-import/galaxytool-import.py
@@ -3,7 +3,6 @@
 import os
 
 import requests
-from boltons.iterutils import remap
 
 GALAXY_ALL_TOOLS_METADATA = "https://raw.githubusercontent.com/galaxyproject/galaxy_codex/refs/heads/main/communities/all/resources/tools.json"
 GALAXY_ALL_WORKFLOWS_METADATA = "https://raw.githubusercontent.com/galaxyproject/galaxy_codex/refs/heads/main/communities/all/resources/workflows.json"
@@ -91,9 +90,10 @@ def retrieve():
             directory = os.path.join("data", tpe_id)
             if os.path.isdir(directory):
                 data_save_path = os.path.join(directory, f"{tpe_id}.galaxy.json")
-                with open(save_path, "rb") as f_src, open(
-                    data_save_path, "wb"
-                ) as f_dst:
+                with (
+                    open(save_path, "rb") as f_src,
+                    open(data_save_path, "wb") as f_dst,
+                ):
                     f_dst.write(f_src.read())
                 print(f"copy tool #{nb_tools} to data folder: {tpe_id}")
diff --git a/openebench-import/openebench-import.py b/openebench-import/openebench-import.py
index ff3f7b1..449d0b2 100644
--- a/openebench-import/openebench-import.py
+++ b/openebench-import/openebench-import.py
@@ -39,9 +39,9 @@ def main():
         oeb_id = tokens[0] if len(tokens) == 1 else tokens[1]
         tool_dir = TOOLS_CONTENT_PATH + oeb_id
 
-        if tool_dir != None and os.path.isdir(tool_dir):
+        if tool_dir is not None and os.path.isdir(tool_dir):
             metrics_list = git_metrics.get(tool_dir)
-            if metrics_list != None:
+            if metrics_list is not None:
                 metrics_list.append(m)
             else:
                 git_metrics[tool_dir] = [m]
@@ -60,7 +60,7 @@ def get_metrics():
         data = res.read()
         return json.loads(data)
 
-    print("error reading metrics", req)
+    print("error reading metrics", res.getcode())
 
 
 if __name__ == "__main__":
diff --git a/scripts/biotools-pullrequest-bot/biotools_pullrequest_analyzer.py b/scripts/biotools-pullrequest-bot/biotools_pullrequest_analyzer.py
index 478887a..6225e66 100644
--- a/scripts/biotools-pullrequest-bot/biotools_pullrequest_analyzer.py
+++ b/scripts/biotools-pullrequest-bot/biotools_pullrequest_analyzer.py
@@ -8,16 +8,27 @@ class readable_dir(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         prospective_dir = values
         if not os.path.isdir(prospective_dir):
-            raise argparse.ArgumentTypeError("readable_dir:{0} is not a valid path".format(prospective_dir))
+            raise argparse.ArgumentTypeError(
+                "readable_dir:{0} is not a valid path".format(prospective_dir)
+            )
         if os.access(prospective_dir, os.R_OK):
             setattr(namespace, self.dest, prospective_dir)
         else:
-            raise argparse.ArgumentTypeError("readable_dir:{0} is not a readable dir".format(prospective_dir))
+            raise argparse.ArgumentTypeError(
+                "readable_dir:{0} is not a readable dir".format(prospective_dir)
+            )
 
 
-parser = argparse.ArgumentParser(description='this script will return the biotools difference between branches',
-                                 fromfile_prefix_chars="@")
-parser.add_argument("path", help="path to metadata dir, e.g. /content/data/", type=str, action=readable_dir)
+parser = argparse.ArgumentParser(
+    description="this script will return the biotools difference between branches",
+    fromfile_prefix_chars="@",
+)
+parser.add_argument(
+    "path",
+    help="path to metadata dir, e.g. /content/data/",
+    type=str,
+    action=readable_dir,
+)
 parser.add_argument("branch1", help="name of branch 1", type=str)
 parser.add_argument("branch2", help="name of branch 2", type=str)
 args = parser.parse_args()
@@ -27,10 +38,10 @@ def get_changeg_biotools(branch1, branch2, path):
     added_count = 0
     modified_count = 0
     deleted_count = 0
-    format = '--name-status'
+    format = "--name-status"
     files = []
     g = git.Git(path)
-    differ = g.diff('%s..%s' % (branch1, branch2), format).split("\n")
+    differ = g.diff("%s..%s" % (branch1, branch2), format).split("\n")
     for line in differ:
         if len(line):
             files.append(line)
@@ -40,17 +51,25 @@ def get_changeg_biotools(branch1, branch2, path):
         file = os.path.basename(output_duple[1])
         dirname = os.path.basename(os.path.dirname(output_duple[1]))
 
-        if (dirname + '.json' == file):
-            if (output_duple[0] == 'A'):
+        if dirname + ".json" == file:
+            if output_duple[0] == "A":
                 added_count += 1
-            elif (output_duple[0] == 'D'):
+            elif output_duple[0] == "D":
                 deleted_count += 1
-            elif (output_duple[0] == 'M'):
+            elif output_duple[0] == "M":
                 modified_count += 1
 
-    return {'modified': modified_count, 'added': added_count, 'deleted': deleted_count}
+    return {"modified": modified_count, "added": added_count, "deleted": deleted_count}
 
 
 statistics = get_changeg_biotools(args.branch1, args.branch2, args.path)
 message = " Differences in biotools between this PR and original bio-tools/content master branch! \n"
 message += " BioTools affected:\n"
-print(message, "added:", statistics['added'], "modified:", statistics['modified'], "deleted:", statistics['deleted'])
+print(
+    message,
+    "added:",
+    statistics["added"],
+    "modified:",
+    statistics["modified"],
+    "deleted:",
+    statistics["deleted"],
+)
diff --git a/scripts/doi-collector/doi_collector.py b/scripts/doi-collector/doi_collector.py
index 33eee98..d7b1f7d 100644
--- a/scripts/doi-collector/doi_collector.py
+++ b/scripts/doi-collector/doi_collector.py
@@ -9,23 +9,36 @@ class readable_dir(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         prospective_dir = values
         if not os.path.isdir(prospective_dir):
-            raise argparse.ArgumentTypeError("readable_dir:{0} is not a valid path".format(prospective_dir))
+            raise argparse.ArgumentTypeError(
+                "readable_dir:{0} is not a valid path".format(prospective_dir)
+            )
         if os.access(prospective_dir, os.R_OK):
             setattr(namespace, self.dest, prospective_dir)
         else:
-            raise argparse.ArgumentTypeError("readable_dir:{0} is not a readable dir".format(prospective_dir))
-
-parser = argparse.ArgumentParser(description='test', fromfile_prefix_chars="@")
-parser.add_argument("path", help="path to metadata dir, e.g. /content/data/", type=str, action=readable_dir)
+            raise argparse.ArgumentTypeError(
+                "readable_dir:{0} is not a readable dir".format(prospective_dir)
+            )
+
+
+parser = argparse.ArgumentParser(description="test", fromfile_prefix_chars="@")
+parser.add_argument(
+    "path",
+    help="path to metadata dir, e.g. /content/data/",
+    type=str,
+    action=readable_dir,
+)
 args = parser.parse_args()
 
 
 def enrich_dois(path):
     dirname = os.path.basename(os.path.normpath(path))
-    file_types = {'json': '.json', 'yaml': '.yaml', 'debian': '.debian.yaml'}
-    files = ["bioconda_" + dirname + file_types['yaml'], dirname + file_types['json'],
-             dirname + file_types['debian']]
-    files = [path + '/' + file for file in files if os.path.exists(path + '/' + file)]
+    file_types = {"json": ".json", "yaml": ".yaml", "debian": ".debian.yaml"}
+    files = [
+        "bioconda_" + dirname + file_types["yaml"],
+        dirname + file_types["json"],
+        dirname + file_types["debian"],
+    ]
+    files = [path + "/" + file for file in files if os.path.exists(path + "/" + file)]
 
     all_doies = set()
     json_dois = set()
@@ -35,8 +48,8 @@ def enrich_dois(path):
     non_primary_dois = set()
 
     def parse_yaml(file):
-        with open(file, 'r') as stream:
-            return yaml.safe_load(stream)
+        with open(file, "r") as stream:
+            return yaml.safe_load(stream)
 
     def parse_json(file):
         with open(file) as f:
@@ -44,14 +57,22 @@ def parse_json(file):
 
     def extract_doi_from_json(parsed_json):
         # if publication doesn't exist, return empty array
-        if 'publication' in parsed_json:
-            publications = parsed_json['publication']
+        if "publication" in parsed_json:
+            publications = parsed_json["publication"]
             dois = set()
             for publication in publications:
-                if ('type' in publication and 'doi' in publication and publication['type'] == 'Primary'):
-                    dois.add(publication['doi'])
-                elif ('type' in publication and 'doi' in publication and publication['type'] != 'Primary'):
-                    non_primary_dois.add(publication['doi'])
+                if (
+                    "type" in publication
+                    and "doi" in publication
+                    and publication["type"] == "Primary"
+                ):
+                    dois.add(publication["doi"])
+                elif (
+                    "type" in publication
+                    and "doi" in publication
+                    and publication["type"] != "Primary"
+                ):
+                    non_primary_dois.add(publication["doi"])
             return dois
         else:
             return []
@@ -59,15 +80,31 @@ def extract_doi_from_json(parsed_json):
     def get_doi_identifiers_from_yaml(parsed_yaml, isDebian=False):
         if isDebian:
             # if doi entry exist return it, otherwise empty array
-            print([item['value'] for item in parsed_yaml.get('bib', []) if item["key"]=="doi"])
-            return [item['value'] for item in parsed_yaml.get('bib', []) if item["key"]=="doi"]
-        if 'identifiers' in parsed_yaml:
-            return filter(lambda identifier: 'doi' in identifier, parsed_yaml['identifiers'])
+            print(
+                [
+                    item["value"]
+                    for item in parsed_yaml.get("bib", [])
+                    if item["key"] == "doi"
+                ]
+            )
+            return [
+                item["value"]
+                for item in parsed_yaml.get("bib", [])
+                if item["key"] == "doi"
+            ]
+        if "identifiers" in parsed_yaml:
+            return filter(
+                lambda identifier: "doi" in identifier, parsed_yaml["identifiers"]
+            )
 
     def extract_doi_from_bioconda_yaml(parsed_yaml):
         # return it as list, so we can use len()
         return list(
-            map(lambda identifier_doi: identifier_doi.split(':')[1], get_doi_identifiers_from_yaml(parsed_yaml)))
+            map(
+                lambda identifier_doi: identifier_doi.split(":")[1],
+                get_doi_identifiers_from_yaml(parsed_yaml),
+            )
+        )
 
     def extract_doi_from_debian(parsed_yaml):
         dois = list(get_doi_identifiers_from_yaml(parsed_yaml, True))
@@ -79,36 +116,36 @@ def extract_doi_from_debian(parsed_yaml):
     def write_dois_json(json_dois, file):
         parsed_json = parse_json(file)
         json_dois = json_dois.difference(non_primary_dois)
-        if 'publication' not in parsed_json:
-            parsed_json['publication'] = [{'doi': json_dois.pop(), 'type': 'Primary'}]
+        if "publication" not in parsed_json:
+            parsed_json["publication"] = [{"doi": json_dois.pop(), "type": "Primary"}]
 
-        publications = parsed_json['publication']
+        publications = parsed_json["publication"]
         for absent_doi in json_dois:
-            publications.append({'doi': absent_doi, 'type': 'Primary'})
-        parsed_json['publication'] = publications
-        with open(file, 'w', encoding='utf-8') as outfile:
+            publications.append({"doi": absent_doi, "type": "Primary"})
+        parsed_json["publication"] = publications
+        with open(file, "w", encoding="utf-8") as outfile:
             json.dump(parsed_json, outfile, indent=4)
 
     def write_dois_bioconda(dois, file):
         parsed_yaml = parse_yaml(file)
         for doi in dois:
-            parsed_yaml['identifiers'].append('doi:' + str(doi))
-        with open(file, 'w') as outfile:
+            parsed_yaml["identifiers"].append("doi:" + str(doi))
+        with open(file, "w") as outfile:
             yaml.safe_dump(parsed_yaml, outfile, default_flow_style=False)
 
     def write_dois_debian(dois, file):
         parsed_yaml = parse_yaml(file)
-        identifiers = parsed_yaml['identifiers']
+        identifiers = parsed_yaml["identifiers"]
         # if doi doesn't exist
-        if 'doi' not in identifiers:
-            identifiers['doi'] = [dois.pop()]
+        if "doi" not in identifiers:
+            identifiers["doi"] = [dois.pop()]
         for doi in dois:
-            identifiers['doi'].append(doi)
+            identifiers["doi"].append(doi)
 
-        parsed_yaml['identifiers'] = identifiers
+        parsed_yaml["identifiers"] = identifiers
 
-        with open(file, 'w') as outfile:
+        with open(file, "w") as outfile:
             # do something fancy like indentation/mapping/sequence/offset
             yaml.safe_dump(parsed_yaml, outfile, default_flow_style=False)
 
@@ -119,12 +156,12 @@ def enrich_files(method, dois, file):
     for file in files:
         dois = None
 
-        if file.endswith(file_types['json']):
+        if file.endswith(file_types["json"]):
             parsed_json = parse_json(file)
             dois = extract_doi_from_json(parsed_json)
             json_dois = dois
 
-        elif file.endswith(file_types['debian']):
+        elif file.endswith(file_types["debian"]):
             try:
                 parsed_yaml = parse_yaml(file)
             except yaml.YAMLError as exc:
@@ -135,7 +172,7 @@ def enrich_files(method, dois, file):
             dois = extract_doi_from_debian(parsed_yaml)
             debian_dois = dois
 
-        elif file.endswith(file_types['yaml']):
+        elif file.endswith(file_types["yaml"]):
             parsed_yaml = parse_yaml(file)
             dois = extract_doi_from_bioconda_yaml(parsed_yaml)
             bioconda_dois = dois
@@ -143,13 +180,14 @@ def enrich_files(method, dois, file):
 
     if len(all_doies) > 0:
         for file in files:
-            if file.endswith(file_types['json']):
+            if file.endswith(file_types["json"]):
                 enrich_files(write_dois_json, json_dois, file)
-            elif file.endswith(file_types['debian']):
+            elif file.endswith(file_types["debian"]):
                 enrich_files(write_dois_debian, debian_dois, file)
-            elif file.endswith(file_types['yaml']):
+            elif file.endswith(file_types["yaml"]):
                 enrich_files(write_dois_bioconda, bioconda_dois, file)
 
+
 [enrich_dois(f.path) for f in os.scandir(args.path) if f.is_dir()]
diff --git a/scripts/files-validator/debian_validator.py b/scripts/files-validator/debian_validator.py
index a0e9152..68fbeca 100644
--- a/scripts/files-validator/debian_validator.py
+++ b/scripts/files-validator/debian_validator.py
@@ -8,31 +8,41 @@ class readable_dir(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         prospective_dir = values
         if not os.path.isdir(prospective_dir):
-            raise argparse.ArgumentTypeError("readable_dir:{0} is not a valid path".format(prospective_dir))
+            raise argparse.ArgumentTypeError(
+                "readable_dir:{0} is not a valid path".format(prospective_dir)
+            )
         if os.access(prospective_dir, os.R_OK):
             setattr(namespace, self.dest, prospective_dir)
         else:
-            raise argparse.ArgumentTypeError("readable_dir:{0} is not a readable dir".format(prospective_dir))
-
-
-parser = argparse.ArgumentParser(description='test', fromfile_prefix_chars="@")
-parser.add_argument("path", help="path to metadata dir, e.g. /content/data/", type=str, action=readable_dir)
+            raise argparse.ArgumentTypeError(
+                "readable_dir:{0} is not a readable dir".format(prospective_dir)
+            )
+
+
+parser = argparse.ArgumentParser(description="test", fromfile_prefix_chars="@")
+parser.add_argument(
+    "path",
+    help="path to metadata dir, e.g. /content/data/",
+    type=str,
+    action=readable_dir,
+)
 args = parser.parse_args()
 
 
 def validate_debian_files(path):
     dirname = os.path.basename(os.path.normpath(path))
-    file = path + '/' + dirname + '.debian.yaml'
+    file = path + "/" + dirname + ".debian.yaml"
     global valid_files_counter
+
     def parse_yaml(file):
-        with open(file, 'r') as stream:
+        with open(file, "r") as stream:
             return yaml.safe_load(stream)
 
-    if os.path.exists(file) and file.endswith('.debian.yaml'):
+    if os.path.exists(file) and file.endswith(".debian.yaml"):
         try:
             parse_yaml(file)
             valid_files_counter += 1
-        except yaml.YAMLError as exc:
+        except yaml.YAMLError:
             print("unable to parse " + file, file=sys.stderr)
             invalid_files.append(file)
diff --git a/scripts/runbiotools/gh2biotools.py b/scripts/runbiotools/gh2biotools.py
index f8f1e10..8bc1763 100644
--- a/scripts/runbiotools/gh2biotools.py
+++ b/scripts/runbiotools/gh2biotools.py
@@ -6,42 +6,49 @@
 import requests
 
 from boltons.iterutils import remap
 
-HEADERS = {'Content-Type': 'application/json', 'Accept': 'application/json'}
-HOST = 'https://bio.tools'
-TOOL_API_URL = f'{HOST}/api/tool/'
+HEADERS = {"Content-Type": "application/json", "Accept": "application/json"}
+HOST = "https://bio.tools"
+TOOL_API_URL = f"{HOST}/api/tool/"
 
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO)
 
 
 def validate_upload_tool(tool, headers):
-    url = f'{HOST}/api/tool/validate/'
+    url = f"{HOST}/api/tool/validate/"
     response = requests.post(url, headers=headers, data=json.dumps(tool))
     if not response.ok:
-        logging.error(f"Error validating upload for {tool['biotoolsID']}: {response.status_code} {response.text}")
+        logging.error(
+            f"Error validating upload for {tool['biotoolsID']}: {response.status_code} {response.text}"
+        )
     return response.ok
 
+
 def upload_tool(tool, headers):
     url = TOOL_API_URL
     response = requests.post(url, headers=headers, data=json.dumps(tool))
     return response.ok
 
+
 def validate_update_tool(tool, tool_id, headers):
-    url = f'{HOST}/api/{tool_id}/validate/'
+    url = f"{HOST}/api/{tool_id}/validate/"
    response = requests.put(url, headers=headers, data=json.dumps(tool))
     if not response.ok:
-        logging.error(f"Error validating update for {tool['biotoolsID']}: {response.status_code} {response.text}")
+        logging.error(
+            f"Error validating update for {tool['biotoolsID']}: {response.status_code} {response.text}"
+        )
     return response.ok
 
+
 def update_tool(tool, headers):
     """Updates an existing tool on bio.tools."""
     url = f"{TOOL_API_URL}{tool['biotoolsID']}/"
     response = requests.put(url, headers=headers, data=json.dumps(tool))
     return response.ok
-    
+
 
 def process_single_file(file, headers):
     """
@@ -55,7 +62,7 @@ def process_single_file(file, headers):
     if not tool_id:
         logging.error(f"'biotoolsID' not found in {file}")
         return "UNKNOWN", "failed"
-    
+
     # check if tool exists
     tool_url = f"{HOST}/api/tool/{tool_id}/"
     response = requests.get(tool_url, headers=headers)
@@ -78,7 +85,7 @@ def process_single_file(file, headers):
 
     elif response.status_code == 404:
         # tool not registered, proceed with upload
-        logging.info(f'Tool {tool_id} not registered, proceeding with upload')
+        logging.info(f"Tool {tool_id} not registered, proceeding with upload")
 
         valid = validate_upload_tool(payload_dict, headers)
         if not valid:
@@ -87,13 +94,14 @@ def process_single_file(file, headers):
 
         success = upload_tool(payload_dict, headers)
         return tool_id, "uploaded" if success else "failed_upload"
-    
+
     else:
-        logging.error(f"Error retrieving tool {tool_id}: {response.status_code} {response.text}")
+        logging.error(
+            f"Error retrieving tool {tool_id}: {response.status_code} {response.text}"
+        )
         return tool_id, "failed"
 
-
 def print_summary(results):
     """Print a summary of the upload results."""
     logging.info("---------------------------")
@@ -103,56 +111,67 @@ def print_summary(results):
     logging.info(f"Tools unchanged: {len(results['unchanged'])}")
     logging.info(f"Tools failed: {len(results['failed'])}")
     logging.info(f"Tools failed validation: {len(results['failed_validation'])}")
-    logging.info(f"Tools failed upload after validation: {len(results['failed_upload'])}")
-    logging.info(f"Tools failed update after validation: {len(results['failed_update'])}")
-
-    if results['uploaded']:
+    logging.info(
+        f"Tools failed upload after validation: {len(results['failed_upload'])}"
+    )
+    logging.info(
+        f"Tools failed update after validation: {len(results['failed_update'])}"
+    )
+
+    if results["uploaded"]:
         logging.info(f"Uploaded tools: {', '.join(results['uploaded'])}")
-    if results['updated']:
+    if results["updated"]:
         logging.info(f"Updated tools: {', '.join(results['updated'])}")
-    if results['failed']:
+    if results["failed"]:
         logging.error(f"Failed tools: {', '.join(results['failed'])}")
-    if results['failed_validation']:
-        logging.error(f"Failed validation tools: {', '.join(results['failed_validation'])}")
-    if results['failed_upload']:
+    if results["failed_validation"]:
+        logging.error(
+            f"Failed validation tools: {', '.join(results['failed_validation'])}"
+        )
+    if results["failed_upload"]:
         logging.error(f"Failed upload tools: {', '.join(results['failed_upload'])}")
-    if results['failed_update']:
+    if results["failed_update"]:
         logging.error(f"Failed update tools: {', '.join(results['failed_update'])}")
 
-
 def run_upload(files):
-    token = os.environ.get('BIOTOOLS_API_TOKEN')
+    token = os.environ.get("BIOTOOLS_API_TOKEN")
     if not token:
-        logging.error('Missing BIOTOOLS_API_TOKEN. Aborting upload.')
+        logging.error("Missing BIOTOOLS_API_TOKEN. Aborting upload.")
         raise SystemExit(1)
-    
-    headers = {**HEADERS, 'Authorization': f'Token {token}'}
+
+    headers = {**HEADERS, "Authorization": f"Token {token}"}
 
     results = {
-        'uploaded': [],
-        'updated': [],
-        'unchanged': [],
-        'failed': [],
-        'failed_validation': [],
-        'failed_upload': [],
-        'failed_update': []
+        "uploaded": [],
+        "updated": [],
+        "unchanged": [],
+        "failed": [],
+        "failed_validation": [],
+        "failed_upload": [],
+        "failed_update": [],
     }
 
-    for json in files:
-        with open(json, 'r') as file:
+    for json_file in files:
+        with open(json_file, "r") as file:
             tool_id, status = process_single_file(file, headers)
             results[status].append(tool_id)
 
     print_summary(results)
 
-
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Sync changed .biotools.json files with bio.tools server')
+    parser = argparse.ArgumentParser(
+        description="Sync changed .biotools.json files with bio.tools server"
+    )
+
+    parser.add_argument(
+        "--files",
+        metavar="F",
+        type=str,
+        nargs="+",
+        help="List of changed/created .biotools.json files to process",
+    )
 
-    parser.add_argument('--files', metavar='F', type=str, nargs='+',
-                        help='List of changed/created .biotools.json files to process')
-    
     args = parser.parse_args()
 
     if args.files:
diff --git a/scripts/runbiotools/tools_upload.py b/scripts/runbiotools/tools_upload.py
index 3635467..62cd474 100644
--- a/scripts/runbiotools/tools_upload.py
+++ b/scripts/runbiotools/tools_upload.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-from datetime import datetime
 import glob
 import json
 import logging
@@ -9,53 +8,67 @@
 from bs4 import BeautifulSoup
 
 from boltons.iterutils import remap
 
-HEADERS = {'Content-Type': 'application/json', 'Accept': 'application/json'}
-HOST = 'http://localhost:8000/'
+HEADERS = {"Content-Type": "application/json", "Accept": "application/json"}
+HOST = "http://localhost:8000/"
+
 
 def login(user, password):
-    payload = {'username':user,'password':password}
-    response = requests.post(HOST+'api/rest-auth/login/', headers=HEADERS, json=payload)
-    token = response.json()['key']
+    payload = {"username": user, "password": password}
+    response = requests.post(
+        HOST + "api/rest-auth/login/", headers=HEADERS, json=payload
+    )
+    token = response.json()["key"]
     return token
 
+
 def run_upload(token, user):
     headers = HEADERS
-    headers.update({'Authorization':f'Token {token}'})
+    headers.update({"Authorization": f"Token {token}"})
     print(token)
-    url = HOST + '/api/tool/validate/'
-    #register tools
+    url = HOST + "/api/tool/validate/"
+    # register tools
     tools_ok = []
     tools_ko = []
-    for biotools_json_file in glob.glob('../content/data/*/*.biotools.json'):
+    for biotools_json_file in glob.glob("../content/data/*/*.biotools.json"):
         try:
-            logging.debug(f'uploading {biotools_json_file}...')
-            payload_dict=json.load(open(biotools_json_file))
+            logging.debug(f"uploading {biotools_json_file}...")
+            payload_dict = json.load(open(biotools_json_file))
             payload_dict["editPermission"]["authors"] = [user]
-            payload_dict = remap(payload_dict, lambda p, k, v: k != 'term')
+            payload_dict = remap(payload_dict, lambda p, k, v: k != "term")
             response = requests.post(url, headers=headers, json=payload_dict)
             response.raise_for_status()
             tools_ok.append(payload_dict["biotoolsID"])
             logging.debug(response.json())
-            logging.debug(f'done uploading {biotools_json_file}')
+            logging.debug(f"done uploading {biotools_json_file}")
         except requests.exceptions.HTTPError:
-            if response.status_code==500:
+            if response.status_code == 500:
                 soup = BeautifulSoup(response.text, "html.parser")
-                messages = "; ".join([','.join(error_el.contents) for error_el in soup.find_all(class_='exception_value')])
+                messages = "; ".join(
+                    [
+                        ",".join(error_el.contents)
+                        for error_el in soup.find_all(class_="exception_value")
+                    ]
+                )
             else:
                 messages = response.text
-            logging.error(f'error while uploading {biotools_json_file} (status {response.status_code}): {messages}')
+            logging.error(
+                f"error while uploading {biotools_json_file} (status {response.status_code}): {messages}"
+            )
             tools_ko.append(payload_dict["biotoolsID"])
-        except:
-            logging.error(f'error while uploading {biotools_json_file}', exc_info=True)
+        except Exception:
+            logging.error(f"error while uploading {biotools_json_file}", exc_info=True)
             tools_ko.append(payload_dict["biotoolsID"])
-    logging.error('Tools upload finished')
+    logging.error("Tools upload finished")
     logging.error(f"Tools OK: {len(tools_ok)}")
     logging.error(f"Tools KO: {len(tools_ko)}")
 
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Bulk upload github tools to a test bio.tools server')
-    parser.add_argument('login', type=str, help='bio.tools login')
-    parser.add_argument('password', type=str, help='bio.tools password')
+    parser = argparse.ArgumentParser(
+        description="Bulk upload github tools to a test bio.tools server"
+    )
+    parser.add_argument("login", type=str, help="bio.tools login")
+    parser.add_argument("password", type=str, help="bio.tools password")
     args = parser.parse_args()
     token = login(args.login, args.password)
     run_upload(token, args.login)
diff --git a/scripts/stats/ecosystem.py b/scripts/stats/ecosystem.py
index fa73788..a4f7e49 100644
--- a/scripts/stats/ecosystem.py
+++ b/scripts/stats/ecosystem.py
@@ -125,7 +125,7 @@ def generate_report(self, report_path=tempfile.gettempdir()):
             pretty_index.append(
                 [
                     "No " + summary_df.index.names[cell_idx]
-                    if cell == False
+                    if not cell
                     else summary_df.index.names[cell_idx]
                     for cell_idx, cell in enumerate(idx_row)
                 ]