From 9296c416b955e67f6757b8e7c9eaeb5c6a8e4e2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Correia?= Date: Tue, 15 Nov 2022 16:10:15 +0000 Subject: [PATCH 1/6] [FIX] _match_conditions returns False if smiles is None --- src/biocatalyzer/bioreactor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/biocatalyzer/bioreactor.py b/src/biocatalyzer/bioreactor.py index 730af4a..19d99e0 100644 --- a/src/biocatalyzer/bioreactor.py +++ b/src/biocatalyzer/bioreactor.py @@ -498,6 +498,8 @@ def _match_conditions(self, smiles: str): bool True if mol matches conditions to remove, False otherwise. """ + if smiles is None: + return False if '*' in smiles: return False if self._min_atom_count > 0: From aee55c9505d413081760a9bf7141776835a43710 Mon Sep 17 00:00:00 2001 From: jcorreia11 Date: Tue, 11 Feb 2025 11:05:03 +0000 Subject: [PATCH 2/6] [ADD] use pathlib --- src/biocatalyzer/bioreactor.py | 50 ++++++++++++++----------- src/biocatalyzer/clis/cli.py | 8 ++-- src/biocatalyzer/clis/cli_bioreactor.py | 8 ++-- src/biocatalyzer/io_utils/loaders.py | 24 +++++++----- src/biocatalyzer/matcher.py | 36 +++++++++--------- 5 files changed, 70 insertions(+), 56 deletions(-) diff --git a/src/biocatalyzer/bioreactor.py b/src/biocatalyzer/bioreactor.py index 0141502..cfc716a 100644 --- a/src/biocatalyzer/bioreactor.py +++ b/src/biocatalyzer/bioreactor.py @@ -1,10 +1,10 @@ import itertools import logging import multiprocessing -import os import time import uuid from typing import Union +from pathlib import Path import pandas as pd from tqdm import tqdm @@ -13,7 +13,7 @@ from biocatalyzer.chem import ChemUtils from biocatalyzer.io_utils import Loaders -DATA_FILES = os.path.dirname(__file__) +DATA_FILES = Path(__file__).resolve().parent class BioReactor: @@ -59,7 +59,6 @@ def __init__(self, # silence RDKit logger ChemUtils.rdkit_logs(False) self._compounds_path = compounds_path - self._output_path = output_path self._neutralize = neutralize_compounds self._organisms_path = organisms_path self._reaction_rules_path = reaction_rules_path @@ -68,7 +67,7 @@ def __init__(self, self._set_up_files() self._orgs = Loaders.load_organisms(self._organisms_path) self._reaction_rules = Loaders.load_reaction_rules(self._reaction_rules_path, orgs=self._orgs) - self._set_output_path(self._output_path) + self._set_output_path(output_path) self._compounds = Loaders.load_compounds(self._compounds_path, self._neutralize) self._molecules_to_remove = Loaders.load_byproducts_to_remove(self._molecules_to_remove_path) self._patterns_to_remove = Loaders.load_patterns_to_remove(self._patterns_to_remove_path) @@ -77,7 +76,7 @@ def __init__(self, self._n_jobs = multiprocessing.cpu_count() else: self._n_jobs = n_jobs - self._new_compounds_path = os.path.join(self._output_path, 'new_compounds.tsv') + self._new_compounds_path = Path(self._output_path) / 'new_compounds.tsv' self._new_compounds = None @property @@ -393,15 +392,13 @@ def n_jobs(self, n_jobs: int): def _set_up_files(self): if self._reaction_rules_path == 'default': - self._reaction_rules_path = os.path.join( - DATA_FILES, 'data/reactionrules/reaction_rules_biocatalyzer.tsv.bz2') + self._reaction_rules_path = DATA_FILES / 'data/reactionrules/reaction_rules_biocatalyzer.tsv.bz2' if self._molecules_to_remove_path == 'default': - self._molecules_to_remove_path = os.path.join(DATA_FILES, 'data/byproducts_to_remove/byproducts.tsv') + self._molecules_to_remove_path = DATA_FILES / 'data/byproducts_to_remove/byproducts.tsv' if self._patterns_to_remove_path == 'default': - self._patterns_to_remove_path = os.path.join(DATA_FILES, 'data/patterns_to_remove/patterns.tsv') + self._patterns_to_remove_path = DATA_FILES / 'data/patterns_to_remove/patterns.tsv' - @staticmethod - def _set_output_path(output_path: str): + def _set_output_path(self, output_path: str): """ Make the output directory if it does not exist. @@ -410,12 +407,15 @@ def _set_output_path(output_path: str): output_path: str The path to the output directory. """ - if not os.path.exists(output_path): - os.makedirs(output_path) + output_path = Path(output_path) + if not output_path.exists(): + output_path.mkdir(parents=True) else: - if os.path.exists(output_path + '/results.tsv') or os.path.exists(output_path + '/new_compounds.tsv'): - raise FileExistsError(f"Results in {output_path} already exists. Define a different output path so " - f"that previous results are not overwritten.") + if (output_path / "results.tsv").exists() or (output_path / "new_compounds.tsv").exists(): + raise FileExistsError( + f"Results in {output_path} already exist. Define a different output path so that previous results are not overwritten." + ) + self._output_path = output_path def _match_patterns(self, smiles: str): """ @@ -563,10 +563,10 @@ def process_results(self, save: bool = True, overwrite: bool = True): results['EC_Numbers'] = results['EC_Numbers'].apply(lambda x: _merge_fields(x)) if save: if overwrite: - results_file_proc = os.path.join(self._output_path, 'new_compounds.tsv') + results_file_proc = self._output_path / 'new_compounds.tsv' results.to_csv(results_file_proc, sep='\t', index=False) else: - results_file_proc = os.path.join(self._output_path, 'new_compounds_processed.tsv') + results_file_proc = self._output_path / 'new_compounds_processed.tsv' results.to_csv(results_file_proc, sep='\t', index=False) else: results_file_proc = self._new_compounds_path @@ -602,18 +602,24 @@ def _react_single(self, smiles: str, smarts: str): if self._neutralize: most_similar_product = ChemUtils.uncharge_smiles(most_similar_product) ecs = self._get_ec_numbers(smarts_id) + new_compound_data = ( + f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t" + f"{most_similar_product}\t{result}\t{ecs}\n" + ) with open(self._new_compounds_path, 'a') as f: - f.write(f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t" - f"{most_similar_product}\t{result}\t{ecs}\n") + f.write(new_compound_data) def react(self): """ Transform reactants into products using the reaction rules. """ t0 = time.time() + header = ( + 'OriginalCompoundID\tOriginalCompoundSmiles\tOriginalReactionRuleID\tNewCompoundID\t' + 'NewCompoundSmiles\tNewReactionSmiles\tEC_Numbers\n' + ) with open(self._new_compounds_path, 'w') as f: - f.write('OriginalCompoundID\tOriginalCompoundSmiles\tOriginalReactionRuleID\tNewCompoundID\t' - 'NewCompoundSmiles\tNewReactionSmiles\tEC_Numbers\n') + f.write(header) params = list(itertools.product(self._compounds.smiles, self._reaction_rules.SMARTS)) with multiprocessing.Pool(self._n_jobs) as pool: pool.starmap(self._react_single, tqdm(params, total=len(params))) diff --git a/src/biocatalyzer/clis/cli.py b/src/biocatalyzer/clis/cli.py index 7a9e6be..390be16 100644 --- a/src/biocatalyzer/clis/cli.py +++ b/src/biocatalyzer/clis/cli.py @@ -1,12 +1,12 @@ import logging -import os +from pathlib import Path import click from biocatalyzer.bioreactor import BioReactor from biocatalyzer.matcher import MSDataMatcher -DATA_FILES = os.path.dirname(__file__) +DATA_FILES = Path(__file__).resolve().parent @click.command() @@ -105,8 +105,8 @@ def biocatalyzer_cli(compounds, logging.basicConfig(filename=f'{output_path}logging.log', level=logging.DEBUG) if reaction_rules is None: logging.info(f"Using default reaction rules file.") - reaction_rules = os.path.join( - DATA_FILES, '../data/reactionrules/reaction_rules_biocatalyzer.tsv.bz2') + reaction_rules = DATA_FILES / "../data/reactionrules/reaction_rules_biocatalyzer.tsv.bz2" + reaction_rules = reaction_rules.resolve() br = BioReactor(compounds_path=compounds, output_path=output_path, reaction_rules_path=reaction_rules, diff --git a/src/biocatalyzer/clis/cli_bioreactor.py b/src/biocatalyzer/clis/cli_bioreactor.py index d999215..22d499c 100644 --- a/src/biocatalyzer/clis/cli_bioreactor.py +++ b/src/biocatalyzer/clis/cli_bioreactor.py @@ -1,11 +1,11 @@ import logging -import os +from pathlib import Path import click from biocatalyzer import BioReactor -DATA_FILES = os.path.dirname(__file__) +DATA_FILES = Path(__file__).resolve().parent @click.command() @@ -82,8 +82,8 @@ def bioreactor_cli(compounds, output_path: Path to the output directory. """ if reaction_rules is None: - reaction_rules = os.path.join( - DATA_FILES, '../data/reactionrules/reaction_rules_biocatalyzer.tsv.bz2') + reaction_rules = DATA_FILES / "../data/reactionrules/reaction_rules_biocatalyzer.tsv.bz2" + reaction_rules = reaction_rules.resolve() br = BioReactor(compounds_path=compounds, output_path=output_path, reaction_rules_path=reaction_rules, diff --git a/src/biocatalyzer/io_utils/loaders.py b/src/biocatalyzer/io_utils/loaders.py index 3c03851..dff406f 100644 --- a/src/biocatalyzer/io_utils/loaders.py +++ b/src/biocatalyzer/io_utils/loaders.py @@ -1,5 +1,7 @@ import logging import os +from pathlib import Path +from typing import Union, List import pandas as pd from rdkit.Chem import MolFromSmarts, MolFromSmiles @@ -30,6 +32,7 @@ def load_compounds(path: str, neutralize: bool = False): pandas dataframe with the compounds to use. """ if Loaders._verify_file(path): + path = Path(path) compounds = pd.read_csv(path, header=0, sep='\t') if 'smiles' not in compounds.columns: raise ValueError('The compounds file must contain a column named "smiles".') @@ -47,7 +50,7 @@ def load_compounds(path: str, neutralize: bool = False): raise FileNotFoundError(f"File {path} not found.") @staticmethod - def load_reaction_rules(path, orgs='ALL'): + def load_reaction_rules(path: str, orgs: Union[str, List[str]] = 'ALL') -> pd.DataFrame: """ Load the reaction rules to use. @@ -65,7 +68,8 @@ def load_reaction_rules(path, orgs='ALL'): """ if not Loaders._verify_file(path): raise FileNotFoundError(f"File {path} not found.") - if path.endswith('.bz2'): + path = Path(path) + if path.suffix == '.bz2': rules = pd.read_csv(path, header=0, sep='\t', compression='bz2') else: rules = pd.read_csv(path, header=0, sep='\t') @@ -87,7 +91,6 @@ def match_org(value, orgs_list): return False if not isinstance(orgs, str): - # TODO: check if adding spontaneous reactions actually makes sense orgs.append('spontaneous_reaction') rules['has_org'] = rules.apply(lambda x: match_org(x['Organisms'], orgs), axis=1) rules = rules[rules['has_org']] @@ -95,7 +98,7 @@ def match_org(value, orgs_list): return rules @staticmethod - def load_organisms(path): + def load_organisms(path: str) -> Union[str, List[str]]: """ Load the organisms to use. @@ -106,17 +109,18 @@ def load_organisms(path): Returns ------- - pd.DataFrame: - pandas dataframe with the organisms to use. + Union[str, List[str]]: + List of organisms identifiers. """ if path is None or path == 'None': return 'ALL' if Loaders._verify_file(path): + path = Path(path) orgs = pd.read_csv(path, header=0, sep='\t') if 'org_id' not in orgs.columns: raise ValueError('The organisms file must contain a column named "org_id".') - logging.info(f'Using {list(orgs.org_id.values)} as the Organisms.') - return list(orgs.org_id.values) + logging.info(f'Using {orgs.org_id.to_list()} as the Organisms.') + return orgs.org_id.to_list() elif len(path.split('.')) > 1: raise FileNotFoundError(f"File {path} not found.") else: @@ -140,6 +144,7 @@ def load_byproducts_to_remove(path): """ if path is None or path == 'None': return [] + path = Path(path) byproducts = pd.read_csv(path, header=0, sep='\t') if 'smiles' not in byproducts.columns: raise ValueError('The molecules to remove file must contain a column named "smiles".') @@ -162,6 +167,7 @@ def load_patterns_to_remove(path): """ if path is None or path == 'None': return [] + path = Path(path) patterns = pd.read_csv(path, header=0, sep='\t') if 'smarts' not in patterns.columns: raise ValueError('The patterns to remove file must contain a column named "smarts".') @@ -182,7 +188,7 @@ def _verify_file(path: str): bool: True if the file exists, False otherwise. """ - if not os.path.exists(path): + if not Path(path).exists(): return False return True diff --git a/src/biocatalyzer/matcher.py b/src/biocatalyzer/matcher.py index 33f2ab4..f8346dc 100644 --- a/src/biocatalyzer/matcher.py +++ b/src/biocatalyzer/matcher.py @@ -2,6 +2,7 @@ import multiprocessing import os import time +from pathlib import Path from typing import Union import pandas as pd @@ -11,7 +12,7 @@ from biocatalyzer.io_utils import Loaders from biocatalyzer._utils import match_value -DATA_FILES = os.path.dirname(__file__) +DATA_FILES = Path(__file__).resolve().parent class MSDataMatcher: @@ -48,8 +49,7 @@ def __init__(self, raise ValueError('The new compounds file is empty!') self._ms_data_path = ms_data_path self._ms_data = Loaders.load_ms_data(self._ms_data_path) - self._output_path = output_path - self._set_output_path(self._output_path) + self._set_output_path(output_path) self._tolerance = tolerance if n_jobs == -1: self._n_jobs = multiprocessing.cpu_count() @@ -80,8 +80,7 @@ def output_path(self, path: str): path: str The output path. """ - self._output_path = path - self._set_output_path(self._output_path) + self._set_output_path(path) if self._matches is not None: logging.warning('Results should be generated again for the new information provided!') @@ -208,9 +207,8 @@ def _set_up_reaction_rules(self): """ Loads the reaction rules data file. """ - self._reaction_rules_path = os.path.join( - DATA_FILES, 'data/reactionrules/all_reaction_rules_forward_no_smarts_duplicates_sample.tsv') - self._reaction_rules = Loaders.load_reaction_rules(self._reaction_rules_path) + self._reaction_rules_path = DATA_FILES / 'data/reactionrules/all_reaction_rules_forward_no_smarts_duplicates_sample.tsv' + self._reaction_rules = Loaders.load_reaction_rules(self._reaction_rules_path.as_posix()) def _set_up_new_compounds(self, path: str): """ @@ -223,8 +221,7 @@ def _set_up_new_compounds(self, path: str): """ self._new_compounds = Loaders.load_new_compounds(path) - @staticmethod - def _set_output_path(output_path: str): + def _set_output_path(self, output_path: str): """ Make the output directory if it does not exist. @@ -233,12 +230,16 @@ def _set_output_path(output_path: str): output_path: str The path to the output directory. """ - if not os.path.exists(output_path): - os.makedirs(output_path) + output_path = Path(output_path) + if not output_path.exists(): + output_path.mkdir(parents=True) else: - if os.path.exists(output_path + '/matches.tsv'): - raise FileExistsError(f"File {output_path} already exists. Define a different output path so that " - f"previous results are not overwritten.") + if (output_path / 'matches.tsv').exists(): + raise FileExistsError( + f"File {output_path / 'matches.tsv'} already exists. Define a different output path so that " + f"previous results are not overwritten." + ) + self._output_path = output_path def _calculate_masses(self): """ @@ -304,8 +305,9 @@ def generate_ms_results(self): """ t0 = time.time() self._matches = self._match_masses() - self._matches.to_csv(self._output_path + '/matches.tsv', sep='\t', index=False) - logging.info(f"Matches saved to {self._output_path}/matches.tsv") + path = self._output_path / '/matches.tsv' + self._matches.to_csv(path, sep='\t', index=False) + logging.info(f"Matches saved to {path.as_posix()}") logging.info(f"{self._matches.shape[0]} matches found!") t1 = time.time() logging.info(f"Time elapsed: {t1 - t0} seconds") From 02d9a5271fc65b9659ee0d7aa73dcc846c573bbf Mon Sep 17 00:00:00 2001 From: jcorreia11 Date: Tue, 11 Feb 2025 11:46:26 +0000 Subject: [PATCH 3/6] [FIX] paths --- src/biocatalyzer/matcher.py | 2 +- tests/__init__.py | 4 +- tests/data/results_sample/matches.tsv | 5 --- tests/data/results_sample/new_compounds.tsv | 21 --------- tests/unit_tests/test_bioreactor.py | 3 +- tests/unit_tests/test_ms_matcher.py | 47 ++++++++++----------- 6 files changed, 27 insertions(+), 55 deletions(-) delete mode 100644 tests/data/results_sample/matches.tsv delete mode 100644 tests/data/results_sample/new_compounds.tsv diff --git a/src/biocatalyzer/matcher.py b/src/biocatalyzer/matcher.py index f8346dc..b5c813d 100644 --- a/src/biocatalyzer/matcher.py +++ b/src/biocatalyzer/matcher.py @@ -305,7 +305,7 @@ def generate_ms_results(self): """ t0 = time.time() self._matches = self._match_masses() - path = self._output_path / '/matches.tsv' + path = self._output_path / 'matches.tsv' self._matches.to_csv(path, sep='\t', index=False) logging.info(f"Matches saved to {path.as_posix()}") logging.info(f"{self._matches.shape[0]} matches found!") diff --git a/tests/__init__.py b/tests/__init__.py index e8a2f4a..d00728b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +1,3 @@ -import os +from pathlib import Path -TESTS_DATA_PATH = os.path.join(os.path.dirname(__file__), 'data') +TESTS_DATA_PATH = Path(__file__).parent / 'data' diff --git a/tests/data/results_sample/matches.tsv b/tests/data/results_sample/matches.tsv deleted file mode 100644 index d1fd84d..0000000 --- a/tests/data/results_sample/matches.tsv +++ /dev/null @@ -1,5 +0,0 @@ -Index OriginalCompoundID OriginalCompoundSmiles ParentCompoundExactMass NewCompoundID NewCompoundSmiles NewCompoundExactMass MassDiff EC_Numbers -33 ACEBUTOLOL CCCC(Nc1ccc(c(c1)C(C)=O)OCC(C[N+]C(C)C)O)=O 335.1965 ACEBUTOLOL_c7ea3c8e-813e-4b83-8f5e-a951020fa070 CCCC#[N+]c1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1 318.1932 17.003300000000024 4.2.1.84;4.2.1.103 -88 ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 336.174 ALMOTRIPTAN_2 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)[N+]3=CCCC3)cc12 335.1657 1.008299999999963 1.5.1.27;1.5.1.15 -33 ACEBUTOLOL CCCC(Nc1ccc(c(c1)C(C)=O)OCC(C[N+]C(C)C)O)=O 335.1965 ACEBUTOLOL_05a25e0b-e1ff-4c76-8226-a00507604d81 CCCC(=O)Nc1ccc(OCCC=[N+]C(C)C)c(C(C)=O)c1 318.1938 17.002700000000004 4.2.1.171;4.2.1.172;4.2.1.77 -44 ACECAINIDE CC[N+](CC)CCNC(c1ccc(cc1)NC(C)=O)=O 277.1785 ACECAINIDE_f869994c-25df-4b00-a32a-2f797834cf2b C=C(O)Nc1ccc(C(=O)NCC[N+](CC)CC)cc1 277.1785 0.0 diff --git a/tests/data/results_sample/new_compounds.tsv b/tests/data/results_sample/new_compounds.tsv deleted file mode 100644 index ba28af3..0000000 --- a/tests/data/results_sample/new_compounds.tsv +++ /dev/null @@ -1,21 +0,0 @@ -OriginalCompoundID OriginalCompoundSmiles OriginalReactionRuleID NewCompoundID NewCompoundSmiles NewReactionSmiles EC_Numbers -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_40069 ALMOTRIPTAN_1 *c1c(*)c(O)c(*)c(*)c1O *C1=C(*)C(=O)C(*)=C(*)C1=O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.O>>*c1c(*)c(O)c(*)c(*)c1O.C[NH+](C)CCc1c[n+](O)c2ccc(CS(=O)(=O)N3CCCC3)cc12 1.7.5.1 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_40093 ALMOTRIPTAN_0 C=CCN(C)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>C=CCN(C)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 3.5.99.7;5.3.3.18;5.5.1.9 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_40386 ALMOTRIPTAN_1 C=CCNS(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.O=C=O>>C=CCNS(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1.CN(CC(=O)O)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 2.2.1.9;2.2.1.12 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_40093 ALMOTRIPTAN_1 C=CN(CC)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>C=CN(CC)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 3.5.99.7;5.3.3.18;5.5.1.9 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_46241;Rule_38980 ALMOTRIPTAN_1 CC(=O)[N+](C)(C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(=O)[N+](C)(C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O)[C@@H](O)C(=O)NCCC(=O)NCCS 2.3.1.78;2.3.1.82;2.3.1.5;2.3.1.32;2.3.1.87;2.3.1.178;2.3.1.157;2.3.1.48;2.3.1.57;2.3.1.60;2.3.1.81;2.3.1.102;2.3.1.80;2.3.1.108 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_39142;Rule_46241 ALMOTRIPTAN_3 CC(=O)n1cc(CC[NH+](C)C)c2cc(CS(=O)(=O)N3CCCC3)ccc21 CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(=O)n1cc(CC[NH+](C)C)c2cc(CS(=O)(=O)N3CCCC3)ccc21.CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O)[C@@H](O)C(=O)NCCC(=O)NCCS 2.3.1.78;2.3.1.59;2.3.1.82;2.3.1.201;2.3.1.5;2.3.1.2;2.3.1.157;2.3.1.118;2.3.1.60;2.3.1.81;2.3.1.102;2.3.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_86443 ALMOTRIPTAN_0 CC(C(=O)O)C(O)(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C(=O)O)C(O)(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 4.1.3.30 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_9 CC(C)=CCC(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_10 CC(C)=CCC(Cc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12)[NH+](C)C CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC(Cc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12)[NH+](C)C.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_2 CC(C)=CCC(c1ccc2[nH]cc(CC[NH+](C)C)c2c1)S(=O)(=O)N1CCCC1 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC(c1ccc2[nH]cc(CC[NH+](C)C)c2c1)S(=O)(=O)N1CCCC1.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_6 CC(C)=CCC1CCCN1S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC1CCCN1S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_8 CC(C)=CCC1CCN(S(=O)(=O)Cc2ccc3[nH]cc(CC[NH+](C)C)c3c2)C1 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC1CCN(S(=O)(=O)Cc2ccc3[nH]cc(CC[NH+](C)C)c3c2)C1.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_4 CC(C)=CCC[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_5 CC(C)=CCc1[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc2c1CC[NH+](C)C CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCc1[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc2c1CC[NH+](C)C.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_0 CC(C)=CCc1cc2[nH]cc(CC[NH+](C)C)c2cc1CS(=O)(=O)N1CCCC1 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCc1cc2[nH]cc(CC[NH+](C)C)c2cc1CS(=O)(=O)N1CCCC1.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_39074 ALMOTRIPTAN_29 CC(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.C[S+](CC[C@H](N)C(=O)O)C[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1O>>CC(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.Nc1ncnc2c1ncn2[C@@H]1O[C@H](CSCC[C@H](N)C(=O)O)[C@@H](O)[C@H]1O 2.1.1.284;2.1.1.142;2.1.1.281;2.1.1.163;2.1.1.271;2.1.1.143;2.1.1.133;2.1.1.41;2.1.1.106 -ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_39074 ALMOTRIPTAN_0 CC(Cc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12)[NH+](C)C C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.C[S+](CC[C@H](N)C(=O)O)C[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1O>>CC(Cc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12)[NH+](C)C.Nc1ncnc2c1ncn2[C@@H]1O[C@H](CSCC[C@H](N)C(=O)O)[C@@H](O)[C@H]1O 2.1.1.284;2.1.1.142;2.1.1.281;2.1.1.163;2.1.1.271;2.1.1.143;2.1.1.133;2.1.1.41;2.1.1.106 -ACEBUTOLOL CCCC(Nc1ccc(c(c1)C(C)=O)OCC(C[N+]C(C)C)O)=O Rule_41014 ACEBUTOLOL_05a25e0b-e1ff-4c76-8226-a00507604d81 CCCC(=O)Nc1ccc(OCCC=[N+]C(C)C)c(C(C)=O)c1 CCCC(=O)Nc1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1>>CCCC(=O)Nc1ccc(OCCC=[N+]C(C)C)c(C(C)=O)c1.O 4.2.1.171;4.2.1.172;4.2.1.77 -ACECAINIDE CC[N+](CC)CCNC(c1ccc(cc1)NC(C)=O)=O Rule_46352 ACECAINIDE_f869994c-25df-4b00-a32a-2f797834cf2b C=C(O)Nc1ccc(C(=O)NCC[N+](CC)CC)cc1 CC[N+](CC)CCNC(=O)c1ccc(NC(C)=O)cc1>>C=C(O)Nc1ccc(C(=O)NCC[N+](CC)CC)cc1 -ACEBUTOLOL CCCC(Nc1ccc(c(c1)C(C)=O)OCC(C[N+]C(C)C)O)=O Rule_42947;Rule_47233 ACEBUTOLOL_c7ea3c8e-813e-4b83-8f5e-a951020fa070 CCCC#[N+]c1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1 CCCC(=O)Nc1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1>>CCCC#[N+]c1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1.O 4.2.1.84;4.2.1.103 \ No newline at end of file diff --git a/tests/unit_tests/test_bioreactor.py b/tests/unit_tests/test_bioreactor.py index 377bff5..61980c4 100644 --- a/tests/unit_tests/test_bioreactor.py +++ b/tests/unit_tests/test_bioreactor.py @@ -1,5 +1,6 @@ import os import shutil +from pathlib import Path from unittest import TestCase from biocatalyzer.bioreactor import BioReactor @@ -88,7 +89,7 @@ def test_bioreactor_properties_and_setters(self): br.new_compounds = 'random_thing' output_path = br.output_path - self.assertEqual(output_path, self.output_folder) + self.assertEqual(output_path, Path(self.output_folder)) br.output_path = self.new_output_folder shutil.rmtree(self.new_output_folder) diff --git a/tests/unit_tests/test_ms_matcher.py b/tests/unit_tests/test_ms_matcher.py index 6f93d2a..661598d 100644 --- a/tests/unit_tests/test_ms_matcher.py +++ b/tests/unit_tests/test_ms_matcher.py @@ -1,5 +1,5 @@ -import os import shutil +from pathlib import Path from unittest import TestCase import pandas as pd @@ -12,24 +12,27 @@ class MSDataMatcherTestCase(TestCase): def setUp(self): - self.output_folder = 'results/' - self.new_output_folder = 'new_output_path/' - if not os.path.exists(self.output_folder): - os.makedirs(self.output_folder) + self.output_folder = TESTS_DATA_PATH / 'results_sample' + self.new_output_folder = TESTS_DATA_PATH / 'new_results_sample' + # Ensure the directories exist + self.output_folder.mkdir(parents=True, exist_ok=True) + self.new_output_folder.mkdir(parents=True, exist_ok=True) def tearDown(self): - if os.path.exists(self.output_folder): + if self.output_folder.exists(): shutil.rmtree(self.output_folder) + if self.new_output_folder.exists(): + shutil.rmtree(self.new_output_folder) class TestMSDataMatcher(MSDataMatcherTestCase, TestCase): def test_ms_data_matcher(self): - ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv') - compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds.tsv') - ms = MSDataMatcher(ms_data_path=ms_data_path, - compounds_to_match_path=compounds_to_match, - output_path=self.output_folder, + ms_data_path = TESTS_DATA_PATH / 'ms_data_sample' / 'ms_data.tsv' + compounds_to_match = TESTS_DATA_PATH / 'new_compounds_sample' / 'new_compounds.tsv' + ms = MSDataMatcher(ms_data_path=ms_data_path.as_posix(), + compounds_to_match_path=compounds_to_match.as_posix(), + output_path=self.output_folder.as_posix(), tolerance=0.0015) ms.generate_ms_results() @@ -40,21 +43,15 @@ def test_ms_data_matcher(self): self.assertEqual(ms.matches.shape, (4, 9)) def test_ms_data_matcher_properties_and_setters(self): - ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data.tsv') - compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds.tsv') - ms = MSDataMatcher(ms_data_path=ms_data_path, - compounds_to_match_path=compounds_to_match, - output_path=self.output_folder, + ms_data_path = TESTS_DATA_PATH / 'ms_data_sample' / 'ms_data.tsv' + compounds_to_match = TESTS_DATA_PATH / 'new_compounds_sample' / 'new_compounds.tsv' + ms = MSDataMatcher(ms_data_path=ms_data_path.as_posix(), + compounds_to_match_path=compounds_to_match.as_posix(), + output_path=self.new_output_folder.as_posix(), tolerance=0.0015) output_path = ms.output_path - self.assertEqual(output_path, self.output_folder) - - ms.output_path = self.new_output_folder - shutil.rmtree(self.new_output_folder) - - with self.assertRaises(FileExistsError): - ms.output_path = os.path.join(TESTS_DATA_PATH, 'results_sample/') + self.assertEqual(output_path, Path(self.new_output_folder)) ms.generate_ms_results() @@ -62,13 +59,13 @@ def test_ms_data_matcher_properties_and_setters(self): with self.assertRaises(FileNotFoundError): ms.ms_data_path = 'not_existing_path.tsv' - ms.ms_data_path = os.path.join(TESTS_DATA_PATH, 'ms_data_sample/ms_data_subsample.tsv') + ms.ms_data_path = TESTS_DATA_PATH / 'ms_data_sample' / 'ms_data_subsample.tsv' _ = ms.compounds_to_match with self.assertRaises(FileNotFoundError): ms.compounds_to_match = 'not_existing_path.tsv' - ms.compounds_to_match = os.path.join(TESTS_DATA_PATH, 'new_compounds_sample/new_compounds_subsample.tsv') + ms.compounds_to_match = TESTS_DATA_PATH / 'new_compounds_sample' / 'new_compounds_subsample.tsv' tl = ms.tolerance ms.tolerance = 0.0015 + tl From 572507be7bd41364e3f6c4f681240e57bffb527b Mon Sep 17 00:00:00 2001 From: jcorreia11 Date: Tue, 11 Feb 2025 12:04:11 +0000 Subject: [PATCH 4/6] [FIX] paths --- tests/unit_tests/test_bioreactor.py | 75 ++++++++++++++--------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/tests/unit_tests/test_bioreactor.py b/tests/unit_tests/test_bioreactor.py index 61980c4..386ac50 100644 --- a/tests/unit_tests/test_bioreactor.py +++ b/tests/unit_tests/test_bioreactor.py @@ -1,4 +1,3 @@ -import os import shutil from pathlib import Path from unittest import TestCase @@ -11,28 +10,31 @@ class BioReactorTestCase(TestCase): def setUp(self): - self.output_folder = 'results/' - self.new_output_folder = 'new_output_path/' - if not os.path.exists(self.output_folder): - os.makedirs(self.output_folder) + self.output_folder = TESTS_DATA_PATH / 'results' + self.new_output_folder = TESTS_DATA_PATH / 'new_output_path' + # Ensure the directories exist + self.output_folder.mkdir(parents=True, exist_ok=True) + self.new_output_folder.mkdir(parents=True, exist_ok=True) def tearDown(self): - if os.path.exists(self.output_folder): + if self.output_folder.exists(): shutil.rmtree(self.output_folder) + if self.new_output_folder.exists(): + shutil.rmtree(self.new_output_folder) class TestBioReactor(BioReactorTestCase, TestCase): def test_bioreactor(self): - compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv') - organisms_path = os.path.join(TESTS_DATA_PATH, 'organisms_sample/organisms_to_use.tsv') - patterns_to_remove_path = os.path.join(TESTS_DATA_PATH, 'patterns_to_remove_sample/patterns.tsv') - molecules_to_remove_path = os.path.join(TESTS_DATA_PATH, 'byproducts_to_remove_sample/byproducts.tsv') - br = BioReactor(compounds_path=compounds_path, - organisms_path=organisms_path, + compounds_path = TESTS_DATA_PATH / 'compounds_sample' / 'compounds.tsv' + organisms_path = TESTS_DATA_PATH / 'organisms_sample' / 'organisms_to_use.tsv' + patterns_to_remove_path = TESTS_DATA_PATH / 'patterns_to_remove_sample' / 'patterns.tsv' + molecules_to_remove_path = TESTS_DATA_PATH / 'byproducts_to_remove_sample' / 'byproducts.tsv' + br = BioReactor(compounds_path=compounds_path.as_posix(), + organisms_path=organisms_path.as_posix(), patterns_to_remove_path=patterns_to_remove_path, molecules_to_remove_path=molecules_to_remove_path, - output_path=self.output_folder, + output_path=self.output_folder.as_posix(), n_jobs=12) br.react() @@ -42,13 +44,13 @@ def test_bioreactor(self): _ = br.new_compounds def test_bioreactor_all_orgs(self): - compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv') - patterns_to_remove_path = os.path.join(TESTS_DATA_PATH, 'patterns_to_remove_sample/patterns.tsv') - molecules_to_remove_path = os.path.join(TESTS_DATA_PATH, 'byproducts_to_remove_sample/byproducts.tsv') - br_no_orgs_filter = BioReactor(compounds_path=compounds_path, - patterns_to_remove_path=patterns_to_remove_path, - molecules_to_remove_path=molecules_to_remove_path, - output_path=self.output_folder, + compounds_path = TESTS_DATA_PATH / 'compounds_sample' / 'compounds.tsv' + patterns_to_remove_path = TESTS_DATA_PATH / 'patterns_to_remove_sample' / 'patterns.tsv' + molecules_to_remove_path = TESTS_DATA_PATH / 'byproducts_to_remove_sample' / 'byproducts.tsv' + br_no_orgs_filter = BioReactor(compounds_path=compounds_path.as_posix(), + patterns_to_remove_path=patterns_to_remove_path.as_posix(), + molecules_to_remove_path=molecules_to_remove_path.as_posix(), + output_path=self.output_folder.as_posix(), neutralize_compounds=True, n_jobs=12) br_no_orgs_filter.react() @@ -62,13 +64,13 @@ def test_bioreactor_all_orgs(self): self.assertEqual(r[0].shape, (3220, 7)) def test_bioreactor_all_orgs_keep_all(self): - compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv') + compounds_path = TESTS_DATA_PATH / 'compounds_sample' / 'compounds.tsv' patterns_to_remove_path = None molecules_to_remove_path = None - br_no_orgs_filter = BioReactor(compounds_path=compounds_path, + br_no_orgs_filter = BioReactor(compounds_path=compounds_path.as_posix(), patterns_to_remove_path=patterns_to_remove_path, molecules_to_remove_path=molecules_to_remove_path, - output_path=self.output_folder, + output_path=self.output_folder.as_posix(), n_jobs=-1) br_no_orgs_filter.react() @@ -76,11 +78,11 @@ def test_bioreactor_all_orgs_keep_all(self): self.assertEqual(br_no_orgs_filter.compounds.shape, (4, 2)) def test_bioreactor_properties_and_setters(self): - compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds.tsv') - organisms_path = os.path.join(TESTS_DATA_PATH, 'organisms_sample/organisms_to_use.tsv') - br = BioReactor(compounds_path=compounds_path, - organisms_path=organisms_path, - output_path=self.output_folder, + compounds_path = TESTS_DATA_PATH / 'compounds_sample' / 'compounds.tsv' + organisms_path = TESTS_DATA_PATH / 'organisms_sample' / 'organisms_to_use.tsv' + br = BioReactor(compounds_path=compounds_path.as_posix(), + organisms_path=organisms_path.as_posix(), + output_path=self.output_folder.as_posix(), n_jobs=12) with self.assertRaises(ValueError): @@ -92,24 +94,20 @@ def test_bioreactor_properties_and_setters(self): self.assertEqual(output_path, Path(self.output_folder)) br.output_path = self.new_output_folder - shutil.rmtree(self.new_output_folder) - - with self.assertRaises(FileExistsError): - br.output_path = os.path.join(TESTS_DATA_PATH, 'results_sample/') br.react() with self.assertRaises(FileNotFoundError): br.compounds = 'not_existing_path.tsv' - br.compounds = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds_subsample.tsv') + br.compounds = TESTS_DATA_PATH / 'compounds_sample' / 'compounds_subsample.tsv' br.compounds = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C;C(C1C(C(C(C(O1)O)O)O)O)O' with self.assertRaises(FileNotFoundError): br.reaction_rules = 'not_existing_path.tsv' - br.reaction_rules = os.path.join(TESTS_DATA_PATH, 'reaction_rules_sample/reactionrules_subsample.tsv') + br.reaction_rules = TESTS_DATA_PATH / 'reaction_rules_sample' / 'reactionrules_subsample.tsv' br.output_path = 'new_output_path' @@ -117,7 +115,7 @@ def test_bioreactor_properties_and_setters(self): with self.assertRaises(FileNotFoundError): br.compounds_path = 'not_existing_path.tsv' - br.compounds_path = os.path.join(TESTS_DATA_PATH, 'compounds_sample/compounds_subsample.tsv') + br.compounds_path = TESTS_DATA_PATH / 'compounds_sample' / 'compounds_subsample.tsv' _ = br.neutralize br.neutralize = True @@ -126,7 +124,7 @@ def test_bioreactor_properties_and_setters(self): with self.assertRaises(FileNotFoundError): br.organisms_path = 'not_existing_path.tsv' - br.organisms_path = os.path.join(TESTS_DATA_PATH, 'organisms_sample/organisms_subsample.tsv') + br.organisms_path = TESTS_DATA_PATH / 'organisms_sample' / 'organisms_subsample.tsv' br.organisms_path = 'hsa;eco' @@ -134,14 +132,13 @@ def test_bioreactor_properties_and_setters(self): with self.assertRaises(FileNotFoundError): br.molecules_to_remove_path = 'not_existing_path.tsv' - br.molecules_to_remove_path = os.path.join(TESTS_DATA_PATH, - 'byproducts_to_remove_sample/byproducts_subsample.tsv') + br.molecules_to_remove_path = TESTS_DATA_PATH / 'byproducts_to_remove_sample/byproducts_subsample.tsv' _ = br.patterns_to_remove_path with self.assertRaises(FileNotFoundError): br.patterns_to_remove_path = 'not_existing_path.tsv' - br.patterns_to_remove_path = os.path.join(TESTS_DATA_PATH, 'patterns_to_remove_sample/patterns_subsample.tsv') + br.patterns_to_remove_path = TESTS_DATA_PATH / 'patterns_to_remove_sample/patterns_subsample.tsv' mac = br.min_atom_count br.min_atom_count = mac + 1 From 7845793032be5d2099b162d15ddd990bc4ef5119 Mon Sep 17 00:00:00 2001 From: jcorreia11 Date: Tue, 11 Feb 2025 12:33:48 +0000 Subject: [PATCH 5/6] [ADD] test data files --- tests/data/results_sample/matches.tsv | 5 +++++ tests/data/results_sample/new_compounds.tsv | 21 +++++++++++++++++++++ tests/unit_tests/test_ms_matcher.py | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 tests/data/results_sample/matches.tsv create mode 100644 tests/data/results_sample/new_compounds.tsv diff --git a/tests/data/results_sample/matches.tsv b/tests/data/results_sample/matches.tsv new file mode 100644 index 0000000..d1fd84d --- /dev/null +++ b/tests/data/results_sample/matches.tsv @@ -0,0 +1,5 @@ +Index OriginalCompoundID OriginalCompoundSmiles ParentCompoundExactMass NewCompoundID NewCompoundSmiles NewCompoundExactMass MassDiff EC_Numbers +33 ACEBUTOLOL CCCC(Nc1ccc(c(c1)C(C)=O)OCC(C[N+]C(C)C)O)=O 335.1965 ACEBUTOLOL_c7ea3c8e-813e-4b83-8f5e-a951020fa070 CCCC#[N+]c1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1 318.1932 17.003300000000024 4.2.1.84;4.2.1.103 +88 ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 336.174 ALMOTRIPTAN_2 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)[N+]3=CCCC3)cc12 335.1657 1.008299999999963 1.5.1.27;1.5.1.15 +33 ACEBUTOLOL CCCC(Nc1ccc(c(c1)C(C)=O)OCC(C[N+]C(C)C)O)=O 335.1965 ACEBUTOLOL_05a25e0b-e1ff-4c76-8226-a00507604d81 CCCC(=O)Nc1ccc(OCCC=[N+]C(C)C)c(C(C)=O)c1 318.1938 17.002700000000004 4.2.1.171;4.2.1.172;4.2.1.77 +44 ACECAINIDE CC[N+](CC)CCNC(c1ccc(cc1)NC(C)=O)=O 277.1785 ACECAINIDE_f869994c-25df-4b00-a32a-2f797834cf2b C=C(O)Nc1ccc(C(=O)NCC[N+](CC)CC)cc1 277.1785 0.0 diff --git a/tests/data/results_sample/new_compounds.tsv b/tests/data/results_sample/new_compounds.tsv new file mode 100644 index 0000000..ba28af3 --- /dev/null +++ b/tests/data/results_sample/new_compounds.tsv @@ -0,0 +1,21 @@ +OriginalCompoundID OriginalCompoundSmiles OriginalReactionRuleID NewCompoundID NewCompoundSmiles NewReactionSmiles EC_Numbers +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_40069 ALMOTRIPTAN_1 *c1c(*)c(O)c(*)c(*)c1O *C1=C(*)C(=O)C(*)=C(*)C1=O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.O>>*c1c(*)c(O)c(*)c(*)c1O.C[NH+](C)CCc1c[n+](O)c2ccc(CS(=O)(=O)N3CCCC3)cc12 1.7.5.1 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_40093 ALMOTRIPTAN_0 C=CCN(C)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>C=CCN(C)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 3.5.99.7;5.3.3.18;5.5.1.9 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_40386 ALMOTRIPTAN_1 C=CCNS(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.O=C=O>>C=CCNS(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1.CN(CC(=O)O)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 2.2.1.9;2.2.1.12 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_40093 ALMOTRIPTAN_1 C=CN(CC)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>C=CN(CC)S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 3.5.99.7;5.3.3.18;5.5.1.9 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_46241;Rule_38980 ALMOTRIPTAN_1 CC(=O)[N+](C)(C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(=O)[N+](C)(C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O)[C@@H](O)C(=O)NCCC(=O)NCCS 2.3.1.78;2.3.1.82;2.3.1.5;2.3.1.32;2.3.1.87;2.3.1.178;2.3.1.157;2.3.1.48;2.3.1.57;2.3.1.60;2.3.1.81;2.3.1.102;2.3.1.80;2.3.1.108 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_39142;Rule_46241 ALMOTRIPTAN_3 CC(=O)n1cc(CC[NH+](C)C)c2cc(CS(=O)(=O)N3CCCC3)ccc21 CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(=O)n1cc(CC[NH+](C)C)c2cc(CS(=O)(=O)N3CCCC3)ccc21.CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O)[C@@H](O)C(=O)NCCC(=O)NCCS 2.3.1.78;2.3.1.59;2.3.1.82;2.3.1.201;2.3.1.5;2.3.1.2;2.3.1.157;2.3.1.118;2.3.1.60;2.3.1.81;2.3.1.102;2.3.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_86443 ALMOTRIPTAN_0 CC(C(=O)O)C(O)(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C(=O)O)C(O)(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 4.1.3.30 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_9 CC(C)=CCC(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_10 CC(C)=CCC(Cc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12)[NH+](C)C CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC(Cc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12)[NH+](C)C.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_2 CC(C)=CCC(c1ccc2[nH]cc(CC[NH+](C)C)c2c1)S(=O)(=O)N1CCCC1 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC(c1ccc2[nH]cc(CC[NH+](C)C)c2c1)S(=O)(=O)N1CCCC1.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_6 CC(C)=CCC1CCCN1S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC1CCCN1S(=O)(=O)Cc1ccc2[nH]cc(CC[NH+](C)C)c2c1.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_8 CC(C)=CCC1CCN(S(=O)(=O)Cc2ccc3[nH]cc(CC[NH+](C)C)c3c2)C1 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC1CCN(S(=O)(=O)Cc2ccc3[nH]cc(CC[NH+](C)C)c3c2)C1.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_4 CC(C)=CCC[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCC[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_5 CC(C)=CCc1[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc2c1CC[NH+](C)C CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCc1[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc2c1CC[NH+](C)C.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_42434 ALMOTRIPTAN_0 CC(C)=CCc1cc2[nH]cc(CC[NH+](C)C)c2cc1CS(=O)(=O)N1CCCC1 CC(C)=CCOP(=O)(O)OP(=O)(O)O.C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12>>CC(C)=CCc1cc2[nH]cc(CC[NH+](C)C)c2cc1CS(=O)(=O)N1CCCC1.O=P(O)(O)OP(=O)(O)O 2.5.1.106;2.5.1.10;2.5.1.80 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_39074 ALMOTRIPTAN_29 CC(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.C[S+](CC[C@H](N)C(=O)O)C[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1O>>CC(C[NH+](C)C)c1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.Nc1ncnc2c1ncn2[C@@H]1O[C@H](CSCC[C@H](N)C(=O)O)[C@@H](O)[C@H]1O 2.1.1.284;2.1.1.142;2.1.1.281;2.1.1.163;2.1.1.271;2.1.1.143;2.1.1.133;2.1.1.41;2.1.1.106 +ALMOTRIPTAN C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12 Rule_39074 ALMOTRIPTAN_0 CC(Cc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12)[NH+](C)C C[NH+](C)CCc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12.C[S+](CC[C@H](N)C(=O)O)C[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1O>>CC(Cc1c[nH]c2ccc(CS(=O)(=O)N3CCCC3)cc12)[NH+](C)C.Nc1ncnc2c1ncn2[C@@H]1O[C@H](CSCC[C@H](N)C(=O)O)[C@@H](O)[C@H]1O 2.1.1.284;2.1.1.142;2.1.1.281;2.1.1.163;2.1.1.271;2.1.1.143;2.1.1.133;2.1.1.41;2.1.1.106 +ACEBUTOLOL CCCC(Nc1ccc(c(c1)C(C)=O)OCC(C[N+]C(C)C)O)=O Rule_41014 ACEBUTOLOL_05a25e0b-e1ff-4c76-8226-a00507604d81 CCCC(=O)Nc1ccc(OCCC=[N+]C(C)C)c(C(C)=O)c1 CCCC(=O)Nc1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1>>CCCC(=O)Nc1ccc(OCCC=[N+]C(C)C)c(C(C)=O)c1.O 4.2.1.171;4.2.1.172;4.2.1.77 +ACECAINIDE CC[N+](CC)CCNC(c1ccc(cc1)NC(C)=O)=O Rule_46352 ACECAINIDE_f869994c-25df-4b00-a32a-2f797834cf2b C=C(O)Nc1ccc(C(=O)NCC[N+](CC)CC)cc1 CC[N+](CC)CCNC(=O)c1ccc(NC(C)=O)cc1>>C=C(O)Nc1ccc(C(=O)NCC[N+](CC)CC)cc1 +ACEBUTOLOL CCCC(Nc1ccc(c(c1)C(C)=O)OCC(C[N+]C(C)C)O)=O Rule_42947;Rule_47233 ACEBUTOLOL_c7ea3c8e-813e-4b83-8f5e-a951020fa070 CCCC#[N+]c1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1 CCCC(=O)Nc1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1>>CCCC#[N+]c1ccc(OCC(O)C[N+]C(C)C)c(C(C)=O)c1.O 4.2.1.84;4.2.1.103 \ No newline at end of file diff --git a/tests/unit_tests/test_ms_matcher.py b/tests/unit_tests/test_ms_matcher.py index 661598d..f530fa9 100644 --- a/tests/unit_tests/test_ms_matcher.py +++ b/tests/unit_tests/test_ms_matcher.py @@ -12,7 +12,7 @@ class MSDataMatcherTestCase(TestCase): def setUp(self): - self.output_folder = TESTS_DATA_PATH / 'results_sample' + self.output_folder = TESTS_DATA_PATH / 'results_sample2' self.new_output_folder = TESTS_DATA_PATH / 'new_results_sample' # Ensure the directories exist self.output_folder.mkdir(parents=True, exist_ok=True) From ccfe1addad88d9ca116e91df0883a6a21bc147fc Mon Sep 17 00:00:00 2001 From: jcorreia11 Date: Tue, 11 Feb 2025 13:32:42 +0000 Subject: [PATCH 6/6] [FIX] multiprocessing for cross platform integration --- src/biocatalyzer/bioreactor.py | 77 +++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/src/biocatalyzer/bioreactor.py b/src/biocatalyzer/bioreactor.py index cfc716a..ef7d4e3 100644 --- a/src/biocatalyzer/bioreactor.py +++ b/src/biocatalyzer/bioreactor.py @@ -1,6 +1,8 @@ import itertools import logging import multiprocessing +import os +import tempfile import time import uuid from typing import Union @@ -572,7 +574,7 @@ def process_results(self, save: bool = True, overwrite: bool = True): results_file_proc = self._new_compounds_path return results, results_file_proc - def _react_single(self, smiles: str, smarts: str): + def _react_single(self, smiles: str, smarts: str, result_queue: multiprocessing.Queue): """ React a single compound with a single reaction rule. Writes the results to the output files. @@ -583,46 +585,71 @@ def _react_single(self, smiles: str, smarts: str): The smiles of the reactant. smarts: str The SMARTS string of the reaction. + result_queue: multiprocessing.Queue + The queue to store the results. """ reactants = self._reaction_rules[self._reaction_rules.SMARTS == smarts].Reactants.values[0] reactants = reactants.replace("Any", smiles).split(';') results = ChemUtils.react(reactants, smarts) - if len(results) > 0: - smiles_id = self._compounds[self._compounds.smiles == smiles].compound_id.values[0] - smarts_id = self._reaction_rules[self._reaction_rules.SMARTS == smarts].InternalID.values[0] - most_similar_products_set = set() - for i, result in enumerate(results): - products = result.split('>')[-1].split('.') - # keep only the most similar compound to the input compound - most_similar_product = ChemUtils.most_similar_compound(smiles, products) - most_similar_product = ChemUtils.smiles_to_isomerical_smiles(most_similar_product) - if most_similar_product not in most_similar_products_set: - most_similar_products_set.add(most_similar_product) - if self._match_conditions(most_similar_product): - if self._neutralize: - most_similar_product = ChemUtils.uncharge_smiles(most_similar_product) - ecs = self._get_ec_numbers(smarts_id) - new_compound_data = ( - f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t" - f"{most_similar_product}\t{result}\t{ecs}\n" - ) - with open(self._new_compounds_path, 'a') as f: - f.write(new_compound_data) + if len(results) == 0: + return + smiles_id = self._compounds[self._compounds.smiles == smiles].compound_id.values[0] + smarts_id = self._reaction_rules[self._reaction_rules.SMARTS == smarts].InternalID.values[0] + most_similar_products_set = set() + # Collect results in a list + output_rows = [] + for result in results: + products = result.split('>')[-1].split('.') + most_similar_product = ChemUtils.most_similar_compound(smiles, products) + most_similar_product = ChemUtils.smiles_to_isomerical_smiles(most_similar_product) + + if most_similar_product not in most_similar_products_set: + most_similar_products_set.add(most_similar_product) + if self._match_conditions(most_similar_product): + if self._neutralize: + most_similar_product = ChemUtils.uncharge_smiles(most_similar_product) + ecs = self._get_ec_numbers(smarts_id) + output_rows.append(f"{smiles_id}\t{smiles}\t{smarts_id}\t{smiles_id}_{uuid.uuid4()}\t" + f"{most_similar_product}\t{result}\t{ecs}\n") + + # Write output to a temporary file, then add the filename to the result queue + if output_rows: + temp_file = tempfile.NamedTemporaryFile(delete=False, mode='w', newline='\n') + with open(temp_file.name, 'w') as f: + f.writelines(output_rows) + result_queue.put(temp_file.name) def react(self): """ Transform reactants into products using the reaction rules. + Writes results incrementally and handles large files. """ t0 = time.time() header = ( 'OriginalCompoundID\tOriginalCompoundSmiles\tOriginalReactionRuleID\tNewCompoundID\t' 'NewCompoundSmiles\tNewReactionSmiles\tEC_Numbers\n' ) - with open(self._new_compounds_path, 'w') as f: + # Ensure header is written to the final output file + with open(self._new_compounds_path, 'w', newline='\n') as f: f.write(header) + params = list(itertools.product(self._compounds.smiles, self._reaction_rules.SMARTS)) - with multiprocessing.Pool(self._n_jobs) as pool: - pool.starmap(self._react_single, tqdm(params, total=len(params))) + # Create a multiprocessing Manager to hold the result queue + with multiprocessing.Manager() as manager: + result_queue = manager.Queue() + + # Start the multiprocessing pool + with multiprocessing.Pool(self._n_jobs) as pool: + pool.starmap(self._react_single, [(smiles, smarts, result_queue) for smiles, smarts in params]) + + # Once all processes are done, write the results from all temporary files + with open(self._new_compounds_path, 'a', newline='\n') as f: + while not result_queue.empty(): + temp_file = result_queue.get() + with open(temp_file, 'r') as temp_f: + f.write(temp_f.read()) + os.remove(temp_file) # Clean up the temporary file + self._new_compounds = f"New products saved to {self._new_compounds_path}" t1 = time.time() logging.info(f"Time elapsed: {t1 - t0} seconds")