-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMetaMini_KG_pipeline.smk
More file actions
122 lines (110 loc) · 5.02 KB
/
MetaMini_KG_pipeline.smk
File metadata and controls
122 lines (110 loc) · 5.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Snake Make
## Import Config Files
configfile: "./config.yml"
## Import Python libraries
import os, sys
import subprocess
## Define Global Variables
ROOT_PATH = os.getcwd() # /scratch1/sjw6257; base working directory
DATA_PATH = os.path.join(ROOT_PATH, 'data')
MICRO_DATA_PATH = os.path.join(DATA_PATH, 'microbiomeKG_data') # microbiomeKG_data/Microbiome_KG_nodes_v0.2.1.tsv; directory where MicroKG is located
SCRIPT_PATH = os.path.join(ROOT_PATH, 'microKG_NS') # microKG_NS is where all the scripts are located
STAT_PATH = os.path.join(ROOT_PATH, 'syn_stat') # This is where stats are located (round 1 and 2)
DATA_SYN_DIR = os.path.join(ROOT_PATH, "data_syn") # This is where synonymized data are located
MICROBIOME_KG_VERSION = config['KG_VARIABLES']['MICROBIOME_KG_VERSION'] # modify config file to latest version
FILENAME = f"Microbiome_KG_nodes_{MICROBIOME_KG_VERSION}.tsv"
NODE_FILE_PATH = os.path.join(MICRO_DATA_PATH, FILENAME) # Full path to the node file
MICROBIOME_KG_DOWNLOAD_URL = "https://db.systemsbiology.net/gestalt/KG/"
NODE_SYN_DB = config['KG_VARIABLES']['NODE_SYN_DIR']
FAILED_BOTH_INPUT = os.path.join(STAT_PATH, "failed_both.csv")
FAILED_BOTH_OUTPUT = os.path.join(STAT_PATH, "processed_failed_both.tsv")
FAILED_BOTH_SCRIPT = os.path.join(SCRIPT_PATH, "failed_both_processing.py")
## Create Required Folders
for folder in [
os.path.join(ROOT_PATH, "data_syn"),
os.path.join(ROOT_PATH, "syn_stat"),
os.path.join(STAT_PATH, "syn_stat_failed_both")
]:
os.makedirs(folder, exist_ok=True)
# Download MicrobiomeKG Node data
if not os.path.exists(NODE_FILE_PATH):
print(f"{FILENAME} does not exist. Attempting to download...")
result = subprocess.run(
["wget", "-O", NODE_FILE_PATH, MICROBIOME_KG_DOWNLOAD_URL + FILENAME],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
if result.returncode == 0:
print(f"File {NODE_FILE_PATH} downloaded successfully!")
else:
print(f"Error downloading file {NODE_FILE_PATH}.")
else:
print(f"File {NODE_FILE_PATH} already exists, skipping download.")
## Build Rules
rule targets:
input:
os.path.join(ROOT_PATH, "data_syn", f"Microbiome_KG_nodes_syn_{MICROBIOME_KG_VERSION}.tsv"),
os.path.join(ROOT_PATH, "syn_stat", "failed_curies.csv"),
os.path.join(ROOT_PATH, "syn_stat", "failed_both.csv"),
os.path.join(ROOT_PATH, "syn_stat", "total_success.csv"),
os.path.join(STAT_PATH, "syn_stat_failed_both", "failed_curies2.csv"),
os.path.join(STAT_PATH, "syn_stat_failed_both", "failed_both2.csv"),
os.path.join(STAT_PATH, "syn_stat_failed_both", "total_success2.csv")
# Node synonymize MicroKG nodes (round 1)
rule run_nodesyn_microkg:
input:
file = NODE_FILE_PATH,
script = os.path.join(SCRIPT_PATH, 'node_syn_microkg.py'),
syn_db = NODE_SYN_DB
output:
os.path.join(ROOT_PATH, "data_syn", f"Microbiome_KG_nodes_syn_{MICROBIOME_KG_VERSION}.tsv"),
os.path.join(ROOT_PATH, "syn_stat", "failed_curies.csv"),
os.path.join(ROOT_PATH, "syn_stat", "failed_both.csv"),
os.path.join(ROOT_PATH, "syn_stat", "total_success.csv")
params:
data_syn_dir = directory(os.path.join(ROOT_PATH, "data_syn")),
syn_stat_dir = directory(os.path.join(ROOT_PATH, "syn_stat"))
run:
"""
python {input.script} --input {input.file} --syn_db {input.syn_db} --output {params.data_syn_dir} --syn_stat_dir {params.syn_stat_dir}
"""
rule stat_node_syn:
input:
file = NODE_FILE_PATH,
script = os.path.join(SCRIPT_PATH, 'MicroKG_Stat.py')
output:
os.path.join(ROOT_PATH, "syn_stat", "NS_performance_summary.txt"),
os.path.join(ROOT_PATH, "syn_stat", "wrong_name.csv"),
os.path.join(ROOT_PATH, "syn_stat", "no_name.csv"),
os.path.join(ROOT_PATH, "syn_stat", "curie_not_mapping.csv"),
os.path.join(ROOT_PATH, "syn_stat", "name_not_mapping.csv")
params:
node_syn_result = os.path.join(ROOT_PATH, "data_syn", f"Microbiome_KG_nodes_syn_{MICROBIOME_KG_VERSION}.tsv")
run:
"""
python {input.script} --input {input.file} --stat_dir syn_stat --result_file {params.node_syn_result}
"""
rule failed_both_processing:
input:
file = FAILED_BOTH_INPUT
script=FAILED_BOTH_SCRIPT
output:
result = FAILED_BOTH_OUTPUT
shell:
"""
python {input.script} --input {input.file} --output {output.result}
"""
rule node_syn_failed_both:
input:
file = FAILED_BOTH_OUTPUT
syn_db = NODE_SYN_DB
script = os.path.join(SCRIPT_PATH, 'NodeSyn_failed_both.py')
output:
os.path.join(ROOT_PATH, "data_syn", "failed_both_syn_v0.2.1.tsv"),
os.path.join(STAT_PATH, "syn_stat_failed_both", "failed_curies2.csv"),
os.path.join(STAT_PATH, "syn_stat_failed_both", "failed_both2.csv"),
os.path.join(STAT_PATH, "syn_stat_failed_both", "total_success2.csv")
shell:
"""
python {input.script} --input {input.file} --syn_db {input.syn_db} --output {output}
"""