-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate-csv.py
More file actions
executable file
·247 lines (210 loc) · 9.53 KB
/
generate-csv.py
File metadata and controls
executable file
·247 lines (210 loc) · 9.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#!/usr/bin/env python
#
# generate-csv.py - generate a CSV file summarizing the results of
# generate-positive-controls.py and generate-negative-controls.py
# Copyright 2010 Jeffrey Finkelstein
#
# This file is part of smurf.
#
# smurf is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 2 of the License, or (at your option) any later
# version.
#
# smurf is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# smurf. If not, see <http://www.gnu.org/licenses/>.
import os
import os.path
import sys
from gargamel.argumentparsers import BaseArgumentParser
from gargamel.argumentparsers import AlignmentArgumentParser
from gargamel.constants import HMMER
from gargamel.constants import MRFY
from gargamel.constants import SMURF
from gargamel.constants import PROFILE_SMURF
from gargamel.constants import SMURF_LITE
from gargamel.constants import POSITIVE_DIRNAME
from gargamel.constants import NEGATIVE_DIRNAME
from gargamel.logger import logger
from gargamel.hmmer import result_from_file
# a brief description of the purpose of this program
PROGRAM_DESCRIPTION = ('Generates a CSV file summarizing the results of '
'generate-positive-controls.py and '
'generate-negative-controls.py')
# exit status code representing no output directory found
STATUS_NO_DIR = 1 << 0
# the name of the file containing the CSV file
CSV_FILENAME = 'summary.csv'
# the suffix for file-specific README files
README_SUFFIX = '.README'
CSV_README = \
"""This file contains a summary of the alignments generated by smurf which are
the positive and negative controls for leave-one-out cross-validation. The
fields in each record of the CSV file are
<pdbid>:<chainid>:<family-sunid>:<rawscore>:<positive_or_negative>
where
<pdbid>, a string of length exactly four, is the PDB ID of a protein,
<chainid>, a single character, is the identifying character of the protein
chain,
<family-sunid>, an unsigned integer, is the SCOP SUNID of the family which
contains this chain,
<rawscore>, a floating point real number, is the raw score as generated by the
smurf alignment of this protein, and
<positive_or_negative>, exactly one of the strings "positive" or "negative", is
a string representing whether this was a positive control or a negative
control in the alignment test."""
#<superfamily-sunid>, an unsigned integer, is the SCOP SUNID of the superfamily
# which contains this chain,
def write_csv_readme(filename):
"""Writes README information for the CSV file created by this script."""
with open(filename, 'w') as f:
f.write(CSV_README + '\n')
def parse_mrfy_output(filename):
logger.debug('parsing from file: ' + filename)
with open(filename, 'r') as f:
pdb_str = f.readline()[1:7]
# logger.debug('read: ' + pdb_str)
if len(pdb_str) < 6:
return None, None, None
pdbid, chainid = pdb_str.split('_')
read_value = f.readline()
if read_value[0:8] == 'Sequence':
return pdbid, chainid, None
strVal = read_value[11:]
if strVal[0:7] == 'Infinity':
return pdbid, chainid, 1000000
rawscore = float(strVal)
# logger.debug('read: ' + str(rawscore))
return pdbid, chainid, rawscore
def parse_smurf_output(filename):
logger.debug('parsing from file: ' + filename)
with open(filename, 'r') as f:
pdb_str = f.readline()[1:7]
# logger.debug('read: ' + pdb_str)
if len(pdb_str) < 6:
return None, None, None
pdbid, chainid = pdb_str.split('_')
read_value = f.readline()
if read_value[0:8] == 'Sequence':
return pdbid, chainid, None
rawscore = float(read_value[11:])
# logger.debug('read: ' + str(rawscore))
return pdbid, chainid, rawscore
def parse_hmmer_output(filename):
logger.debug('parsing from file: ' + filename)
result = result_from_file(filename)
if result == None:
return 'Unknown', 'Unknown', None
pdbid, chainid = result.pdbid, result.chain
# logger.debug('result: ' + str(result))
rawscore = float(result.full_seq_score.score)
return pdbid, chainid, rawscore
# create an argument parser and parse the command-line arguments
argparser = AlignmentArgumentParser(PROGRAM_DESCRIPTION)
parsed_args = argparser.parse_args()
# the directory containing the results from the smurf/hmmer alignment tests
output_dir = parsed_args.outputdir
aligner = parsed_args.aligner
# logger.debug('Aligner: ' + aligner)
# determine the amount of logging info to output
if parsed_args.verbose:
from logging import DEBUG
from gargamel.logger import console_handler
console_handler.setLevel(DEBUG)
# a summary of the runtime configuration of this program
config = {'output_dir' : output_dir}
logger.debug('Program configuration: ' + str(config))
# check if the output dir exists
if not os.path.isdir(output_dir):
logger.critical('Directory ' + output_dir + ' does not exist.')
sys.exit(STATUS_NO_DIR)
# find all subdirectories of the output directory
logger.debug('Determining which family directories exist...')
logger.debug(' output_dir contains: ' + str(os.listdir(output_dir)))
subdirectories = filter(lambda x: os.path.isdir(os.path.join(output_dir, x)),
os.listdir(output_dir))
logger.debug(' subdirectories: ' + str(subdirectories))
# get all subdirectories which are only digits
# TODO this is not the best way to do this
families = filter(lambda x: all(filter(lambda y: y.isdigit(), x)),
subdirectories)
logger.debug(' families: ' + str(families))
# touch the CSV file so that it is empty
csv_filename = os.path.join(output_dir, aligner + '_' + CSV_FILENAME)
logger.debug('Clearing CSV file...')
with open(csv_filename, 'w') as f: pass
# write some README information for the CSV file
write_csv_readme(csv_filename + README_SUFFIX)
# iterator over each family
for family in families:
# generate the names of the directories where (we assume) the results live
family_output_dir = os.path.join(output_dir, family)
positive_dir = os.path.join(family_output_dir, aligner, POSITIVE_DIRNAME)
negative_dir = os.path.join(family_output_dir, aligner, NEGATIVE_DIRNAME)
# require both of the directories exist
if not os.path.isdir(positive_dir):
logger.critical('Expected directory at ' + positive_dir)
logger.critical('Not generating output for this family')
break
# if not os.path.isdir(positive_dir):
# logger.critical('Expected directory at ' + positive_dir)
# logger.critical('Not generating output for this family')
# break
skip_negative = False
if not os.path.isdir(negative_dir):
logger.debug('Skipping negative directory at ' + negative_dir)
skip_negative = True
# get all the resulting smurf alignment files, excluding the README files
positive_files = [os.path.join(positive_dir, f) for f in
os.listdir(positive_dir) if not
f.endswith(README_SUFFIX)]
if not skip_negative:
negative_files = [os.path.join(negative_dir, f) for f in
os.listdir(negative_dir) if not
f.endswith(README_SUFFIX)]
# open the CSV file for appending
logger.debug('Opening CSV file ' + csv_filename)
with open(csv_filename, 'a') as csv_file:
# iterate over each file in the positive directory
for f in positive_files:
if aligner == SMURF or aligner == PROFILE_SMURF or aligner == SMURF_LITE:
# get the PDB ID, chain ID, and raw score
pdbid, chainid, rawscore = parse_smurf_output(f)
if rawscore == None:
rawscore == -1000.0
elif aligner == MRFY:
pdbid, chainid, rawscore = parse_mrfy_output(f)
if rawscore == None:
rawscore == 1000000
else:
pdbid, chainid, rawscore = parse_hmmer_output(f)
if rawscore == None:
rawscore = -1000.0
# write to the csv file
if pdbid != None:
csv_file.write(','.join((pdbid, chainid, family, str(rawscore),
'positive')) + '\n')
if not skip_negative:
# iterate over each file in the negative directory
for f in negative_files:
if aligner == SMURF or aligner == PROFILE_SMURF or aligner == SMURF_LITE:
# get the PDB ID, chain ID, and raw score
pdbid, chainid, rawscore = parse_smurf_output(f)
if rawscore == None:
rawscore == -1000.0
elif aligner == MRFY:
pdbid, chainid, rawscore = parse_mrfy_output(f)
if rawscore == None:
rawscore == 1000000
else:
pdbid, chainid, rawscore = parse_hmmer_output(f)
if rawscore == None:
rawscore == -1000.0
# write to the csv file
if pdbid != None:
csv_file.write(','.join((pdbid, chainid, family, str(rawscore),
'negative')) + '\n')