HomologyTesting/generate-csv.py at master · ndaniels/HomologyTesting · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#!/usr/bin/env python
#
# generate-csv.py - generate a CSV file summarizing the results of
# generate-positive-controls.py and generate-negative-controls.py
# Copyright 2010 Jeffrey Finkelstein
#
# This file is part of smurf.
#
# smurf is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 2 of the License, or (at your option) any later
# version.
#
# smurf is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# smurf.  If not, see <http://www.gnu.org/licenses/>.

import os
import os.path
import sys

from gargamel.argumentparsers import BaseArgumentParser
from gargamel.argumentparsers import AlignmentArgumentParser
from gargamel.constants import HMMER
from gargamel.constants import MRFY
from gargamel.constants import SMURF
from gargamel.constants import PROFILE_SMURF
from gargamel.constants import SMURF_LITE
from gargamel.constants import POSITIVE_DIRNAME
from gargamel.constants import NEGATIVE_DIRNAME
from gargamel.logger import logger
from gargamel.hmmer import result_from_file

# a brief description of the purpose of this program
PROGRAM_DESCRIPTION = ('Generates a CSV file summarizing the results of '
                       'generate-positive-controls.py and '
                       'generate-negative-controls.py')

# exit status code representing no output directory found
STATUS_NO_DIR = 1 << 0

# the name of the file containing the CSV file
CSV_FILENAME = 'summary.csv'

# the suffix for file-specific README files
README_SUFFIX = '.README'

CSV_README = \
"""This file contains a summary of the alignments generated by smurf which are
the positive and negative controls for leave-one-out cross-validation. The
fields in each record of the CSV file are

<pdbid>:<chainid>:<family-sunid>:<rawscore>:<positive_or_negative>

where

<pdbid>, a string of length exactly four, is the PDB ID of a protein,
<chainid>, a single character, is the identifying character of the protein
  chain,
<family-sunid>, an unsigned integer, is the SCOP SUNID of the family which
  contains this chain,
<rawscore>, a floating point real number, is the raw score as generated by the
  smurf alignment of this protein, and
<positive_or_negative>, exactly one of the strings "positive" or "negative", is
  a string representing whether this was a positive control or a negative
  control in the alignment test."""

#<superfamily-sunid>, an unsigned integer, is the SCOP SUNID of the superfamily
#  which contains this chain,

def write_csv_readme(filename):
    """Writes README information for the CSV file created by this script."""
    with open(filename, 'w') as f:
        f.write(CSV_README + '\n')

def parse_mrfy_output(filename):
    logger.debug('parsing from file: ' + filename)
    with open(filename, 'r') as f:
        pdb_str = f.readline()[1:7]
        # logger.debug('read: ' + pdb_str)
        if len(pdb_str) < 6:
          return None, None, None
        pdbid, chainid = pdb_str.split('_')
        read_value = f.readline()
        if read_value[0:8] == 'Sequence':
          return pdbid, chainid, None
        strVal = read_value[11:]
        if strVal[0:7] == 'Infinity':
          return pdbid, chainid, 1000000
        rawscore = float(strVal)
        # logger.debug('read: ' + str(rawscore))
    return pdbid, chainid, rawscore


def parse_smurf_output(filename):
    logger.debug('parsing from file: ' + filename)
    with open(filename, 'r') as f:
        pdb_str = f.readline()[1:7]
        # logger.debug('read: ' + pdb_str)
        if len(pdb_str) < 6:
          return None, None, None
        pdbid, chainid = pdb_str.split('_')
        read_value = f.readline()
        if read_value[0:8] == 'Sequence':
          return pdbid, chainid, None
        rawscore = float(read_value[11:])
        # logger.debug('read: ' + str(rawscore))
    return pdbid, chainid, rawscore

def parse_hmmer_output(filename):
    logger.debug('parsing from file: ' + filename)
    result = result_from_file(filename)
    if result == None:
      return 'Unknown', 'Unknown', None
    pdbid, chainid = result.pdbid, result.chain
    # logger.debug('result: ' + str(result))
    rawscore = float(result.full_seq_score.score)
    return  pdbid, chainid, rawscore

# create an argument parser and parse the command-line arguments
argparser = AlignmentArgumentParser(PROGRAM_DESCRIPTION)
parsed_args = argparser.parse_args()

# the directory containing the results from the smurf/hmmer alignment tests
output_dir = parsed_args.outputdir
aligner = parsed_args.aligner
# logger.debug('Aligner: ' + aligner)
# determine the amount of logging info to output
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)

# a summary of the runtime configuration of this program
config = {'output_dir' : output_dir}
logger.debug('Program configuration: ' + str(config))

# check if the output dir exists
if not os.path.isdir(output_dir):
    logger.critical('Directory ' + output_dir + ' does not exist.')
    sys.exit(STATUS_NO_DIR)

# find all subdirectories of the output directory
logger.debug('Determining which family directories exist...')
logger.debug('  output_dir contains: ' + str(os.listdir(output_dir)))
subdirectories = filter(lambda x: os.path.isdir(os.path.join(output_dir, x)),
                        os.listdir(output_dir))
logger.debug('  subdirectories: ' + str(subdirectories))
# get all subdirectories which are only digits
# TODO this is not the best way to do this
families = filter(lambda x: all(filter(lambda y: y.isdigit(), x)),
                  subdirectories)
logger.debug('  families: '  + str(families))

# touch the CSV file so that it is empty
csv_filename = os.path.join(output_dir, aligner + '_' + CSV_FILENAME)
logger.debug('Clearing CSV file...')
with open(csv_filename, 'w') as f: pass

# write some README information for the CSV file
write_csv_readme(csv_filename + README_SUFFIX)

# iterator over each family
for family in families:

    # generate the names of the directories where (we assume) the results live
    family_output_dir = os.path.join(output_dir, family)
    positive_dir = os.path.join(family_output_dir, aligner, POSITIVE_DIRNAME)
    negative_dir = os.path.join(family_output_dir, aligner, NEGATIVE_DIRNAME)

    # require both of the directories exist
    if not os.path.isdir(positive_dir):
        logger.critical('Expected directory at ' + positive_dir)
        logger.critical('Not generating output for this family')
        break
    # if not os.path.isdir(positive_dir):
    #     logger.critical('Expected directory at ' + positive_dir)
    #     logger.critical('Not generating output for this family')
    #     break

    skip_negative = False
    if not os.path.isdir(negative_dir):
      logger.debug('Skipping negative directory at ' + negative_dir)
      skip_negative = True

    # get all the resulting smurf alignment files, excluding the README files
    positive_files = [os.path.join(positive_dir, f) for f in
                      os.listdir(positive_dir) if not
                      f.endswith(README_SUFFIX)]

    if not skip_negative:
      negative_files = [os.path.join(negative_dir, f) for f in
                        os.listdir(negative_dir) if not
                        f.endswith(README_SUFFIX)]

    # open the CSV file for appending
    logger.debug('Opening CSV file ' + csv_filename)
    with open(csv_filename, 'a') as csv_file:

        # iterate over each file in the positive directory
        for f in positive_files:
            if aligner == SMURF or aligner == PROFILE_SMURF or aligner == SMURF_LITE:
              # get the PDB ID, chain ID, and raw score
              pdbid, chainid, rawscore = parse_smurf_output(f)
              if rawscore == None:
                rawscore == -1000.0
            elif aligner == MRFY:
              pdbid, chainid, rawscore = parse_mrfy_output(f)
              if rawscore == None:
                rawscore == 1000000
            else:
              pdbid, chainid, rawscore = parse_hmmer_output(f)
              if rawscore == None:
                rawscore = -1000.0

            # write to the csv file
            if pdbid != None:
              csv_file.write(','.join((pdbid, chainid, family, str(rawscore),
                                       'positive')) + '\n')


        if not skip_negative:
          # iterate over each file in the negative directory
          for f in negative_files:
              if aligner == SMURF or aligner == PROFILE_SMURF or aligner == SMURF_LITE:
                # get the PDB ID, chain ID, and raw score
                pdbid, chainid, rawscore = parse_smurf_output(f)
                if rawscore == None:
                  rawscore == -1000.0
              elif aligner == MRFY:
                pdbid, chainid, rawscore = parse_mrfy_output(f)
                if rawscore == None:
                  rawscore == 1000000
              else:
                pdbid, chainid, rawscore = parse_hmmer_output(f)
                if rawscore == None:
                  rawscore == -1000.0


              # write to the csv file
              if pdbid != None:
                csv_file.write(','.join((pdbid, chainid, family, str(rawscore),
                                         'negative')) + '\n')