-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate-matt-alignments.py
More file actions
executable file
·274 lines (233 loc) · 11.3 KB
/
generate-matt-alignments.py
File metadata and controls
executable file
·274 lines (233 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/usr/bin/env python
#
# generate-matt-alignments.py - generates matt alignments of members of a
# level of the SCOP hierarchy, leaving out one lower lever each time
# Copyright 2010 Jeffrey Finkelstein
#
# This file is part of smurf.
#
# smurf is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 2 of the License, or (at your option) any later
# version.
#
# smurf is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# smurf. If not, see <http://www.gnu.org/licenses/>.
from gargamel.util import require_python_version
require_python_version(2, 7)
import os
import os.path
import subprocess
import sys
from gargamel.argumentparsers import MultipleAlignmentArgumentParser
from gargamel.argumentparsers import DEFAULT_REP_ARG
from gargamel.constants import MATT_PREFIX
from gargamel.logger import logger
from gargamel.nrpdb import NRPDB_FILENAME
from gargamel.nrpdb import RepresentativeFields
from gargamel.nrpdb import nrpdbs_from_file
from gargamel.scop import Keys
from gargamel.scop import SCOP_CLASSIFICATION_FILE
from gargamel.scop import hierarchy_sets_from_file
#from gargamel.whitelist import WHITELIST_FILENAME
#from gargamel.whitelist import whitelist_from_file
# a brief description of the purpose of this program, shown in --help output
PROGRAM_DESCRIPTION = ('Generates multiple alignments using matt by leaving '
                       'one lower hierarchy level out of a target hierarchy '
                       'level in turn.')
# the filename (within each per-sunid output directory) to which to write the
# list of PDB files to align
PDBLIST_FILENAME = 'pdblist'
# the name of the executable which runs matt (resolved via PATH)
MATT_EXECUTABLE = 'matt'
# the formats of the files which matt should generate as output
# (passed to matt's -f flag as a comma-separated list)
#OUTPUT_FORMATS = set(('fasta', 'pdb'))
OUTPUT_FORMATS = set(('ssi', 'pdb', 'fasta'))
# the text to write to a readme file for program's output directory;
# the {} placeholder is filled with the target hierarchy level via str.format
GENERAL_README = \
"""This directory contains one directory for each level of the hierarchy left
out during alignment of all chains in the hierarchy level with SCOP SUNID
{}."""
# the text to write to a readme file for each training output directory;
# the {} placeholder is filled with the target hierarchy level via str.format
ALIGNMENT_README = \
"""This directory contains the multiple alignment FASTA file and PDB file
generated by matt when run on all chains in SCOP hierarchy level with sunid {},
except for those in the SCOP hierarchy level whose sunid is the name of this
directory."""
def write_general_readme(filename, config):
    """Write the top-level README for the output directory.

    The README body is GENERAL_README with the target hierarchy level
    substituted in, followed by a dump of the program configuration.

    filename -- path of the README file to (over)write
    config   -- dict of program configuration values; must contain the key
                'target_level'
    """
    with open(filename, 'w') as f:
        f.write(GENERAL_README.format(config['target_level']) + '\n')
        f.write('\n')
        f.write('program config:\n')
        # .items() instead of the Python-2-only .iteritems(): identical
        # behavior on 2.7, and the function keeps working under Python 3
        for k, v in config.items():
            f.write(' ' + k + ' = ' + str(v) + '\n')
def write_alignment_readme(filename, training_cmd, training_pdbids, config):
    """Write the README for one leave-one-out alignment output directory.

    The README body is ALIGNMENT_README with the target hierarchy level
    substituted in, followed by the PDB IDs that were aligned, the exact
    matt command used, and a dump of the program configuration.

    filename        -- path of the README file to (over)write
    training_cmd    -- the matt command (list of strings) used for alignment
    training_pdbids -- iterable of the PDB IDs included in the alignment
    config          -- dict of program configuration values; must contain the
                       key 'target_level'
    """
    with open(filename, 'w') as f:
        f.write(ALIGNMENT_README.format(config['target_level']) + '\n')
        f.write('\n')
        f.write('PDBIDs used for alignment:\n')
        f.writelines((' ' + pdbid + '\n' for pdbid in training_pdbids))
        f.write('alignment command:\n')
        f.write(' ' + str(training_cmd) + '\n')
        f.write('program config:\n')
        # .items() instead of the Python-2-only .iteritems(): identical
        # behavior on 2.7, and the function keeps working under Python 3
        for k, v in config.items():
            f.write(' ' + k + ' = ' + str(v) + '\n')
def to_pdb_filename(pdb_dir, pdbid):
    """Generates a string containing a filename constructed from the specified
    PDB id.

    For example, if the pdbid is '11ba', then the returned string will be
    something like '<pdb_dir>/1b/pdb11ba.ent.gz'. We expect that all PDB files
    look like this example, and are all gzipped files.

    Raises ValueError if the PDB ID is not exactly four characters long.
    """
    pdbid = str(pdbid)
    if len(pdbid) != 4:
        # str() around len() is required here: the original concatenated
        # str + int, which raised TypeError and masked the intended ValueError
        raise ValueError('PDB ID must be a string of length 4, but was length '
                         + str(len(pdbid)))
    # PDB archives shard files into directories named by the middle two
    # characters of the ID
    return os.path.join(pdb_dir, pdbid[1:3], 'pdb' + pdbid + '.ent.gz')
# create a parser for command-line arguments, with a description of the purpose
# of this program
argparser = MultipleAlignmentArgumentParser(PROGRAM_DESCRIPTION)
# parsed_args = argparser.parse_args()
parsed_args = argparser.parse_args()
# get the values from the command-line arguments
output_dir = parsed_args.outputdir.rstrip('/') # remove trailing slash
target_level = parsed_args.target_level
target_sunid = parsed_args.target_sunid
pdb_dir = parsed_args.pdbdir.rstrip('/') # remove trailing slash
representative_field = parsed_args.repfield
# determine the amount of logging info to output; -v/--verbose lowers the
# console handler's threshold to DEBUG
if parsed_args.verbose:
    from logging import DEBUG
    from gargamel.logger import console_handler
    console_handler.setLevel(DEBUG)
# summary of the program configuration; this dict is echoed into every README
# written below
config = {'output_dir' : output_dir,
          'pdb_dir' : pdb_dir,
          'representative_field' : representative_field,
          'target_level' : target_level,
          'target_sunid' : target_sunid}
logger.debug('Program configuration: ' + str(config))
# get the set of non-redundant PDB chains from the NRPDB file, and use only
# these chains for training
# NOTE(review): judging by the lookup below (nrpdbs[query_pdbid] followed by
# .split(':')), this appears to map 'pdbid:chain' keys to a representative
# 'pdbid:chain' string — confirm against nrpdbs_from_file
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)
# get all the records from the SCOP classification file
logger.debug('Getting PDB IDs from SCOP Classification file...')
hierarchy = hierarchy_sets_from_file(SCOP_CLASSIFICATION_FILE, target_level,
                                     target_sunid)
logger.debug('hierarchy: ' + str(hierarchy))
# abort early if the requested level/sunid matched nothing in SCOP
if len(hierarchy) == 0:
    logger.critical('Nothing in hierarchy for target level: ' + str(target_level) + ' and target_sunid: ' + str(target_sunid))
    sys.exit(1)
# create the whitelist of PDB chains on which to train explicitly
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)
# create the output directory if it doesn't exist
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.debug('...it doesn\'t so we create it')
    os.mkdir(output_dir)
# write a README file to the output directory
logger.debug('Writing a top-level README...')
write_general_readme(os.path.join(output_dir, 'README'), config)
# generate the names of each of the PDB files on which to train:
# map each hierarchy-level sunid to the set of '<pdb path>:<chain>' strings
# for its non-redundant representative chains
logger.debug('Generating filenames...')
num_chains = 0
pdb_filenames = {}
used_pdbids = set()
# TODO should get more specific sequences (including chain and range) and pull those appropriately.
for hierarchy_level_sunid, pdbids in hierarchy.iteritems():
    # each pdbid_tuple holds at least (pdb id, chain id); joined with ':' to
    # form the key used by the nrpdbs lookup
    for pdbid_tuple in pdbids:
        query_pdbid = ':'.join(pdbid_tuple)
        query_base_pdbid = pdbid_tuple[0]
        query_pdb_chain = pdbid_tuple[1]
        logger.debug(' PDB ID: ' + str(query_pdbid))
        # only add this pdb file if it is in the whitelist and is not already
        # represented by a redundant chain
        #if pdbid in whitelist and pdbid in nrpdbs:
        if query_pdbid in nrpdbs:
            # substitute the representative chain for the queried chain
            pdbid = nrpdbs[query_pdbid]
            base_pdbid, pdb_chain = pdbid.split(':')
            if pdbid in used_pdbids:
                # logger.debug(pdbid + ' already used!')
                continue # if we've already added this chain, skip
            used_pdbids.add(pdbid)
            # generate the filename of this pdb file plus its chain
            filename = to_pdb_filename(pdb_dir, base_pdbid) + ':' + pdb_chain
            # add that filename to the set of pdb filenames (plus chains)
            if hierarchy_level_sunid in pdb_filenames:
                logger.debug(' adding to set for current hierarchy '
                             'level sunid')
                logger.debug(' which is ' + str(hierarchy_level_sunid))
                pdb_filenames[hierarchy_level_sunid].add(filename)
            else:
                logger.debug(' creating set for current hierarchy '
                             'level sunid')
                pdb_filenames[hierarchy_level_sunid] = set((filename, ))
# determine which pdb filenames are to be used for consensus alignment for each
# hierarchy level sunid to be left out: all_others[s] is the union of the
# filename sets of every hierarchy level sunid EXCEPT s (leave-one-out)
logger.debug('Determining pdb filenames for each hierarchy level sunid to be '
             'left out...')
all_others = {}
for hierarchy_level_sunid in pdb_filenames:
    logger.debug(' current hierarchy level sunid: '
                 + str(hierarchy_level_sunid))
    for hierarchy_level_sunid2, filenames2 in pdb_filenames.iteritems():
        logger.debug(' hierarchy level sunid 2: '
                     + str(hierarchy_level_sunid2))
        # skip the held-out level itself; accumulate everything else
        if hierarchy_level_sunid != hierarchy_level_sunid2:
            logger.debug(' does not equal current hierarchy_level_sunid')
            if hierarchy_level_sunid in all_others:
                logger.debug(' adding to set for current'
                             ' hierarchy_level_sunid')
                all_others[hierarchy_level_sunid].update(filenames2)
            else:
                logger.debug(' creating set for current hierarchy level '
                             'sunid')
                # copy so later updates never alias pdb_filenames' sets
                all_others[hierarchy_level_sunid] = set(filenames2)
# iterate over each hierarchy level sunid to query: for each held-out level,
# write the training list, run matt on it, and document the run
logger.debug('Iterating over query hierarchy level sunid '
             + str(all_others.keys()))
for hierarchy_level_sunid, filenames in all_others.iteritems():
    logger.debug('Current hierarchy level sunid: '
                 + str(hierarchy_level_sunid))
    logger.debug(' contains ' + str(len(filenames)) + ' filenames:')
    # for filename in filenames:
    # logger.debug(' ' + filename)
    # create an output directory if it doesn't exist yet, named after the
    # held-out sunid
    logger.debug('Checking whether hierarchy level sunid output directory'
                 ' exists...')
    level_output_dir = os.path.join(output_dir, str(hierarchy_level_sunid))
    if not os.path.isdir(level_output_dir):
        logger.debug(' it doesn\'t so we create it: ' + level_output_dir)
        os.mkdir(level_output_dir)
    # write the list of PDB files on which to train; only the first `limit`
    # names encountered are written (set iteration order decides which —
    # NOTE(review): that selection is arbitrary and, on some interpreters,
    # not reproducible between runs)
    logger.debug('Writing list of PDB files on which to train...')
    pdblist_filename = os.path.join(level_output_dir, PDBLIST_FILENAME)
    limit = 30
    count = 0
    with open(pdblist_filename, 'w') as pdblist_file:
        for name in filenames:
            count += 1
            logger.debug('considering name: ' + name)
            if count > limit:
                logger.debug('SKIPPING!')
                continue
            logger.debug('writing')
            pdblist_file.write(name + '\n')
    logger.debug(' written to file ' + pdblist_filename)
    # generate multiple alignments using matt, in the formats listed in
    # OUTPUT_FORMATS (-f) with output files prefixed by MATT_PREFIX (-o)
    # TODO sometimes matt segfaults; capture output from stdout and stderr
    logger.debug('Running matt alignment...')
    matt_cmd = [MATT_EXECUTABLE,
                '-o', os.path.join(level_output_dir, MATT_PREFIX),
                '-f', ','.join(OUTPUT_FORMATS), '-L', pdblist_filename]
    logger.debug(' ' + ' '.join(matt_cmd))
    # list-argument subprocess.call: no shell involved; return code is only
    # logged, a failed matt run does not abort the remaining levels
    return_code = subprocess.call(matt_cmd)
    logger.debug(' Return code: ' + str(return_code))
    # write a README file to the hierarchy level sunid directory
    logger.debug('Writing an alignment output README...')
    write_alignment_readme(os.path.join(level_output_dir, 'README'), matt_cmd,
                           filenames, config)