Skip to content

Commit b9936e0

Browse files
author
dmoi
committed
fixing ft2 treebuilder ancestral reconstructions
1 parent 0a25e1c commit b9936e0

1 file changed

Lines changed: 48 additions & 27 deletions

File tree

foldtree2/ft2treebuilder.py

Lines changed: 48 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def __init__ ( self , model , mafftmat = None , submat = None , raxml_path= Non
3030
self.rev_replace_dict = { v:k for k,v in self.replace_dict.items() }
3131
self.replace_dict_ord = { ord(k):ord(v) for k,v in self.replace_dict.items() }
3232
self.raxml_path = raxml_path
33+
self.raxmlng_path = raxml_path
3334
if charmaps is None:
3435
self.rev_replace_dict_ord = { ord(v):ord(k) for k,v in self.replace_dict.items() }
3536
self.raxml_path = raxml_path
@@ -45,16 +46,31 @@ def __init__ ( self , model , mafftmat = None , submat = None , raxml_path= Non
4546
self.revmap = { i:c for i,c in enumerate(self.alphabet)}
4647
else:
4748
print('loading charmaps from', charmaps)
49+
50+
'''
51+
save format is {
52+
'pair_counts': pair_counts,
53+
'char_set': char_set,
54+
'char_position_map': char_position_map,
55+
'raxml_charset': raxml_charset,
56+
'raxml_char_position_map': raxml_char_position_map,
57+
'background_freq': background_freq,
58+
'convergence_history': monitor.history
59+
60+
}'''
61+
4862
with open(charmaps, 'rb') as f:
49-
pair_counts, char_set, char_position_map , raxml_charset, raxml_char_position_map = pickle.load(f)
50-
self.raxml_characters = raxml_charset
51-
self.alphabet = char_set
63+
data = pickle.load(f)
64+
self.raxml_characters = data['raxml_charset']
65+
self.alphabet = data['char_set']
5266
self.nchars = len(self.alphabet)
53-
self.map = char_position_map
54-
self.revmap = { v:k for k,v in char_position_map.items() }
55-
self.raxml_indices = raxml_char_position_map
56-
self.rev_raxml_indices = { v:k for k,v in raxml_char_position_map.items() }
57-
self.raxmlchars = raxml_charset
67+
self.map = data['char_position_map']
68+
self.revmap = { v:k for k,v in data['char_position_map'].items() }
69+
self.raxml_indices = data['raxml_char_position_map']
70+
self.rev_raxml_indices = { v:k for k,v in data['raxml_char_position_map'].items() }
71+
self.revmap_raxml = { v:k for k,v in data['raxml_char_position_map'].items() }
72+
self.raxmlchars = data['raxml_charset']
73+
5874
self.ordset = set([ ord(c) for c in self.alphabet ])
5975
#load pickled model
6076
self.model = model
@@ -184,6 +200,9 @@ def encode_structblob(self , blob = None , outfile = None ):
184200
structs = glob.glob(blob + '*.pdb')
185201
else:
186202
structs = glob.glob(blob)
203+
204+
#you need at least 4 structs for a tree
205+
assert len(structs) >= 4, f'Need at least 4 structures to build a tree, found {len(structs)}'
187206
if outfile == None:
188207
outfolder = '/'.join( blob.split('/')[:-1] )
189208
outfile = outfolder + 'encoded.fasta'
@@ -343,19 +362,21 @@ def run_raxml_ng(self, fasta_file, matrix_file, nsymbols, output_prefix , iterat
343362

344363
def run_raxml_ng_ancestral_struct(self, fasta_file, tree_file, matrix_file, nsymbols, output_prefix):
345364
model = 'MULTI'+str(nsymbols)+'_GTR{'+matrix_file+'}+I+G'
346-
if raxmlng_path == None:
347-
raxmlng_path = 'raxml-ng'
348-
raxml_cmd = raxmlng_path + ' --redo --ancestral --msa '+fasta_file+' --tree '+tree_file+' --model '+model+' --prefix '+output_prefix + ' --force perf_threads'
365+
if self.raxmlng_path == None:
366+
self.raxmlng_path = 'raxml-ng'
367+
raxml_cmd = self.raxmlng_path + ' --redo --ancestral --msa '+fasta_file+' --tree '+tree_file+' --model '+model+' --prefix '+output_prefix + ' --force perf_threads'
349368
print(raxml_cmd)
350369
subprocess.run(raxml_cmd, shell=True)
351-
return None
370+
return fasta_file.replace('raxml_aln.fasta' , 'raxml.ancestralStates')
352371

353-
def madroot( treefile , madroot_path = 'mad' ):
372+
def madroot( self, treefile , madroot_path = 'mad' ):
354373
mad_cmd = f'{madroot_path} {treefile} '
355374
subprocess.run(mad_cmd, shell=True)
356375
return treefile+'.rooted'
357376

358-
def ancestral2fasta( ancestral_file , outfasta ):
377+
def ancestral2fasta(self, ancestral_file , outfasta = None ):
378+
if outfasta is None:
379+
outfasta = ancestral_file + '.fasta'
359380
with open( outfasta , 'w') as g:
360381
with open( ancestral_file , 'r') as f:
361382
for l in f:
@@ -365,7 +386,7 @@ def ancestral2fasta( ancestral_file , outfasta ):
365386
g.write('>' + identifier + '\n' + seq + '\n')
366387
return outfasta
367388

368-
def ancestralfasta2df( outfasta ):
389+
def ancestralfasta2df(self, outfasta ):
369390
aln_data = {}
370391
with open(outfasta, 'r') as f:
371392
for line in f:
@@ -376,7 +397,7 @@ def ancestralfasta2df( outfasta ):
376397
aln_data[ID] += line.strip()
377398
ancestral_df = pd.DataFrame( aln_data.items() , columns=['protid', 'seq'] )
378399
#use rev map to convert back to ord
379-
ancestral_df['ord'] = ancestral_df.seq.map( lambda x: [ revmap_raxml[c] if c in revmap_raxml else '-' for c in x ] )
400+
ancestral_df['ord'] = ancestral_df.seq.map( lambda x: [ self.revmap_raxml[c] if c in self.revmap_raxml else '-' for c in x ] )
380401
return ancestral_df
381402

382403
def decoder_reconstruction( self, ords , verbose = False):
@@ -465,21 +486,21 @@ def structs2tree(self, structs , outdir = None , ancestral = False , raxml_itera
465486
if ancestral == True:
466487
#not tested yet
467488
ancestral_file = self.run_raxml_ng_ancestral_struct( alnfasta , treefile , self.submat , self.nchars , alnfasta.replace('.raxml_aln.fasta' , '') )
468-
ancestral_fasta = self.ancestral2fasta( ancestral_file , outfasta )
469-
ancestral_df = self.ancestralfasta2df( outfasta )
489+
ancestral_fasta = self.ancestral2fasta( ancestral_file )
490+
ancestral_df = self.ancestralfasta2df( ancestral_fasta )
470491
#decode the ancestral sequence
471492
ords = ancestral_df.ord.values
472-
for l in ords.shape[0]:
473-
res = decoder_reconstruction( ords[l] , verbose = verbose)
493+
for l in tqdm.tqdm(range(ords.shape[0]), desc='decoding ancestral sequences'):
494+
res = self.decoder_reconstruction( ords[l] , verbose = verbose)
474495
for key,item in res.items():
475496
ancestral_df.loc[l , key] = item
476497
#write the ancestral dataframe to a file
477-
ancestral_df.to_csv( outfasta.replace('.fasta' , '.csv') )
498+
ancestral_df.to_csv( ancestral_fasta.replace('.aastr.fasta' , '.csv') )
478499
#write out aastr to a fasta
479-
with open( outfasta.replace('.fasta' , '.aastr.fasta') , 'w') as f:
500+
with open( ancestral_fasta , 'w') as f:
480501
for i in ancestral_df.index:
481502
f.write('>' + i + '\n' + ancestral_df.loc[i].aastr + '\n')
482-
ancestral_fasta = outfasta.replace('.fasta' , '.aastr.fasta')
503+
ancestral_fasta = ancestral_fasta
483504
else:
484505
ancestral_fasta = None
485506
#return in dictionary form
@@ -591,7 +612,7 @@ def main():
591612
if len(sys.argv) == 1 or ('--help' in sys.argv) or ('-h' in sys.argv):
592613
print('No arguments provided. Use -h or --help for help.')
593614
print('Example command:')
594-
print(' python ft2treebuilder.py --model my_model --modeldir ./models --mafftmat ./models/my_model_mafftmat.mtx --submat ./models/my_model_submat.txt --charmaps ./models/my_model_pair_counts.pkl --structures "/path/to/structures/*.pdb" --outdir ./results --aapropcsv ./foldtree2/config/aaindex1.csv --ncores 8 --raxml_iterations 20 --raxmlpath raxml-ng --ancestral')
615+
print(' python ft2treebuilder.py --model ./models/my_model --structures "/path/to/structures/*.pdb" --outdir ./results --aapropcsv ./foldtree2/config/aaindex1.csv --ncores 8 --raxml_iterations 20 --ancestral')
595616
parser.print_help()
596617
sys.exit(0)
597618

@@ -637,11 +658,11 @@ def main():
637658
args.output_prefix += '_'
638659

639660
if args.mafftmat is None:
640-
args.mafftmat = os.path.join(modeldir, encoder_path.replace('.pt', '_mafftmat.mtx'))
661+
args.mafftmat = encoder_path.replace('.pt', '_mafftmat.mtx')
641662
if args.submat is None:
642-
args.submat = os.path.join(modeldir, encoder_path.replace('.pt', '_submat.txt'))
663+
args.submat = encoder_path.replace('.pt', '_submat.txt')
643664
if args.charmaps is None:
644-
args.charmaps = os.path.join(modeldir, encoder_path.replace('.pt', '_pair_counts.pkl'))
665+
args.charmaps = encoder_path.replace('.pt', '_pair_counts.pkl')
645666

646667

647668
# Create an instance of treebuilder

0 commit comments

Comments
 (0)