@@ -30,6 +30,7 @@ def __init__ ( self , model , mafftmat = None , submat = None , raxml_path= Non
3030 self .rev_replace_dict = { v :k for k ,v in self .replace_dict .items () }
3131 self .replace_dict_ord = { ord (k ):ord (v ) for k ,v in self .replace_dict .items () }
3232 self .raxml_path = raxml_path
33+ self .raxmlng_path = raxml_path
3334 if charmaps is None :
3435 self .rev_replace_dict_ord = { ord (v ):ord (k ) for k ,v in self .replace_dict .items () }
3536 self .raxml_path = raxml_path
@@ -45,16 +46,31 @@ def __init__ ( self , model , mafftmat = None , submat = None , raxml_path= Non
4546 self .revmap = { i :c for i ,c in enumerate (self .alphabet )}
4647 else :
4748 print ('loading charmaps from' , charmaps )
49+
50+ '''
51+ save format is {
52+ 'pair_counts': pair_counts,
53+ 'char_set': char_set,
54+ 'char_position_map': char_position_map,
55+ 'raxml_charset': raxml_charset,
56+ 'raxml_char_position_map': raxml_char_position_map,
57+ 'background_freq': background_freq,
58+ 'convergence_history': monitor.history
59+
60+ }'''
61+
4862 with open (charmaps , 'rb' ) as f :
49- pair_counts , char_set , char_position_map , raxml_charset , raxml_char_position_map = pickle .load (f )
50- self .raxml_characters = raxml_charset
51- self .alphabet = char_set
63+ data = pickle .load (f )
64+ self .raxml_characters = data [ ' raxml_charset' ]
65+ self .alphabet = data [ ' char_set' ]
5266 self .nchars = len (self .alphabet )
53- self .map = char_position_map
54- self .revmap = { v :k for k ,v in char_position_map .items () }
55- self .raxml_indices = raxml_char_position_map
56- self .rev_raxml_indices = { v :k for k ,v in raxml_char_position_map .items () }
57- self .raxmlchars = raxml_charset
67+ self .map = data ['char_position_map' ]
68+ self .revmap = { v :k for k ,v in data ['char_position_map' ].items () }
69+ self .raxml_indices = data ['raxml_char_position_map' ]
70+ self .rev_raxml_indices = { v :k for k ,v in data ['raxml_char_position_map' ].items () }
71+ self .revmap_raxml = { v :k for k ,v in data ['raxml_char_position_map' ].items () }
72+ self .raxmlchars = data ['raxml_charset' ]
73+
5874 self .ordset = set ([ ord (c ) for c in self .alphabet ])
5975 #load pickled model
6076 self .model = model
@@ -184,6 +200,9 @@ def encode_structblob(self , blob = None , outfile = None ):
184200 structs = glob .glob (blob + '*.pdb' )
185201 else :
186202 structs = glob .glob (blob )
203+
204+ #you need at least 4 structs for a tree
205+ assert len (structs ) >= 4 , f'Need at least 4 structures to build a tree, found { len (structs )} '
187206 if outfile == None :
188207 outfolder = '/' .join ( blob .split ('/' )[:- 1 ] )
189208 outfile = outfolder + 'encoded.fasta'
@@ -343,19 +362,21 @@ def run_raxml_ng(self, fasta_file, matrix_file, nsymbols, output_prefix , iterat
343362
344363 def run_raxml_ng_ancestral_struct (self , fasta_file , tree_file , matrix_file , nsymbols , output_prefix ):
345364 model = 'MULTI' + str (nsymbols )+ '_GTR{' + matrix_file + '}+I+G'
346- if raxmlng_path == None :
347- raxmlng_path = 'raxml-ng'
348- raxml_cmd = raxmlng_path + ' --redo --ancestral --msa ' + fasta_file + ' --tree ' + tree_file + ' --model ' + model + ' --prefix ' + output_prefix + ' --force perf_threads'
365+ if self . raxmlng_path == None :
366+ self . raxmlng_path = 'raxml-ng'
367+ raxml_cmd = self . raxmlng_path + ' --redo --ancestral --msa ' + fasta_file + ' --tree ' + tree_file + ' --model ' + model + ' --prefix ' + output_prefix + ' --force perf_threads'
349368 print (raxml_cmd )
350369 subprocess .run (raxml_cmd , shell = True )
351- return None
370+ return fasta_file . replace ( 'raxml_aln.fasta' , 'raxml.ancestralStates' )
352371
353- def madroot ( treefile , madroot_path = 'mad' ):
372+ def madroot ( self , treefile , madroot_path = 'mad' ):
354373 mad_cmd = f'{ madroot_path } { treefile } '
355374 subprocess .run (mad_cmd , shell = True )
356375 return treefile + '.rooted'
357376
358- def ancestral2fasta ( ancestral_file , outfasta ):
377+ def ancestral2fasta (self , ancestral_file , outfasta = None ):
378+ if outfasta is None :
379+ outfasta = ancestral_file + '.fasta'
359380 with open ( outfasta , 'w' ) as g :
360381 with open ( ancestral_file , 'r' ) as f :
361382 for l in f :
@@ -365,7 +386,7 @@ def ancestral2fasta( ancestral_file , outfasta ):
365386 g .write ('>' + identifier + '\n ' + seq + '\n ' )
366387 return outfasta
367388
368- def ancestralfasta2df ( outfasta ):
389+ def ancestralfasta2df (self , outfasta ):
369390 aln_data = {}
370391 with open (outfasta , 'r' ) as f :
371392 for line in f :
@@ -376,7 +397,7 @@ def ancestralfasta2df( outfasta ):
376397 aln_data [ID ] += line .strip ()
377398 ancestral_df = pd .DataFrame ( aln_data .items () , columns = ['protid' , 'seq' ] )
378399 #use rev map to convert back to ord
379- ancestral_df ['ord' ] = ancestral_df .seq .map ( lambda x : [ revmap_raxml [c ] if c in revmap_raxml else '-' for c in x ] )
400+ ancestral_df ['ord' ] = ancestral_df .seq .map ( lambda x : [ self . revmap_raxml [c ] if c in self . revmap_raxml else '-' for c in x ] )
380401 return ancestral_df
381402
382403 def decoder_reconstruction ( self , ords , verbose = False ):
@@ -465,21 +486,21 @@ def structs2tree(self, structs , outdir = None , ancestral = False , raxml_itera
465486 if ancestral == True :
466487 #not tested yet
467488 ancestral_file = self .run_raxml_ng_ancestral_struct ( alnfasta , treefile , self .submat , self .nchars , alnfasta .replace ('.raxml_aln.fasta' , '' ) )
468- ancestral_fasta = self .ancestral2fasta ( ancestral_file , outfasta )
469- ancestral_df = self .ancestralfasta2df ( outfasta )
489+ ancestral_fasta = self .ancestral2fasta ( ancestral_file )
490+ ancestral_df = self .ancestralfasta2df ( ancestral_fasta )
470491 #decode the ancestral sequence
471492 ords = ancestral_df .ord .values
472- for l in ords .shape [0 ]:
473- res = decoder_reconstruction ( ords [l ] , verbose = verbose )
493+ for l in tqdm . tqdm ( range ( ords .shape [0 ]), desc = 'decoding ancestral sequences' ) :
494+ res = self . decoder_reconstruction ( ords [l ] , verbose = verbose )
474495 for key ,item in res .items ():
475496 ancestral_df .loc [l , key ] = item
476497 #write the ancestral dataframe to a file
477- ancestral_df .to_csv ( outfasta .replace ('.fasta' , '.csv' ) )
498+ ancestral_df .to_csv ( ancestral_fasta .replace ('.aastr .fasta' , '.csv' ) )
478499 #write out aastr to a fasta
479- with open ( outfasta . replace ( '.fasta' , '.aastr.fasta' ) , 'w' ) as f :
500+ with open ( ancestral_fasta , 'w' ) as f :
480501 for i in ancestral_df .index :
481502 f .write ('>' + i + '\n ' + ancestral_df .loc [i ].aastr + '\n ' )
482- ancestral_fasta = outfasta . replace ( '.fasta' , '.aastr.fasta' )
503+ ancestral_fasta = ancestral_fasta
483504 else :
484505 ancestral_fasta = None
485506 #return in dictionary form
@@ -591,7 +612,7 @@ def main():
591612 if len (sys .argv ) == 1 or ('--help' in sys .argv ) or ('-h' in sys .argv ):
592613 print ('No arguments provided. Use -h or --help for help.' )
593614 print ('Example command:' )
594- print (' python ft2treebuilder.py --model my_model --modeldir ./models --mafftmat ./models/my_model_mafftmat.mtx --submat ./models/my_model_submat.txt --charmaps ./models/my_model_pair_counts.pkl -- structures "/path/to/structures/*.pdb" --outdir ./results --aapropcsv ./foldtree2/config/aaindex1.csv --ncores 8 --raxml_iterations 20 --raxmlpath raxml-ng --ancestral' )
615+ print (' python ft2treebuilder.py --model ./models/my_model --structures "/path/to/structures/*.pdb" --outdir ./results --aapropcsv ./foldtree2/config/aaindex1.csv --ncores 8 --raxml_iterations 20 --ancestral' )
595616 parser .print_help ()
596617 sys .exit (0 )
597618
@@ -637,11 +658,11 @@ def main():
637658 args .output_prefix += '_'
638659
639660 if args .mafftmat is None :
640- args .mafftmat = os . path . join ( modeldir , encoder_path .replace ('.pt' , '_mafftmat.mtx' ) )
661+ args .mafftmat = encoder_path .replace ('.pt' , '_mafftmat.mtx' )
641662 if args .submat is None :
642- args .submat = os . path . join ( modeldir , encoder_path .replace ('.pt' , '_submat.txt' ) )
663+ args .submat = encoder_path .replace ('.pt' , '_submat.txt' )
643664 if args .charmaps is None :
644- args .charmaps = os . path . join ( modeldir , encoder_path .replace ('.pt' , '_pair_counts.pkl' ) )
665+ args .charmaps = encoder_path .replace ('.pt' , '_pair_counts.pkl' )
645666
646667
647668 # Create an instance of treebuilder
0 commit comments