-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathadd-libs2.py
More file actions
executable file
·78 lines (69 loc) · 3.01 KB
/
add-libs2.py
File metadata and controls
executable file
·78 lines (69 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
#=========================================================================
# This is OPEN SOURCE SOFTWARE governed by the Gnu General Public
# License (GPL) version 3, as described at www.opensource.org.
# Copyright (C)2021 William H. Majoros <bmajoros@alumni.duke.edu>
#=========================================================================
from __future__ import (absolute_import, division, print_function,
unicode_literals, generators, nested_scopes, with_statement)
from builtins import (bytes, dict, int, list, object, range, str, ascii,
chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
# The above imports should allow this program to run in both Python 2 and
# Python 3. You might need to update your version of module "future".
import sys
import ProgramName
import gzip
from FastaReader import FastaReader
from FastaWriter import FastaWriter
from Rex import Rex
# Shared regex helper: getKeeperList() calls rex.find(pattern,line) and then
# reads the captured text via rex[1].
rex=Rex()
# Shared FASTA writer: writeRecords() emits every kept sequence through
# fastaWriter.addToFasta(defline,seq,handle).
fastaWriter=FastaWriter()
def getKeeperList(inFastaFile):
    """Collect the coordinates named on the deflines of a gzipped FASTA file.

    Each line of the file is tested against a defline pattern of the form
    ">ID /coord=NAME:BEGIN-END" (NAME is any run of non-space characters,
    BEGIN and END are runs of digits).  For every line that matches, the
    captured "NAME:BEGIN-END" text is recorded.

    Parameters:
        inFastaFile: path to a gzip-compressed FASTA file (opened in text
            mode).

    Returns:
        A set containing the distinct coordinate strings that were found.
    """
    # Raw string: "\S" and "\d" are not valid string escapes, so the non-raw
    # spelling triggers an invalid-escape warning on newer Python versions.
    # The pattern text itself is unchanged.
    deflinePattern=r">\S+ /coord=(\S+:\d+-\d+)"
    keep=set()
    with gzip.open(inFastaFile,"rt") as IN:
        for line in IN:
            # NOTE(review): rex[1] is presumably the first capture group of
            # the most recent successful find() -- confirm against Rex.
            if(rex.find(deflinePattern,line)):
                keep.add(rex[1])
    return keep
def process(filestem,inFile,inDir,outDir,dnaReps,rnaReps):
    """Rebuild the FASTA and counts files for one data split.

    The coordinates listed in <inDir>/<filestem>.fasta.gz select which
    records of inFile are kept.  Two gzipped files are written:
    <outDir>/<filestem>-counts.txt.gz, which starts with a
    "DNA=<dnaReps> RNA=<rnaReps>" header line, and
    <outDir>/<filestem>.fasta.gz.

    Parameters:
        filestem: base name of the split (the script uses "train",
            "validation" and "test").
        inFile: path to the gzipped table handed on to writeRecords().
        inDir: directory holding the existing <filestem>.fasta.gz.
        outDir: directory that receives the two output files.
        dnaReps: value printed after "DNA=" in the counts header.
        rnaReps: value printed after "RNA=" in the counts header.
    """
    inFastaFile=inDir+"/"+filestem+".fasta.gz"
    outFastaFile=outDir+"/"+filestem+".fasta.gz"
    outCountsFile=outDir+"/"+filestem+"-counts.txt.gz"
    keep=getKeeperList(inFastaFile)
    # Context managers guarantee both gzip streams are flushed and closed
    # even when writeRecords() raises part-way through.
    with gzip.open(outCountsFile,"wt") as COUNTS, \
         gzip.open(outFastaFile,"wt") as FASTA:
        print("DNA=",dnaReps," RNA=",rnaReps,sep="",file=COUNTS)
        writeRecords(inFile,keep,COUNTS,FASTA)
def writeRecords(inFile,keep,COUNTS,FASTA):
    """Copy the selected rows of a gzipped table to a counts file and a FASTA file.

    The first line of inFile is a whitespace-separated header that must
    contain the column names "end", "log2FC" and "sequence".  For every
    later row whose "<field0>:<field1>-<field2>" key is in `keep`:
      * the fields strictly between the "end" and "log2FC" columns, followed
        by all fields after the "sequence" column, are written tab-separated
        as one line of COUNTS;
      * the "sequence" field is written to FASTA under the defline
        ">N /coord=<key>", where N numbers the kept rows starting at 1.

    Parameters:
        inFile: path to the gzip-compressed input table.
        keep: collection of coordinate keys to retain.
        COUNTS: open text handle receiving the count lines.
        FASTA: open handle passed on to fastaWriter.addToFasta().
    """
    with gzip.open(inFile,"rt") as table:
        columns=table.readline().rstrip().split()
        # Column positions are resolved once from the header line.
        countsBegin=columns.index("end")+1
        countsEnd=columns.index("log2FC")
        seqCol=columns.index("sequence")
        recordId=0
        for row in table:
            cells=row.rstrip().split()
            coord=cells[0]+":"+cells[1]+"-"+cells[2]
            if coord in keep:
                recordId+=1
                # Count columns first, then whatever trails the sequence.
                values=cells[countsBegin:countsEnd]+cells[(seqCol+1):]
                print("\t".join(values),file=COUNTS)
                fastaWriter.addToFasta(">"+str(recordId)+" /coord="+coord,
                                       cells[seqCol],FASTA)
#=========================================================================
# main()
#=========================================================================
def main():
    """Parse the command line and rebuild the train/validation/test splits.

    Expects exactly five arguments: <#DNA-reps> <#RNA-reps>
    <all-data.txt.gz> <in-dir> <out-dir>.  Exits with a usage message when
    the argument count is wrong, and refuses to run when the input and
    output directories are the same string.
    """
    # sys.exit() is used rather than the exit() helper, which is injected by
    # the site module and is not guaranteed to exist in every interpreter.
    if(len(sys.argv)!=6):
        sys.exit(ProgramName.get()+" <#DNA-reps> <#RNA-reps> <all-data.txt.gz> <in-dir> <out-dir>\n")
    (dnaReps,rnaReps,inFile,inDir,outDir)=sys.argv[1:]
    # process() writes <out-dir>/<stem>.fasta.gz, the same file name it reads
    # from <in-dir>, so identical directories would overwrite the input.
    if(inDir==outDir): sys.exit("Input and output directories cannot be the same")
    for filestem in ("train","validation","test"):
        process(filestem,inFile,inDir,outDir,dnaReps,rnaReps)

if __name__=="__main__":
    main()