-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnwAlign.py
More file actions
139 lines (114 loc) · 5.11 KB
/
nwAlign.py
File metadata and controls
139 lines (114 loc) · 5.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
import sys
seq1_fasta = sys.argv[1] #Storing the fasta file1 (inputted as argument) in seq1_fasta
seq2_fasta = sys.argv[2] #Storing the fasta file2 (inputted as argument) in seq2_fasta
seq1 = "" #seq1 string is going to contain the sequence 1
seq2 = "" #seq2 string is going to contain the sequence 2
#Storing the sequence 1 in seq1:
with open(seq1_fasta) as seq1_fh:
for line in seq1_fh.readlines():
if line.startswith(">"):
continue
else:
seq1 += line
seq1 = seq1.rstrip("\n") #Stripping the trailing new line character
#Storing the sequence 2 in seq2:
with open(seq2_fasta) as seq2_fh:
for line in seq2_fh.readlines():
if line.startswith(">"):
continue
else:
seq2 += line
seq2 = seq2.rstrip("\n") #Stripping the trailing new line character
m = len(seq1) #Seq1 will be the vertical sequence to the left
n = len(seq2) #Seq2 will be the horizontal sequence on top
init_mat = [] #Initialised matrix
#Scoring system for match, mismatch and gap:
match = 1
mismatch = -1
gap = -1
#Initialising the matrix to 0 (Part1):
for i in range(m+1): #Adding +1 as one extra column needed to hold the initialised values
temp = []
for j in range(n+1): #Adding +1 as one extra column needed to hold the initialised values
temp.append(0)
init_mat.append(temp) #init_mat is a matrix of len(seq1)+1 rows and len(seq2)+1 columns containing 0's
for j in range(n+1):
init_mat[0][j] = gap*j #Multiplying 0,1,2... with the gap in order to initialise the matrix 1st row
for i in range(m+1):
init_mat[i][0] = gap*i #Multiplying 0,1,2... with the gap penatly in order to initilaise the matrix 1st column
#Matrix filling (Part2):
for i in range(1,m+1):
for j in range(1, n+1):
if seq1[i-1] == seq2[j-1]:
init_mat[i][j] = max(init_mat[i][j-1]+gap, init_mat[i-1][j]+gap, init_mat[i-1][j-1]+match)
else:
init_mat[i][j] = max(init_mat[i][j-1]+gap, init_mat[i-1][j]+gap, init_mat[i-1][j-1]+mismatch)
'''
#Visualise as a matrix
for row in init_mat:
for element in row:
print(element, end="\t")
print("\n")
'''
#Backtracking (Part3):
seq1_align = "" #The algined sequence (seq1) is going to be appended to this string
seq2_align = "" #The aligned sequence (seq2) is going to be appended to this string
score = 0
i = m #i = m = len(seq1)
j = n #j = n = len(seq2)
while (i>0 or j>0):
#Checking if it is a match. If it is a match, then append and jump to the diagonal value directly:
if seq1[i-1] == seq2[j-1]:
seq1_align += seq1[i-1]
seq2_align += seq2[j-1]
i -= 1
j -= 1
#If the sequence don't match:
elif seq1[i-1] != seq2[j-1]:
temp_list = [init_mat[i-1][j-1], init_mat[i-1][j], init_mat[i][j-1]] #Creating a temp_list in order to find the maximum values from top, diagonal and left in order to backtrack
#If the maximum value is the 0th indexed position, i.e., the diagonal value:
if max(temp_list) == temp_list[0]:
seq1_align += seq1[i-1]
seq2_align += seq2[j-1]
i -= 1
j -= 1
#If the maximum value is the 1st indexed position, i.e., the top value:
elif max(temp_list) == temp_list[1]:
seq1_align += seq1[i-1]
seq2_align += "-"
i -= 1
#If the maximum value is the 2nd indexed position, i.e., the left vlaue:
elif max(temp_list) == temp_list[-1]:
seq1_align += "-"
seq2_align += seq2[j-1]
j-=1
#If there is an error (somehow? just in case?), initialising the values of i and j in order for it to not turn into an infinite loop
else:
print("Error. Exit.")
i=0
j=0
seq1_align = seq1_align[::-1] #Reverse the string seq1_align
seq2_align = seq2_align[::-1] #Reverse the string seq2_align
#Storing the match, mismatch and gap symbols in match_string:
match_string = ""
for i in range(len(seq1_align)):
if seq1_align[i] == seq2_align[i]:
match_string += "|"
elif seq1_align[i] != seq2_align[i]:
if (seq1_align[i] == "-" or seq2_align[i] == "-"):
match_string += " "
else:
match_string += "*"
#Calculating the alignment score:
alignment_score = 0
for i in range(len(match_string)):
if match_string[i] == "|":
alignment_score += 1
elif (match_string[i] == "*" or match_string[i] == " "):
alignment_score += -1
#Printing out the final result:
print(seq1_align)
print(match_string)
print(seq2_align)
print("Alignment score:", alignment_score)