1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """Functions for implementing the Needleman-Wunsch sequence alignment algorithm."""
24
25
26 from numpy import float32, int16, zeros
27
28
29 from lib.errors import RelaxError, RelaxFault
30
31
32
33 SCORE_MATCH = 1
34 SCORE_MISMATCH = -1
35 SCORE_GAP_PENALTY = 1
36 SCORES = zeros(3, float32)
37
38
39 TRACEBACK_DIAG = 0
40 TRACEBACK_TOP = 1
41 TRACEBACK_LEFT = 2
42
43
45 """Align two sequences using the Needleman-Wunsch algorithm using the EMBOSS logic for extensions.
46
47 This is implemented as described in the U{Wikipedia article on the Needleman-Wunsch algorithm <https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}. The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties.
48
49
50 @param sequence1: The first sequence.
51 @type sequence1: str
52 @param sequence2: The second sequence.
53 @type sequence2: str
54 @keyword sub_matrix: The substitution matrix to use to determine the penalties.
55 @type sub_matrix: numpy rank-2 int array
56 @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices.
57 @type sub_seq: str
58 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number.
59 @type gap_open_penalty: float
60 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number.
61 @type gap_extend_penalty: float
62 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence.
63 @type end_gap_open_penalty: float
64 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence.
65 @type end_gap_extend_penalty: float
66 @return: The alignment score, two alignment strings and the gap matrix.
67 @rtype: float, str, str, numpy rank-2 int array
68 """
69
70
71 M = len(sequence1)
72 N = len(sequence2)
73
74
75 for i in range(M):
76 if sequence1[i] not in sub_seq:
77 raise RelaxError("The residue '%s' from the first sequence cannot be found in the substitution matrix residues '%s'." % (sequence1[i], sub_seq))
78 for j in range(N):
79 if sequence2[j] not in sub_seq:
80 raise RelaxError("The residue '%s' from the second sequence cannot be found in the substitution matrix residues '%s'." % (sequence2[j], sub_seq))
81
82
83 matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty)
84
85
86 score = 0.0
87 i = M - 1
88 j = N - 1
89 alignment1 = ""
90 alignment2 = ""
91 while 1:
92
93 if j < 0 or traceback_matrix[i, j] == TRACEBACK_TOP:
94 alignment1 += sequence1[i]
95 alignment2 += '-'
96 i -= 1
97
98
99 elif i < 0 or traceback_matrix[i, j] == TRACEBACK_LEFT:
100 alignment1 += '-'
101 alignment2 += sequence2[j]
102 j -= 1
103
104
105 elif traceback_matrix[i, j] == TRACEBACK_DIAG:
106 alignment1 += sequence1[i]
107 alignment2 += sequence2[j]
108 i -= 1
109 j -= 1
110
111
112 else:
113 raise RelaxFault
114
115
116 if i < 0 and j < 0:
117 break
118
119
120 score += matrix[i, j]
121
122
123 align1 = alignment1[::-1]
124 align2 = alignment2[::-1]
125
126
127 gaps = zeros((2, len(align1)), int16)
128 for l in range(len(align1)):
129 if align1[l] == '-':
130 gaps[0, l] = 1
131 if align2[l] == '-':
132 gaps[1, l] = 1
133
134
135 return score, align1, align2, gaps
136
137
138 -def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0, epsilon=1e-7):
139 """Construct the Needleman-Wunsch matrix for the given two sequences using the EMBOSS logic.
140
141 The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties.
142
143
144 @param sequence1: The first sequence.
145 @type sequence1: str
146 @param sequence2: The second sequence.
147 @type sequence2: str
148 @keyword sub_matrix: The substitution matrix to use to determine the penalties.
149 @type sub_matrix: numpy rank-2 int16 array
150 @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices.
151 @type sub_seq: str
152 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number.
153 @type gap_open_penalty: float
154 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number.
155 @type gap_extend_penalty: float
156 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence.
157 @type end_gap_open_penalty: float
158 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence.
159 @type end_gap_extend_penalty: float
160 @keyword epsilon: A value close to zero to determine if two numbers are the same, within this precision.
161 @type epsilon: float
162 @return: The Needleman-Wunsch matrix and traceback matrix.
163 @rtype: numpy rank-2 float32 array, numpy rank-2 int16 array
164 """
165
166
167 M = len(sequence1)
168 N = len(sequence2)
169 matrix = zeros((M, N), float32)
170
171
172 gap_matrix_vert = zeros((M, N), float32)
173 gap_matrix_hori = zeros((M, N), float32)
174
175
176 traceback_matrix = zeros((M, N), int16)
177
178
179 matrix[0, 0] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[0])]
180 gap_matrix_vert[0, 0] = -gap_open_penalty
181 gap_matrix_hori[0, 0] = -gap_open_penalty
182
183
184 for i in range(1, M):
185
186 matrix[i, 0] = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[0])]
187
188
189 score_gap_open = matrix[i-1, 0] - gap_open_penalty
190 score_gap_extend = gap_matrix_hori[i-1, 0] - gap_extend_penalty
191
192
193 if i < M-1:
194 gap_matrix_vert[i, 0] = -gap_open_penalty
195
196
197 gap_matrix_hori[i, 0] = max(score_gap_open, score_gap_extend)
198
199
200 for j in range(1, N):
201
202 matrix[0, j] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[j])]
203
204
205 score_gap_open = matrix[0, j-1] - gap_open_penalty
206 score_gap_extend = gap_matrix_vert[0, j-1] - gap_extend_penalty
207
208
209 gap_matrix_vert[0, j] = max(score_gap_open, score_gap_extend)
210
211
212 if j < N-1:
213 gap_matrix_hori[0, j] = -gap_open_penalty
214
215
216 for j in range(1, N):
217 for i in range(1, M):
218
219 sub_score = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[j])]
220
221
222 SCORES[0] = matrix[i-1, j-1]
223
224
225 SCORES[1] = gap_matrix_vert[i-1, j-1]
226
227
228 SCORES[2] = gap_matrix_hori[i-1, j-1]
229
230
231 matrix[i, j] = SCORES.max() + sub_score
232
233
234 if j == N-1:
235 score_gap_open = matrix[i-1, j] - end_gap_open_penalty
236 score_gap_extend = gap_matrix_hori[i-1, j] - end_gap_extend_penalty
237 else:
238 score_gap_open = max(matrix[i-1, j], gap_matrix_vert[i-1, j]) - gap_open_penalty
239 score_gap_extend = gap_matrix_hori[i-1, j] - gap_extend_penalty
240 gap_matrix_hori[i, j] = max(score_gap_open, score_gap_extend)
241
242
243 if i == M-1:
244 score_gap_open = matrix[i, j-1] - end_gap_open_penalty
245 score_gap_extend = gap_matrix_vert[i, j-1] - end_gap_extend_penalty
246 else:
247 score_gap_open = max(matrix[i, j-1], gap_matrix_hori[i, j-1]) - gap_open_penalty
248 score_gap_extend = gap_matrix_vert[i, j-1] - gap_extend_penalty
249 gap_matrix_vert[i, j] = max(score_gap_open, score_gap_extend)
250
251
252 j = N - 1
253 i = M - 1
254 last_move = 0
255 while j >= 0 and i >= 0:
256
257 curr_i = i
258 curr_j = j
259
260
261 left_gap_extend_penalty = gap_extend_penalty
262 top_gap_extend_penalty = gap_extend_penalty
263 if i == 0 or i == M-1:
264 left_gap_extend_penalty = end_gap_extend_penalty
265 if j == 0 or j == N-1:
266 top_gap_extend_penalty = end_gap_extend_penalty
267
268
269 if last_move == TRACEBACK_LEFT and abs(left_gap_extend_penalty - (gap_matrix_vert[i, j] - gap_matrix_vert[i, j+1])) < epsilon:
270 traceback_matrix[i, j] = TRACEBACK_LEFT
271 j -= 1
272
273
274 elif last_move== TRACEBACK_TOP and abs(top_gap_extend_penalty - (gap_matrix_hori[i, j] - gap_matrix_hori[i+1, j])) < epsilon:
275 traceback_matrix[i, j] = TRACEBACK_TOP
276 i -= 1
277
278
279 elif matrix[i, j] >= gap_matrix_vert[i, j] and matrix[i, j] >= gap_matrix_hori[i, j]:
280
281 if last_move == TRACEBACK_LEFT and abs(matrix[i, j] - gap_matrix_vert[i, j]) < epsilon:
282 traceback_matrix[i, j] = TRACEBACK_LEFT
283 j -= 1
284
285
286 elif last_move == TRACEBACK_TOP and abs(matrix[i, j] - gap_matrix_hori[i, j]) < epsilon:
287 traceback_matrix[i, j] = TRACEBACK_TOP
288 i -= 1
289
290
291 else:
292 traceback_matrix[i, j] = 0
293 i -= 1
294 j -= 1
295
296
297 elif gap_matrix_vert[i, j] >= gap_matrix_hori[i, j] and j >= 0:
298 traceback_matrix[i, j] = TRACEBACK_LEFT
299 j -= 1
300
301
302 elif i >= 0:
303 traceback_matrix[i, j] = TRACEBACK_TOP
304 i -= 1
305
306
307 last_move = traceback_matrix[curr_i, curr_j]
308
309
310 return matrix, traceback_matrix
311