1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22   
 23  """Functions for implementing the Needleman-Wunsch sequence alignment algorithm.""" 
 24   
 25   
 26  from numpy import float32, int16, zeros 
 27   
 28   
 29  from lib.errors import RelaxError, RelaxFault 
 30   
 31   
 32   
 33  SCORE_MATCH = 1 
 34  SCORE_MISMATCH = -1 
 35  SCORE_GAP_PENALTY = 1 
 36  SCORES = zeros(3, float32) 
 37   
 38   
 39  TRACEBACK_DIAG = 0 
 40  TRACEBACK_TOP = 1 
 41  TRACEBACK_LEFT = 2 
 42   
 43   
 45      """Align two sequences using the Needleman-Wunsch algorithm using the EMBOSS logic for extensions. 
 46   
 47      This is implemented as described in the U{Wikipedia article on the Needleman-Wunsch algorithm <https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}.  The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties. 
 48   
 49   
 50      @param sequence1:                   The first sequence. 
 51      @type sequence1:                    str 
 52      @param sequence2:                   The second sequence. 
 53      @type sequence2:                    str 
 54      @keyword sub_matrix:                The substitution matrix to use to determine the penalties. 
 55      @type sub_matrix:                   numpy rank-2 int array 
 56      @keyword sub_seq:                   The one letter code sequence corresponding to the substitution matrix indices. 
 57      @type sub_seq:                      str 
 58      @keyword gap_open_penalty:          The penalty for introducing gaps, as a positive number. 
 59      @type gap_open_penalty:             float 
 60      @keyword gap_extend_penalty:        The penalty for extending a gap, as a positive number. 
 61      @type gap_extend_penalty:           float 
 62      @keyword end_gap_open_penalty:      The optional penalty for opening a gap at the end of a sequence. 
 63      @type end_gap_open_penalty:         float 
 64      @keyword end_gap_extend_penalty:    The optional penalty for extending a gap at the end of a sequence. 
 65      @type end_gap_extend_penalty:       float 
 66      @return:                            The alignment score, two alignment strings and the gap matrix. 
 67      @rtype:                             float, str, str, numpy rank-2 int array 
 68      """ 
 69   
 70       
 71      M = len(sequence1) 
 72      N = len(sequence2) 
 73   
 74       
 75      for i in range(M): 
 76          if sequence1[i] not in sub_seq: 
 77              raise RelaxError("The residue '%s' from the first sequence cannot be found in the substitution matrix residues '%s'." % (sequence1[i], sub_seq)) 
 78      for j in range(N): 
 79          if sequence2[j] not in sub_seq: 
 80              raise RelaxError("The residue '%s' from the second sequence cannot be found in the substitution matrix residues '%s'." % (sequence2[j], sub_seq)) 
 81   
 82       
 83      matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) 
 84   
 85       
 86      score = 0.0 
 87      i = M - 1 
 88      j = N - 1 
 89      alignment1 = "" 
 90      alignment2 = "" 
 91      while 1: 
 92           
 93          if j < 0 or traceback_matrix[i, j] == TRACEBACK_TOP: 
 94              alignment1 += sequence1[i] 
 95              alignment2 += '-' 
 96              i -= 1 
 97   
 98           
 99          elif i < 0 or traceback_matrix[i, j] == TRACEBACK_LEFT: 
100              alignment1 += '-' 
101              alignment2 += sequence2[j] 
102              j -= 1 
103   
104           
105          elif traceback_matrix[i, j] == TRACEBACK_DIAG: 
106              alignment1 += sequence1[i] 
107              alignment2 += sequence2[j] 
108              i -= 1 
109              j -= 1 
110   
111           
112          else: 
113              raise RelaxFault 
114   
115           
116          if i < 0 and j < 0: 
117              break 
118   
119           
120          score += matrix[i, j] 
121   
122       
123      align1 = alignment1[::-1] 
124      align2 = alignment2[::-1] 
125   
126       
127      gaps = zeros((2, len(align1)), int16) 
128      for l in range(len(align1)): 
129          if align1[l] == '-': 
130              gaps[0, l] = 1 
131          if align2[l] == '-': 
132              gaps[1, l] = 1 
133   
134       
135      return score, align1, align2, gaps 
 136   
137   
138 -def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0, epsilon=1e-7): 
 139      """Construct the Needleman-Wunsch matrix for the given two sequences using the EMBOSS logic. 
140   
141      The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties. 
142   
143   
144      @param sequence1:                   The first sequence. 
145      @type sequence1:                    str 
146      @param sequence2:                   The second sequence. 
147      @type sequence2:                    str 
148      @keyword sub_matrix:                The substitution matrix to use to determine the penalties. 
149      @type sub_matrix:                   numpy rank-2 int16 array 
150      @keyword sub_seq:                   The one letter code sequence corresponding to the substitution matrix indices. 
151      @type sub_seq:                      str 
152      @keyword gap_open_penalty:          The penalty for introducing gaps, as a positive number. 
153      @type gap_open_penalty:             float 
154      @keyword gap_extend_penalty:        The penalty for extending a gap, as a positive number. 
155      @type gap_extend_penalty:           float 
156      @keyword end_gap_open_penalty:      The optional penalty for opening a gap at the end of a sequence. 
157      @type end_gap_open_penalty:         float 
158      @keyword end_gap_extend_penalty:    The optional penalty for extending a gap at the end of a sequence. 
159      @type end_gap_extend_penalty:       float 
160      @keyword epsilon:                   A value close to zero to determine if two numbers are the same, within this precision. 
161      @type epsilon:                      float 
162      @return:                            The Needleman-Wunsch matrix and traceback matrix. 
163      @rtype:                             numpy rank-2 float32 array, numpy rank-2 int16 array 
164      """ 
165   
166       
167      M = len(sequence1) 
168      N = len(sequence2) 
169      matrix = zeros((M, N), float32) 
170   
171       
172      gap_matrix_vert = zeros((M, N), float32) 
173      gap_matrix_hori = zeros((M, N), float32) 
174   
175       
176      traceback_matrix = zeros((M, N), int16) 
177   
178       
179      matrix[0, 0] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[0])] 
180      gap_matrix_vert[0, 0] = -gap_open_penalty 
181      gap_matrix_hori[0, 0] = -gap_open_penalty 
182   
183       
184      for i in range(1, M): 
185           
186          matrix[i, 0] = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[0])] 
187   
188           
189          score_gap_open = matrix[i-1, 0] - gap_open_penalty 
190          score_gap_extend = gap_matrix_hori[i-1, 0] - gap_extend_penalty 
191   
192           
193          if i < M-1: 
194              gap_matrix_vert[i, 0] = -gap_open_penalty 
195   
196           
197          gap_matrix_hori[i, 0] = max(score_gap_open, score_gap_extend) 
198   
199       
200      for j in range(1, N): 
201           
202          matrix[0, j] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[j])] 
203   
204           
205          score_gap_open = matrix[0, j-1] - gap_open_penalty 
206          score_gap_extend = gap_matrix_vert[0, j-1] - gap_extend_penalty 
207   
208           
209          gap_matrix_vert[0, j] = max(score_gap_open, score_gap_extend) 
210   
211           
212          if j < N-1: 
213              gap_matrix_hori[0, j] = -gap_open_penalty 
214   
215       
216      for j in range(1, N): 
217          for i in range(1, M): 
218               
219              sub_score = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[j])] 
220   
221               
222              SCORES[0] = matrix[i-1, j-1] 
223   
224               
225              SCORES[1] = gap_matrix_vert[i-1, j-1] 
226   
227               
228              SCORES[2] = gap_matrix_hori[i-1, j-1] 
229   
230               
231              matrix[i, j] = SCORES.max() + sub_score 
232   
233               
234              if j == N-1: 
235                  score_gap_open = matrix[i-1, j] - end_gap_open_penalty 
236                  score_gap_extend = gap_matrix_hori[i-1, j] - end_gap_extend_penalty 
237              else: 
238                  score_gap_open = max(matrix[i-1, j], gap_matrix_vert[i-1, j]) - gap_open_penalty 
239                  score_gap_extend = gap_matrix_hori[i-1, j] - gap_extend_penalty 
240              gap_matrix_hori[i, j] = max(score_gap_open, score_gap_extend) 
241   
242               
243              if i == M-1: 
244                  score_gap_open = matrix[i, j-1] - end_gap_open_penalty 
245                  score_gap_extend = gap_matrix_vert[i, j-1] - end_gap_extend_penalty 
246              else: 
247                  score_gap_open = max(matrix[i, j-1], gap_matrix_hori[i, j-1]) - gap_open_penalty 
248                  score_gap_extend = gap_matrix_vert[i, j-1] - gap_extend_penalty 
249              gap_matrix_vert[i, j] = max(score_gap_open, score_gap_extend) 
250   
251       
252      j = N - 1 
253      i = M - 1 
254      last_move = 0 
255      while j >= 0 and i >= 0: 
256           
257          curr_i = i 
258          curr_j = j 
259   
260           
261          left_gap_extend_penalty = gap_extend_penalty 
262          top_gap_extend_penalty = gap_extend_penalty 
263          if i == 0 or i == M-1: 
264              left_gap_extend_penalty = end_gap_extend_penalty 
265          if j == 0 or j == N-1: 
266              top_gap_extend_penalty = end_gap_extend_penalty 
267   
268           
269          if last_move == TRACEBACK_LEFT and abs(left_gap_extend_penalty - (gap_matrix_vert[i, j] - gap_matrix_vert[i, j+1])) < epsilon: 
270              traceback_matrix[i, j] = TRACEBACK_LEFT 
271              j -= 1 
272   
273           
274          elif last_move== TRACEBACK_TOP and abs(top_gap_extend_penalty - (gap_matrix_hori[i, j] - gap_matrix_hori[i+1, j])) < epsilon: 
275              traceback_matrix[i, j] = TRACEBACK_TOP 
276              i -= 1 
277   
278           
279          elif matrix[i, j] >= gap_matrix_vert[i, j] and matrix[i, j] >= gap_matrix_hori[i, j]: 
280               
281              if last_move == TRACEBACK_LEFT and abs(matrix[i, j] - gap_matrix_vert[i, j]) < epsilon: 
282                  traceback_matrix[i, j] = TRACEBACK_LEFT 
283                  j -= 1 
284   
285               
286              elif last_move == TRACEBACK_TOP and abs(matrix[i, j] - gap_matrix_hori[i, j]) < epsilon: 
287                  traceback_matrix[i, j] = TRACEBACK_TOP 
288                  i -= 1 
289   
290               
291              else: 
292                  traceback_matrix[i, j] = 0 
293                  i -= 1 
294                  j -= 1 
295   
296           
297          elif gap_matrix_vert[i, j] >= gap_matrix_hori[i, j] and j >= 0: 
298              traceback_matrix[i, j] = TRACEBACK_LEFT 
299              j -= 1 
300   
301           
302          elif i >= 0: 
303              traceback_matrix[i, j] = TRACEBACK_TOP 
304              i -= 1 
305   
306           
307          last_move = traceback_matrix[curr_i, curr_j] 
308   
309       
310      return matrix, traceback_matrix 
 311