Package lib :: Package sequence_alignment :: Module needleman_wunsch
[hide private]
[frames] | no frames]

Source Code for Module lib.sequence_alignment.needleman_wunsch

  1  ############################################################################### 
  2  #                                                                             # 
  3  # Copyright (C) 2015 Edward d'Auvergne                                        # 
  4  #                                                                             # 
  5  # This file is part of the program relax (http://www.nmr-relax.com).          # 
  6  #                                                                             # 
  7  # This program is free software: you can redistribute it and/or modify        # 
  8  # it under the terms of the GNU General Public License as published by        # 
  9  # the Free Software Foundation, either version 3 of the License, or           # 
 10  # (at your option) any later version.                                         # 
 11  #                                                                             # 
 12  # This program is distributed in the hope that it will be useful,             # 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of              # 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               # 
 15  # GNU General Public License for more details.                                # 
 16  #                                                                             # 
 17  # You should have received a copy of the GNU General Public License           # 
 18  # along with this program.  If not, see <http://www.gnu.org/licenses/>.       # 
 19  #                                                                             # 
 20  ############################################################################### 
 21   
 22  # Module docstring. 
 23  """Functions for implementing the Needleman-Wunsch sequence alignment algorithm.""" 
 24   
 25  # Python module imports. 
 26  from numpy import float32, int16, zeros 
 27   
 28  # relax module imports. 
 29  from lib.errors import RelaxError, RelaxFault 
 30   
 31   
 32  # Default scores. 
 33  SCORE_MATCH = 1 
 34  SCORE_MISMATCH = -1 
 35  SCORE_GAP_PENALTY = 1 
 36  SCORES = zeros(3, float32) 
 37   
 38  # Indices. 
 39  TRACEBACK_DIAG = 0 
 40  TRACEBACK_TOP = 1 
 41  TRACEBACK_LEFT = 2 
 42   
 43   
44 -def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0):
45 """Align two sequences using the Needleman-Wunsch algorithm using the EMBOSS logic for extensions. 46 47 This is implemented as described in the U{Wikipedia article on the Needleman-Wunsch algorithm <https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}. The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties. 48 49 50 @param sequence1: The first sequence. 51 @type sequence1: str 52 @param sequence2: The second sequence. 53 @type sequence2: str 54 @keyword sub_matrix: The substitution matrix to use to determine the penalties. 55 @type sub_matrix: numpy rank-2 int array 56 @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. 57 @type sub_seq: str 58 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. 59 @type gap_open_penalty: float 60 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. 61 @type gap_extend_penalty: float 62 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. 63 @type end_gap_open_penalty: float 64 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. 65 @type end_gap_extend_penalty: float 66 @return: The alignment score, two alignment strings and the gap matrix. 67 @rtype: float, str, str, numpy rank-2 int array 68 """ 69 70 # The sequence lengths. 71 M = len(sequence1) 72 N = len(sequence2) 73 74 # Sanity check. 75 for i in range(M): 76 if sequence1[i] not in sub_seq: 77 raise RelaxError("The residue '%s' from the first sequence cannot be found in the substitution matrix residues '%s'." % (sequence1[i], sub_seq)) 78 for j in range(N): 79 if sequence2[j] not in sub_seq: 80 raise RelaxError("The residue '%s' from the second sequence cannot be found in the substitution matrix residues '%s'." % (sequence2[j], sub_seq)) 81 82 # Calculate the scoring and traceback matrices. 83 matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) 84 85 # Generate the alignment. 86 score = 0.0 87 i = M - 1 88 j = N - 1 89 alignment1 = "" 90 alignment2 = "" 91 while 1: 92 # Top. 93 if j < 0 or traceback_matrix[i, j] == TRACEBACK_TOP: 94 alignment1 += sequence1[i] 95 alignment2 += '-' 96 i -= 1 97 98 # Left. 99 elif i < 0 or traceback_matrix[i, j] == TRACEBACK_LEFT: 100 alignment1 += '-' 101 alignment2 += sequence2[j] 102 j -= 1 103 104 # Diagonal. 105 elif traceback_matrix[i, j] == TRACEBACK_DIAG: 106 alignment1 += sequence1[i] 107 alignment2 += sequence2[j] 108 i -= 1 109 j -= 1 110 111 # Unknown behaviour. 112 else: 113 raise RelaxFault 114 115 # Termination. 116 if i < 0 and j < 0: 117 break 118 119 # Update the total score. 120 score += matrix[i, j] 121 122 # Reverse the alignments. 123 align1 = alignment1[::-1] 124 align2 = alignment2[::-1] 125 126 # Gap representation. 127 gaps = zeros((2, len(align1)), int16) 128 for l in range(len(align1)): 129 if align1[l] == '-': 130 gaps[0, l] = 1 131 if align2[l] == '-': 132 gaps[1, l] = 1 133 134 # Return the alignments and gap matrix. 135 return score, align1, align2, gaps
136 137
138 -def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0, epsilon=1e-7):
139 """Construct the Needleman-Wunsch matrix for the given two sequences using the EMBOSS logic. 140 141 The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties. 142 143 144 @param sequence1: The first sequence. 145 @type sequence1: str 146 @param sequence2: The second sequence. 147 @type sequence2: str 148 @keyword sub_matrix: The substitution matrix to use to determine the penalties. 149 @type sub_matrix: numpy rank-2 int16 array 150 @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. 151 @type sub_seq: str 152 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. 153 @type gap_open_penalty: float 154 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. 155 @type gap_extend_penalty: float 156 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. 157 @type end_gap_open_penalty: float 158 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. 159 @type end_gap_extend_penalty: float 160 @keyword epsilon: A value close to zero to determine if two numbers are the same, within this precision. 161 @type epsilon: float 162 @return: The Needleman-Wunsch matrix and traceback matrix. 163 @rtype: numpy rank-2 float32 array, numpy rank-2 int16 array 164 """ 165 166 # Initial scoring matrix. 167 M = len(sequence1) 168 N = len(sequence2) 169 matrix = zeros((M, N), float32) 170 171 # Initial gap matrices. 172 gap_matrix_vert = zeros((M, N), float32) 173 gap_matrix_hori = zeros((M, N), float32) 174 175 # Initial traceback matrix. 176 traceback_matrix = zeros((M, N), int16) 177 178 # Set up position [0, 0]. 179 matrix[0, 0] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[0])] 180 gap_matrix_vert[0, 0] = -gap_open_penalty 181 gap_matrix_hori[0, 0] = -gap_open_penalty 182 183 # Set up the first column. 184 for i in range(1, M): 185 # Substitution scores from the matrix. 186 matrix[i, 0] = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[0])] 187 188 # Gap scores. 189 score_gap_open = matrix[i-1, 0] - gap_open_penalty 190 score_gap_extend = gap_matrix_hori[i-1, 0] - gap_extend_penalty 191 192 # Update the vertical gap matrix. 193 if i < M-1: 194 gap_matrix_vert[i, 0] = -gap_open_penalty 195 196 # Update the horizontal gap matrix. 197 gap_matrix_hori[i, 0] = max(score_gap_open, score_gap_extend) 198 199 # Set up the first row. 200 for j in range(1, N): 201 # Substitution scores from the matrix. 202 matrix[0, j] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[j])] 203 204 # Gap scores. 205 score_gap_open = matrix[0, j-1] - gap_open_penalty 206 score_gap_extend = gap_matrix_vert[0, j-1] - gap_extend_penalty 207 208 # Update the vertical gap matrix. 209 gap_matrix_vert[0, j] = max(score_gap_open, score_gap_extend) 210 211 # Update the horizontal gap matrix. 212 if j < N-1: 213 gap_matrix_hori[0, j] = -gap_open_penalty 214 215 # Fill in the rest of the matrix. 216 for j in range(1, N): 217 for i in range(1, M): 218 # Substitution scores from the matrix. 219 sub_score = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[j])] 220 221 # The diagonal score. 222 SCORES[0] = matrix[i-1, j-1] 223 224 # The top score. 225 SCORES[1] = gap_matrix_vert[i-1, j-1] 226 227 # The left score. 228 SCORES[2] = gap_matrix_hori[i-1, j-1] 229 230 # Store the score. 231 matrix[i, j] = SCORES.max() + sub_score 232 233 # Horizontal gap scores. 234 if j == N-1: 235 score_gap_open = matrix[i-1, j] - end_gap_open_penalty 236 score_gap_extend = gap_matrix_hori[i-1, j] - end_gap_extend_penalty 237 else: 238 score_gap_open = max(matrix[i-1, j], gap_matrix_vert[i-1, j]) - gap_open_penalty 239 score_gap_extend = gap_matrix_hori[i-1, j] - gap_extend_penalty 240 gap_matrix_hori[i, j] = max(score_gap_open, score_gap_extend) 241 242 # Vertical gap scores. 243 if i == M-1: 244 score_gap_open = matrix[i, j-1] - end_gap_open_penalty 245 score_gap_extend = gap_matrix_vert[i, j-1] - end_gap_extend_penalty 246 else: 247 score_gap_open = max(matrix[i, j-1], gap_matrix_hori[i, j-1]) - gap_open_penalty 248 score_gap_extend = gap_matrix_vert[i, j-1] - gap_extend_penalty 249 gap_matrix_vert[i, j] = max(score_gap_open, score_gap_extend) 250 251 # Determine the best traceback path. 252 j = N - 1 253 i = M - 1 254 last_move = 0 255 while j >= 0 and i >= 0: 256 # The current indices. 257 curr_i = i 258 curr_j = j 259 260 # Choose the correct gap extension penalties. 261 left_gap_extend_penalty = gap_extend_penalty 262 top_gap_extend_penalty = gap_extend_penalty 263 if i == 0 or i == M-1: 264 left_gap_extend_penalty = end_gap_extend_penalty 265 if j == 0 or j == N-1: 266 top_gap_extend_penalty = end_gap_extend_penalty 267 268 # Extending the gap to the left. 269 if last_move == TRACEBACK_LEFT and abs(left_gap_extend_penalty - (gap_matrix_vert[i, j] - gap_matrix_vert[i, j+1])) < epsilon: 270 traceback_matrix[i, j] = TRACEBACK_LEFT 271 j -= 1 272 273 # Extending the gap to the top. 274 elif last_move== TRACEBACK_TOP and abs(top_gap_extend_penalty - (gap_matrix_hori[i, j] - gap_matrix_hori[i+1, j])) < epsilon: 275 traceback_matrix[i, j] = TRACEBACK_TOP 276 i -= 1 277 278 # Match. 279 elif matrix[i, j] >= gap_matrix_vert[i, j] and matrix[i, j] >= gap_matrix_hori[i, j]: 280 # Add another gap to the left. 281 if last_move == TRACEBACK_LEFT and abs(matrix[i, j] - gap_matrix_vert[i, j]) < epsilon: 282 traceback_matrix[i, j] = TRACEBACK_LEFT 283 j -= 1 284 285 # Add another gap to the top. 286 elif last_move == TRACEBACK_TOP and abs(matrix[i, j] - gap_matrix_hori[i, j]) < epsilon: 287 traceback_matrix[i, j] = TRACEBACK_TOP 288 i -= 1 289 290 # Take the match. 291 else: 292 traceback_matrix[i, j] = 0 293 i -= 1 294 j -= 1 295 296 # First gap to the left. 297 elif gap_matrix_vert[i, j] >= gap_matrix_hori[i, j] and j >= 0: 298 traceback_matrix[i, j] = TRACEBACK_LEFT 299 j -= 1 300 301 # First gap to the top. 302 elif i >= 0: 303 traceback_matrix[i, j] = TRACEBACK_TOP 304 i -= 1 305 306 # Store the last move. 307 last_move = traceback_matrix[curr_i, curr_j] 308 309 # Return the matrices. 310 return matrix, traceback_matrix
311