lib.sequence_alignment.needleman

1 ############################################################################### 2 # # 3 # Copyright (C) 2015 Edward d'Auvergne # 4 # # 5 # This file is part of the program relax (http://www.nmr-relax.com). # 6 # # 7 # This program is free software: you can redistribute it and/or modify # 8 # it under the terms of the GNU General Public License as published by # 9 # the Free Software Foundation, either version 3 of the License, or # 10 # (at your option) any later version. # 11 # # 12 # This program is distributed in the hope that it will be useful, # 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of # 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15 # GNU General Public License for more details. # 16 # # 17 # You should have received a copy of the GNU General Public License # 18 # along with this program. If not, see <http://www.gnu.org/licenses/>. # 19 # # 20 ############################################################################### 21 22 # Module docstring. 23 """Functions for implementing the Needleman-Wunsch sequence alignment algorithm.""" 24 25 # Python module imports. 26 from numpy import float32, int16, zeros 27 28 # relax module imports. 29 from lib.errors import RelaxError, RelaxFault 30 31 32 # Default scores. 33 SCORE_MATCH = 1 34 SCORE_MISMATCH = -1 35 SCORE_GAP_PENALTY = 1 36 SCORES = zeros(3, float32) 37 38 # Indices. 39 TRACEBACK_DIAG = 0 40 TRACEBACK_TOP = 1 41 TRACEBACK_LEFT = 2 42 43

44 -def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0):

45 """Align two sequences using the Needleman-Wunsch algorithm using the EMBOSS logic for extensions. 46 47 This is implemented as described in the U{Wikipedia article on the Needleman-Wunsch algorithm <https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}. The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties. 48 49 50 @param sequence1: The first sequence. 51 @type sequence1: str 52 @param sequence2: The second sequence. 53 @type sequence2: str 54 @keyword sub_matrix: The substitution matrix to use to determine the penalties. 55 @type sub_matrix: numpy rank-2 int array 56 @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. 57 @type sub_seq: str 58 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. 59 @type gap_open_penalty: float 60 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. 61 @type gap_extend_penalty: float 62 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. 63 @type end_gap_open_penalty: float 64 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. 65 @type end_gap_extend_penalty: float 66 @return: The alignment score, two alignment strings and the gap matrix. 67 @rtype: float, str, str, numpy rank-2 int array 68 """ 69 70 # The sequence lengths. 71 M = len(sequence1) 72 N = len(sequence2) 73 74 # Sanity check. 75 for i in range(M): 76 if sequence1[i] not in sub_seq: 77 raise RelaxError("The residue '%s' from the first sequence cannot be found in the substitution matrix residues '%s'." % (sequence1[i], sub_seq)) 78 for j in range(N): 79 if sequence2[j] not in sub_seq: 80 raise RelaxError("The residue '%s' from the second sequence cannot be found in the substitution matrix residues '%s'." % (sequence2[j], sub_seq)) 81 82 # Calculate the scoring and traceback matrices. 83 matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) 84 85 # Generate the alignment. 86 score = 0.0 87 i = M - 1 88 j = N - 1 89 alignment1 = "" 90 alignment2 = "" 91 while 1: 92 # Top. 93 if j < 0 or traceback_matrix[i, j] == TRACEBACK_TOP: 94 alignment1 += sequence1[i] 95 alignment2 += '-' 96 i -= 1 97 98 # Left. 99 elif i < 0 or traceback_matrix[i, j] == TRACEBACK_LEFT: 100 alignment1 += '-' 101 alignment2 += sequence2[j] 102 j -= 1 103 104 # Diagonal. 105 elif traceback_matrix[i, j] == TRACEBACK_DIAG: 106 alignment1 += sequence1[i] 107 alignment2 += sequence2[j] 108 i -= 1 109 j -= 1 110 111 # Unknown behaviour. 112 else: 113 raise RelaxFault 114 115 # Termination. 116 if i < 0 and j < 0: 117 break 118 119 # Update the total score. 120 score += matrix[i, j] 121 122 # Reverse the alignments. 123 align1 = alignment1[::-1] 124 align2 = alignment2[::-1] 125 126 # Gap representation. 127 gaps = zeros((2, len(align1)), int16) 128 for l in range(len(align1)): 129 if align1[l] == '-': 130 gaps[0, l] = 1 131 if align2[l] == '-': 132 gaps[1, l] = 1 133 134 # Return the alignments and gap matrix. 135 return score, align1, align2, gaps

136 137

138 -def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0, epsilon=1e-7):

139 """Construct the Needleman-Wunsch matrix for the given two sequences using the EMBOSS logic. 140 141 The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties. 142 143 144 @param sequence1: The first sequence. 145 @type sequence1: str 146 @param sequence2: The second sequence. 147 @type sequence2: str 148 @keyword sub_matrix: The substitution matrix to use to determine the penalties. 149 @type sub_matrix: numpy rank-2 int16 array 150 @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. 151 @type sub_seq: str 152 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. 153 @type gap_open_penalty: float 154 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. 155 @type gap_extend_penalty: float 156 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. 157 @type end_gap_open_penalty: float 158 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. 159 @type end_gap_extend_penalty: float 160 @keyword epsilon: A value close to zero to determine if two numbers are the same, within this precision. 161 @type epsilon: float 162 @return: The Needleman-Wunsch matrix and traceback matrix. 163 @rtype: numpy rank-2 float32 array, numpy rank-2 int16 array 164 """ 165 166 # Initial scoring matrix. 167 M = len(sequence1) 168 N = len(sequence2) 169 matrix = zeros((M, N), float32) 170 171 # Initial gap matrices. 172 gap_matrix_vert = zeros((M, N), float32) 173 gap_matrix_hori = zeros((M, N), float32) 174 175 # Initial traceback matrix. 176 traceback_matrix = zeros((M, N), int16) 177 178 # Set up position [0, 0]. 179 matrix[0, 0] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[0])] 180 gap_matrix_vert[0, 0] = -gap_open_penalty 181 gap_matrix_hori[0, 0] = -gap_open_penalty 182 183 # Set up the first column. 184 for i in range(1, M): 185 # Substitution scores from the matrix. 186 matrix[i, 0] = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[0])] 187 188 # Gap scores. 189 score_gap_open = matrix[i-1, 0] - gap_open_penalty 190 score_gap_extend = gap_matrix_hori[i-1, 0] - gap_extend_penalty 191 192 # Update the vertical gap matrix. 193 if i < M-1: 194 gap_matrix_vert[i, 0] = -gap_open_penalty 195 196 # Update the horizontal gap matrix. 197 gap_matrix_hori[i, 0] = max(score_gap_open, score_gap_extend) 198 199 # Set up the first row. 200 for j in range(1, N): 201 # Substitution scores from the matrix. 202 matrix[0, j] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[j])] 203 204 # Gap scores. 205 score_gap_open = matrix[0, j-1] - gap_open_penalty 206 score_gap_extend = gap_matrix_vert[0, j-1] - gap_extend_penalty 207 208 # Update the vertical gap matrix. 209 gap_matrix_vert[0, j] = max(score_gap_open, score_gap_extend) 210 211 # Update the horizontal gap matrix. 212 if j < N-1: 213 gap_matrix_hori[0, j] = -gap_open_penalty 214 215 # Fill in the rest of the matrix. 216 for j in range(1, N): 217 for i in range(1, M): 218 # Substitution scores from the matrix. 219 sub_score = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[j])] 220 221 # The diagonal score. 222 SCORES[0] = matrix[i-1, j-1] 223 224 # The top score. 225 SCORES[1] = gap_matrix_vert[i-1, j-1] 226 227 # The left score. 228 SCORES[2] = gap_matrix_hori[i-1, j-1] 229 230 # Store the score. 231 matrix[i, j] = SCORES.max() + sub_score 232 233 # Horizontal gap scores. 234 if j == N-1: 235 score_gap_open = matrix[i-1, j] - end_gap_open_penalty 236 score_gap_extend = gap_matrix_hori[i-1, j] - end_gap_extend_penalty 237 else: 238 score_gap_open = max(matrix[i-1, j], gap_matrix_vert[i-1, j]) - gap_open_penalty 239 score_gap_extend = gap_matrix_hori[i-1, j] - gap_extend_penalty 240 gap_matrix_hori[i, j] = max(score_gap_open, score_gap_extend) 241 242 # Vertical gap scores. 243 if i == M-1: 244 score_gap_open = matrix[i, j-1] - end_gap_open_penalty 245 score_gap_extend = gap_matrix_vert[i, j-1] - end_gap_extend_penalty 246 else: 247 score_gap_open = max(matrix[i, j-1], gap_matrix_hori[i, j-1]) - gap_open_penalty 248 score_gap_extend = gap_matrix_vert[i, j-1] - gap_extend_penalty 249 gap_matrix_vert[i, j] = max(score_gap_open, score_gap_extend) 250 251 # Determine the best traceback path. 252 j = N - 1 253 i = M - 1 254 last_move = 0 255 while j >= 0 and i >= 0: 256 # The current indices. 257 curr_i = i 258 curr_j = j 259 260 # Choose the correct gap extension penalties. 261 left_gap_extend_penalty = gap_extend_penalty 262 top_gap_extend_penalty = gap_extend_penalty 263 if i == 0 or i == M-1: 264 left_gap_extend_penalty = end_gap_extend_penalty 265 if j == 0 or j == N-1: 266 top_gap_extend_penalty = end_gap_extend_penalty 267 268 # Extending the gap to the left. 269 if last_move == TRACEBACK_LEFT and abs(left_gap_extend_penalty - (gap_matrix_vert[i, j] - gap_matrix_vert[i, j+1])) < epsilon: 270 traceback_matrix[i, j] = TRACEBACK_LEFT 271 j -= 1 272 273 # Extending the gap to the top. 274 elif last_move== TRACEBACK_TOP and abs(top_gap_extend_penalty - (gap_matrix_hori[i, j] - gap_matrix_hori[i+1, j])) < epsilon: 275 traceback_matrix[i, j] = TRACEBACK_TOP 276 i -= 1 277 278 # Match. 279 elif matrix[i, j] >= gap_matrix_vert[i, j] and matrix[i, j] >= gap_matrix_hori[i, j]: 280 # Add another gap to the left. 281 if last_move == TRACEBACK_LEFT and abs(matrix[i, j] - gap_matrix_vert[i, j]) < epsilon: 282 traceback_matrix[i, j] = TRACEBACK_LEFT 283 j -= 1 284 285 # Add another gap to the top. 286 elif last_move == TRACEBACK_TOP and abs(matrix[i, j] - gap_matrix_hori[i, j]) < epsilon: 287 traceback_matrix[i, j] = TRACEBACK_TOP 288 i -= 1 289 290 # Take the match. 291 else: 292 traceback_matrix[i, j] = 0 293 i -= 1 294 j -= 1 295 296 # First gap to the left. 297 elif gap_matrix_vert[i, j] >= gap_matrix_hori[i, j] and j >= 0: 298 traceback_matrix[i, j] = TRACEBACK_LEFT 299 j -= 1 300 301 # First gap to the top. 302 elif i >= 0: 303 traceback_matrix[i, j] = TRACEBACK_TOP 304 i -= 1 305 306 # Store the last move. 307 last_move = traceback_matrix[curr_i, curr_j] 308 309 # Return the matrices. 310 return matrix, traceback_matrix

311

Source Code for Module lib.sequence_alignment.needleman_wunsch