Author: bugman Date: Wed Jan 21 15:36:43 2015 New Revision: 27257 URL: http://svn.gna.org/viewcvs/relax?rev=27257&view=rev Log: Modification of the Needleman-Wunsch sequence alignment algorithm implementation. This is in the lib.sequence_alignment.needleman_wunsch functions. Scoring matrices are now supported, as well as a user supplied non-integer gap penalty. The algorithm for walking through the traceback matrix has been fixed for a bug under certain conditions. Modified: trunk/lib/sequence_alignment/needleman_wunsch.py Modified: trunk/lib/sequence_alignment/needleman_wunsch.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/needleman_wunsch.py?rev=27257&r1=27256&r2=27257&view=diff ============================================================================== --- trunk/lib/sequence_alignment/needleman_wunsch.py (original) +++ trunk/lib/sequence_alignment/needleman_wunsch.py Wed Jan 21 15:36:43 2015 @@ -23,7 +23,7 @@ """Functions for implementing the Needleman-Wunsch sequence alignment algorithm.""" # Python module imports. -from numpy import int16, zeros +from numpy import float32, int16, zeros # Default scores. SCORE_MATCH = 1 @@ -37,18 +37,24 @@ TRACEBACK_LEFT = 2 -def needleman_wunsch_align(sequence1, sequence2): +def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_penalty=SCORE_GAP_PENALTY): """Align two sequences using the Needleman-Wunsch algorithm. This is implemented as described in the U{Wikipedia article on the Needleman-Wunsch algorithm <https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}. - @param sequence1: The first sequence. - @type sequence1: str - @param sequence2: The second sequence. - @type sequence2: str - @return: The two alignment strings and the gap matrix. - @rtype: str, str, numpy rank-2 int array + @param sequence1: The first sequence. + @type sequence1: str + @param sequence2: The second sequence. + @type sequence2: str + @keyword sub_matrix: The substitution matrix to use to determine the penalties. + @type sub_matrix: numpy rank-2 int array + @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. + @type sub_seq: str + @keyword gap_penalty: The penalty for introducing gaps, as a positive number. + @type gap_penalty: float + @return: The two alignment strings and the gap matrix. + @rtype: str, str, numpy rank-2 int array """ # The sequence lengths. @@ -56,14 +62,18 @@ N = len(sequence2) # Calculate the scoring and traceback matrices. - matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2) + matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_penalty=gap_penalty) # Generate the alignment. i = M - 1 j = N - 1 alignment1 = "" alignment2 = "" - while i >= 0 or j >= 0: + while 1: + # Termination. + if i < 0 or j < 0: + break + # Diagonal. if traceback_matrix[i, j] == TRACEBACK_DIAG: alignment1 += sequence1[i] @@ -99,23 +109,29 @@ return align1, align2, gaps -def needleman_wunsch_matrix(sequence1, sequence2): +def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_penalty=SCORE_GAP_PENALTY): """Construct the Needleman-Wunsch matrix for the given two sequences. - @param sequence1: The first sequence. - @type sequence1: str - @param sequence2: The second sequence. - @type sequence2: str - @return: The Needleman-Wunsch matrix and traceback matrix. - @rtype: numpy rank-2 int array, numpy rank-2 int array + @param sequence1: The first sequence. + @type sequence1: str + @param sequence2: The second sequence. + @type sequence2: str + @keyword sub_matrix: The substitution matrix to use to determine the penalties. + @type sub_matrix: numpy rank-2 int16 array + @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. + @type sub_seq: str + @keyword gap_penalty: The penalty for introducing gaps, as a positive number. + @type gap_penalty: float + @return: The Needleman-Wunsch matrix and traceback matrix. + @rtype: numpy rank-2 float32 array, numpy rank-2 int16 array """ # Initial scoring matrix. - matrix = zeros((len(sequence1)+1, len(sequence2)+1), int16) - for i in range(len(matrix)): - matrix[i, 0] = -i - for j in range(len(matrix[0])): - matrix[0, j] = -j + matrix = zeros((len(sequence1)+1, len(sequence2)+1), float32) + for i in range(1, len(matrix)): + matrix[i, 0] = -gap_penalty*i + for j in range(1, len(matrix[0])): + matrix[0, j] = -gap_penalty*j # Initial traceback matrix. traceback_matrix = zeros((len(sequence1), len(sequence2)), int16) @@ -123,19 +139,24 @@ # Calculate the scores. for i in range(1, len(matrix)): for j in range(1, len(matrix[0])): - # Substitution scores. - sub_score = SCORE_MISMATCH - if sequence1[i-1] == sequence2[j-1]: - sub_score = SCORE_MATCH + # Substitution scores when no matrix is provided. + if sub_matrix == None: + sub_score = SCORE_MISMATCH + if sequence1[i-1] == sequence2[j-1]: + sub_score = SCORE_MATCH + + # Substitution scores from the matrix. + else: + sub_score = sub_matrix[sub_seq.index(sequence1[i-1]), sub_seq.index(sequence2[j-1])] # The diagonal score. SCORES[0] = matrix[i-1, j-1] + sub_score # The top score. - SCORES[1] = matrix[i-1, j] + SCORE_GAP_PENALTY + SCORES[1] = matrix[i-1, j] - gap_penalty # The left score. - SCORES[2] = matrix[i, j-1] + SCORE_GAP_PENALTY + SCORES[2] = matrix[i, j-1] - gap_penalty # Store the best score. matrix[i, j] = SCORES.max()