Author: bugman Date: Thu Jan 22 15:57:15 2015 New Revision: 27265 URL: http://svn.gna.org/viewcvs/relax?rev=27265&view=rev Log: Modified the Needleman-Wunsch sequence alignment algorithm. The previous attempt was buggy. The algorithm has been modified to match the logic of the GPL licenced EMBOSS software (http://emboss.sourceforge.net/) to allow for gap opening and extension penalties, as well as end penalties. No code was copied, rather the algorithm for creating the scoring and penalty matrices, as well as the traceback matrix. Modified: trunk/lib/sequence_alignment/align_protein.py trunk/lib/sequence_alignment/needleman_wunsch.py Modified: trunk/lib/sequence_alignment/align_protein.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/align_protein.py?rev=27265&r1=27264&r2=27265&view=diff ============================================================================== --- trunk/lib/sequence_alignment/align_protein.py (original) +++ trunk/lib/sequence_alignment/align_protein.py Thu Jan 22 15:57:15 2015 @@ -25,32 +25,30 @@ # Python module imports. from string import upper import sys -from warnings import warn # relax module imports. from lib.errors import RelaxError from lib.sequence_alignment.needleman_wunsch import needleman_wunsch_align from lib.sequence_alignment.substitution_matrices import BLOSUM62, BLOSUM62_SEQ -from lib.warnings import RelaxWarning -def align_pairwise(sequence1, sequence2, algorithm='NW70', matrix='BLOSUM62', gap_penalty=1.0, extend_penalty=1.0): +def align_pairwise(sequence1, sequence2, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0): """Align two protein sequences. - @param sequence1: The first protein sequence as one letter codes. - @type sequence1: str - @param sequence2: The second protein sequence as one letter codes. - @type sequence2: str - @keyword algorithm: The alignment algorithm to use. - @type algorithm: str - @keyword matrix: The substitution matrix to use. - @type matrix: str - @keyword gap_penalty: The penalty for introducing gaps, as a positive number. - @type gap_penalty: float - @keyword extend_penalty: The penalty for extending a gap, as a positive number. - @type extend_penalty: float - @return: The two alignment strings and the gap matrix. - @rtype: str, str, numpy rank-2 int array + @param sequence1: The first protein sequence as one letter codes. + @type sequence1: str + @param sequence2: The second protein sequence as one letter codes. + @type sequence2: str + @keyword algorithm: The alignment algorithm to use. + @type algorithm: str + @keyword matrix: The substitution matrix to use. + @type matrix: str + @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. + @type gap_open_penalty: float + @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. + @type gap_extend_penalty: float + @return: The two alignment strings and the gap matrix. + @rtype: str, str, numpy rank-2 int array """ # Checks. @@ -65,16 +63,11 @@ sequence1 = upper(sequence1) sequence2 = upper(sequence2) - # Turn off the extension penalty for algorithms not supporting it. - if extend_penalty != None and algorithm in ['NW70']: - warn(RelaxWarning("The gap extension penalty is not supported by the Needleman-Wunsch sequence alignment algorithm.")) - extend_penalty = None - # Initial printout. sys.stdout.write("\nPairwise protein alignment.\n") sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix)) - sys.stdout.write("%-30s %s\n" % ("Gap penalty:", gap_penalty)) - sys.stdout.write("%-30s %s\n" % ("Extend penalty:", extend_penalty)) + sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty)) + sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty)) sys.stdout.write("\n%-30s %s\n" % ("Input sequence 1:", sequence1)) sys.stdout.write("%-30s %s\n" % ("Input sequence 2:", sequence2)) @@ -85,7 +78,7 @@ # The alignment. if algorithm == 'NW70': - align1, align2, gaps = needleman_wunsch_align(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_penalty=gap_penalty) + align1, align2, gaps = needleman_wunsch_align(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty) # Final printout. sys.stdout.write("\n%-30s %s\n" % ("Aligned sequence 1:", align1)) @@ -100,4 +93,3 @@ # Return the results. return align1, align2, gaps - Modified: trunk/lib/sequence_alignment/needleman_wunsch.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/needleman_wunsch.py?rev=27265&r1=27264&r2=27265&view=diff ============================================================================== --- trunk/lib/sequence_alignment/needleman_wunsch.py (original) +++ trunk/lib/sequence_alignment/needleman_wunsch.py Thu Jan 22 15:57:15 2015 @@ -37,24 +37,26 @@ TRACEBACK_LEFT = 2 -def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_penalty=SCORE_GAP_PENALTY): - """Align two sequences using the Needleman-Wunsch algorithm. - - This is implemented as described in the U{Wikipedia article on the Needleman-Wunsch algorithm <https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}. - - - @param sequence1: The first sequence. - @type sequence1: str - @param sequence2: The second sequence. - @type sequence2: str - @keyword sub_matrix: The substitution matrix to use to determine the penalties. - @type sub_matrix: numpy rank-2 int array - @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. - @type sub_seq: str - @keyword gap_penalty: The penalty for introducing gaps, as a positive number. - @type gap_penalty: float - @return: The two alignment strings and the gap matrix. - @rtype: str, str, numpy rank-2 int array +def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0): + """Align two sequences using the Needleman-Wunsch algorithm using the EMBOSS logic for extensions. + + This is implemented as described in the U{Wikipedia article on the Needleman-Wunsch algorithm <https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}. The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties. + + + @param sequence1: The first sequence. + @type sequence1: str + @param sequence2: The second sequence. + @type sequence2: str + @keyword sub_matrix: The substitution matrix to use to determine the penalties. + @type sub_matrix: numpy rank-2 int array + @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. + @type sub_seq: str + @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. + @type gap_open_penalty: float + @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. + @type gap_extend_penalty: float + @return: The two alignment strings and the gap matrix. + @rtype: str, str, numpy rank-2 int array """ # The sequence lengths. @@ -62,7 +64,7 @@ N = len(sequence2) # Calculate the scoring and traceback matrices. - matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_penalty=gap_penalty) + matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty) # Generate the alignment. i = M - 1 @@ -109,60 +111,176 @@ return align1, align2, gaps -def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_penalty=SCORE_GAP_PENALTY): - """Construct the Needleman-Wunsch matrix for the given two sequences. - - @param sequence1: The first sequence. - @type sequence1: str - @param sequence2: The second sequence. - @type sequence2: str - @keyword sub_matrix: The substitution matrix to use to determine the penalties. - @type sub_matrix: numpy rank-2 int16 array - @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. - @type sub_seq: str - @keyword gap_penalty: The penalty for introducing gaps, as a positive number. - @type gap_penalty: float - @return: The Needleman-Wunsch matrix and traceback matrix. - @rtype: numpy rank-2 float32 array, numpy rank-2 int16 array +def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0, epsilon=1e-7): + """Construct the Needleman-Wunsch matrix for the given two sequences using the EMBOSS logic. + + The algorithm has been modified to match that of U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and extension penalties, as well as end penalties. + + + @param sequence1: The first sequence. + @type sequence1: str + @param sequence2: The second sequence. + @type sequence2: str + @keyword sub_matrix: The substitution matrix to use to determine the penalties. + @type sub_matrix: numpy rank-2 int16 array + @keyword sub_seq: The one letter code sequence corresponding to the substitution matrix indices. + @type sub_seq: str + @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. + @type gap_open_penalty: float + @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. + @type gap_extend_penalty: float + @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. + @type end_gap_open_penalty: float + @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. + @type end_gap_extend_penalty: float + @keyword epsilon: A value close to zero to determine if two numbers are the same, within this precision. + @type epsilon: float + @return: The Needleman-Wunsch matrix and traceback matrix. + @rtype: numpy rank-2 float32 array, numpy rank-2 int16 array """ # Initial scoring matrix. - matrix = zeros((len(sequence1)+1, len(sequence2)+1), float32) - for i in range(1, len(matrix)): - matrix[i, 0] = -gap_penalty*i - for j in range(1, len(matrix[0])): - matrix[0, j] = -gap_penalty*j + M = len(sequence1) + N = len(sequence2) + matrix = zeros((M, N), float32) + + # Initial gap matrices. + gap_matrix_vert = zeros((M, N), float32) + gap_matrix_hori = zeros((M, N), float32) # Initial traceback matrix. - traceback_matrix = zeros((len(sequence1), len(sequence2)), int16) - - # Calculate the scores. - for i in range(1, len(matrix)): - for j in range(1, len(matrix[0])): - # Substitution scores when no matrix is provided. - if sub_matrix == None: - sub_score = SCORE_MISMATCH - if sequence1[i-1] == sequence2[j-1]: - sub_score = SCORE_MATCH - + traceback_matrix = zeros((M, N), int16) + + # Set up position [0, 0]. + matrix[0, 0] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[0])] + gap_matrix_vert[0, 0] = -gap_open_penalty + gap_matrix_hori[0, 0] = -gap_open_penalty + + # Set up the first column. + for i in range(1, M): + # Substitution scores from the matrix. + matrix[i, 0] = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[0])] + + # Gap scores. + score_gap_open = matrix[i-1, 0] - gap_open_penalty + score_gap_extend = gap_matrix_hori[i-1, 0] - gap_extend_penalty + + # Update the vertical gap matrix. + if i < M-1: + gap_matrix_vert[i, 0] = -gap_open_penalty + + # Update the horizontal gap matrix. + gap_matrix_hori[i, 0] = max(score_gap_open, score_gap_extend) + + # Set up the first row. + for j in range(1, N): + # Substitution scores from the matrix. + matrix[0, j] = sub_matrix[sub_seq.index(sequence1[0]), sub_seq.index(sequence2[j])] + + # Gap scores. + score_gap_open = matrix[0, j-1] - gap_open_penalty + score_gap_extend = gap_matrix_vert[0, j-1] - gap_extend_penalty + + # Update the vertical gap matrix. + gap_matrix_vert[0, j] = max(score_gap_open, score_gap_extend) + + # Update the horizontal gap matrix. + if j < N-1: + gap_matrix_hori[0, j] = -gap_open_penalty + + # Fill in the rest of the matrix. + for j in range(1, N): + for i in range(1, M): # Substitution scores from the matrix. + sub_score = sub_matrix[sub_seq.index(sequence1[i]), sub_seq.index(sequence2[j])] + + # The diagonal score. + SCORES[0] = matrix[i-1, j-1] + + # The top score. + SCORES[1] = gap_matrix_vert[i-1, j-1] + + # The left score. + SCORES[2] = gap_matrix_hori[i-1, j-1] + + # Store the score. + matrix[i, j] = SCORES.max() + sub_score + + # Horizontal gap scores. + if j == N-1: + score_gap_open = matrix[i-1, j] - end_gap_open_penalty + score_gap_extend = gap_matrix_hori[i-1, j] - end_gap_extend_penalty else: - sub_score = sub_matrix[sub_seq.index(sequence1[i-1]), sub_seq.index(sequence2[j-1])] - - # The diagonal score. - SCORES[0] = matrix[i-1, j-1] + sub_score - - # The top score. - SCORES[1] = matrix[i-1, j] - gap_penalty - - # The left score. - SCORES[2] = matrix[i, j-1] - gap_penalty - - # Store the best score. - matrix[i, j] = SCORES.max() - - # Update the traceback matrix. - traceback_matrix[i-1, j-1] = SCORES.argmax() - - # Return the matrix. + score_gap_open = max(matrix[i-1, j], gap_matrix_vert[i-1, j]) - gap_open_penalty + score_gap_extend = gap_matrix_hori[i-1, j] - gap_extend_penalty + gap_matrix_hori[i, j] = max(score_gap_open, score_gap_extend) + + # Vertical gap scores. + if i == M-1: + score_gap_open = matrix[i, j-1] - end_gap_open_penalty + score_gap_extend = gap_matrix_vert[i, j-1] - end_gap_extend_penalty + else: + score_gap_open = max(matrix[i, j-1], gap_matrix_hori[i, j-1]) - gap_open_penalty + score_gap_extend = gap_matrix_vert[i, j-1] - gap_extend_penalty + gap_matrix_vert[i, j] = max(score_gap_open, score_gap_extend) + + # Determine the best traceback path. + j = N - 1 + i = M - 1 + last_move = 0 + while j >= 0 and i >= 0: + # The current indices. + curr_i = i + curr_j = j + + # Choose the correct gap extension penalties. + left_gap_extend_penalty = gap_extend_penalty + top_gap_extend_penalty = gap_extend_penalty + if i == 0 or i == M-1: + left_gap_extend_penalty = end_gap_extend_penalty + if j == 0 or j == N-1: + top_gap_extend_penalty = end_gap_extend_penalty + + # Extending the gap to the left. + if last_move == TRACEBACK_LEFT and abs(left_gap_extend_penalty - (gap_matrix_vert[i, j] - gap_matrix_vert[i, j+1])) < epsilon: + traceback_matrix[i, j] = TRACEBACK_LEFT + j -= 1 + + # Extending the gap to the top. + elif last_move== TRACEBACK_TOP and abs(top_gap_extend_penalty - (gap_matrix_hori[i, j] - gap_matrix_hori[i+1, j])) < epsilon: + traceback_matrix[i, j] = TRACEBACK_TOP + i -= 1 + + # Match. + elif matrix[i, j] >= gap_matrix_vert[i, j] and matrix[i, j] >= gap_matrix_hori[i, j]: + # Add another gap to the left. + if last_move == TRACEBACK_LEFT and abs(matrix[i, j] - gap_matrix_vert[i, j]) < epsilon: + traceback_matrix[i, j] = TRACEBACK_LEFT + j -= 1 + + # Add another gap to the top. + elif last_move == TRACEBACK_TOP and abs(matrix[i, j] - gap_matrix_hori[i, j]) < epsilon: + traceback_matrix[i, j] = TRACEBACK_TOP + i -= 1 + + # Take the match. + else: + traceback_matrix[i, j] = 0 + i -= 1 + j -= 1 + + # First gap to the left. + elif gap_matrix_vert[i, j] >= gap_matrix_hori[i, j] and j >= 0: + traceback_matrix[i, j] = TRACEBACK_LEFT + j -= 1 + + # First gap to the top. + elif i >= 0: + traceback_matrix[i, j] = TRACEBACK_TOP + i -= 1 + + # Store the last move. + last_move = traceback_matrix[curr_i, curr_j] + + # Return the matrices. return matrix, traceback_matrix