r27265 - in /trunk/lib/sequence_alignment: align_protein.py needleman_wunsch.py -- January 22, 2015

Author: bugman
Date: Thu Jan 22 15:57:15 2015
New Revision: 27265

URL: http://svn.gna.org/viewcvs/relax?rev=27265&view=rev
Log:
Modified the Needleman-Wunsch sequence alignment algorithm.

The previous attempt was buggy.  The algorithm has been modified to match the 
logic of the GPL
licenced EMBOSS software (http://emboss.sourceforge.net/) to allow for gap 
opening and extension
penalties, as well as end penalties.  No code was copied, rather the 
algorithm for creating the
scoring and penalty matrices, as well as the traceback matrix.


Modified:
    trunk/lib/sequence_alignment/align_protein.py
    trunk/lib/sequence_alignment/needleman_wunsch.py

Modified: trunk/lib/sequence_alignment/align_protein.py
URL: 
http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/align_protein.py?rev=27265&r1=27264&r2=27265&view=diff
==============================================================================
--- trunk/lib/sequence_alignment/align_protein.py       (original)
+++ trunk/lib/sequence_alignment/align_protein.py       Thu Jan 22 15:57:15 
2015
@@ -25,32 +25,30 @@
 # Python module imports.
 from string import upper
 import sys
-from warnings import warn
 
 # relax module imports.
 from lib.errors import RelaxError
 from lib.sequence_alignment.needleman_wunsch import needleman_wunsch_align
 from lib.sequence_alignment.substitution_matrices import BLOSUM62, 
BLOSUM62_SEQ
-from lib.warnings import RelaxWarning
 
 
-def align_pairwise(sequence1, sequence2, algorithm='NW70', 
matrix='BLOSUM62', gap_penalty=1.0, extend_penalty=1.0):
+def align_pairwise(sequence1, sequence2, algorithm='NW70', 
matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0):
     """Align two protein sequences.
 
-    @param sequence1:           The first protein sequence as one letter 
codes.
-    @type sequence1:            str
-    @param sequence2:           The second protein sequence as one letter 
codes.
-    @type sequence2:            str
-    @keyword algorithm:         The alignment algorithm to use.
-    @type algorithm:            str
-    @keyword matrix:            The substitution matrix to use.
-    @type matrix:               str
-    @keyword gap_penalty:       The penalty for introducing gaps, as a 
positive number.
-    @type gap_penalty:          float
-    @keyword extend_penalty:    The penalty for extending a gap, as a 
positive number.
-    @type extend_penalty:       float
-    @return:                    The two alignment strings and the gap matrix.
-    @rtype:                     str, str, numpy rank-2 int array
+    @param sequence1:               The first protein sequence as one letter 
codes.
+    @type sequence1:                str
+    @param sequence2:               The second protein sequence as one 
letter codes.
+    @type sequence2:                str
+    @keyword algorithm:             The alignment algorithm to use.
+    @type algorithm:                str
+    @keyword matrix:                The substitution matrix to use.
+    @type matrix:                   str
+    @keyword gap_open_penalty:      The penalty for introducing gaps, as a 
positive number.
+    @type gap_open_penalty:         float
+    @keyword gap_extend_penalty:    The penalty for extending a gap, as a 
positive number.
+    @type gap_extend_penalty:       float
+    @return:                        The two alignment strings and the gap 
matrix.
+    @rtype:                         str, str, numpy rank-2 int array
     """
 
     # Checks.
@@ -65,16 +63,11 @@
     sequence1 = upper(sequence1)
     sequence2 = upper(sequence2)
 
-    # Turn off the extension penalty for algorithms not supporting it.
-    if extend_penalty != None and algorithm in ['NW70']:
-        warn(RelaxWarning("The gap extension penalty is not supported by the 
Needleman-Wunsch sequence alignment algorithm."))
-        extend_penalty = None
-
     # Initial printout.
     sys.stdout.write("\nPairwise protein alignment.\n")
     sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix))
-    sys.stdout.write("%-30s %s\n" % ("Gap penalty:", gap_penalty))
-    sys.stdout.write("%-30s %s\n" % ("Extend penalty:", extend_penalty))
+    sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", 
gap_open_penalty))
+    sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", 
gap_extend_penalty))
     sys.stdout.write("\n%-30s %s\n" % ("Input sequence 1:", sequence1))
     sys.stdout.write("%-30s %s\n" % ("Input sequence 2:", sequence2))
 
@@ -85,7 +78,7 @@
 
     # The alignment.
     if algorithm == 'NW70':
-        align1, align2, gaps = needleman_wunsch_align(sequence1, sequence2, 
sub_matrix=sub_matrix, sub_seq=sub_seq, gap_penalty=gap_penalty)
+        align1, align2, gaps = needleman_wunsch_align(sequence1, sequence2, 
sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, 
gap_extend_penalty=gap_extend_penalty)
 
     # Final printout.
     sys.stdout.write("\n%-30s %s\n" % ("Aligned sequence 1:", align1))
@@ -100,4 +93,3 @@
 
     # Return the results.
     return align1, align2, gaps
-

Modified: trunk/lib/sequence_alignment/needleman_wunsch.py
URL: 
http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/needleman_wunsch.py?rev=27265&r1=27264&r2=27265&view=diff
==============================================================================
--- trunk/lib/sequence_alignment/needleman_wunsch.py    (original)
+++ trunk/lib/sequence_alignment/needleman_wunsch.py    Thu Jan 22 15:57:15 
2015
@@ -37,24 +37,26 @@
 TRACEBACK_LEFT = 2
 
 
-def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None, 
sub_seq=None, gap_penalty=SCORE_GAP_PENALTY):
-    """Align two sequences using the Needleman-Wunsch algorithm.
-
-    This is implemented as described in the U{Wikipedia article on the 
Needleman-Wunsch algorithm 
<https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}.
-
-
-    @param sequence1:           The first sequence.
-    @type sequence1:            str
-    @param sequence2:           The second sequence.
-    @type sequence2:            str
-    @keyword sub_matrix:        The substitution matrix to use to determine 
the penalties.
-    @type sub_matrix:           numpy rank-2 int array
-    @keyword sub_seq:           The one letter code sequence corresponding 
to the substitution matrix indices.
-    @type sub_seq:              str
-    @keyword gap_penalty:       The penalty for introducing gaps, as a 
positive number.
-    @type gap_penalty:          float
-    @return:                    The two alignment strings and the gap matrix.
-    @rtype:                     str, str, numpy rank-2 int array
+def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None, 
sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0):
+    """Align two sequences using the Needleman-Wunsch algorithm using the 
EMBOSS logic for extensions.
+
+    This is implemented as described in the U{Wikipedia article on the 
Needleman-Wunsch algorithm 
<https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}.  The 
algorithm has been modified to match that of 
U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and 
extension penalties, as well as end penalties.
+
+
+    @param sequence1:               The first sequence.
+    @type sequence1:                str
+    @param sequence2:               The second sequence.
+    @type sequence2:                str
+    @keyword sub_matrix:            The substitution matrix to use to 
determine the penalties.
+    @type sub_matrix:               numpy rank-2 int array
+    @keyword sub_seq:               The one letter code sequence 
corresponding to the substitution matrix indices.
+    @type sub_seq:                  str
+    @keyword gap_open_penalty:      The penalty for introducing gaps, as a 
positive number.
+    @type gap_open_penalty:         float
+    @keyword gap_extend_penalty:    The penalty for extending a gap, as a 
positive number.
+    @type gap_extend_penalty:       float
+    @return:                        The two alignment strings and the gap 
matrix.
+    @rtype:                         str, str, numpy rank-2 int array
     """
 
     # The sequence lengths.
@@ -62,7 +64,7 @@
     N = len(sequence2)
 
     # Calculate the scoring and traceback matrices.
-    matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, 
sub_matrix=sub_matrix, sub_seq=sub_seq, gap_penalty=gap_penalty)
+    matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, 
sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, 
gap_extend_penalty=gap_extend_penalty)
 
     # Generate the alignment.
     i = M - 1
@@ -109,60 +111,176 @@
     return align1, align2, gaps
 
 
-def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, 
sub_seq=None, gap_penalty=SCORE_GAP_PENALTY):
-    """Construct the Needleman-Wunsch matrix for the given two sequences.
-
-    @param sequence1:           The first sequence.
-    @type sequence1:            str
-    @param sequence2:           The second sequence.
-    @type sequence2:            str
-    @keyword sub_matrix:        The substitution matrix to use to determine 
the penalties.
-    @type sub_matrix:           numpy rank-2 int16 array
-    @keyword sub_seq:           The one letter code sequence corresponding 
to the substitution matrix indices.
-    @type sub_seq:              str
-    @keyword gap_penalty:       The penalty for introducing gaps, as a 
positive number.
-    @type gap_penalty:          float
-    @return:                    The Needleman-Wunsch matrix and traceback 
matrix.
-    @rtype:                     numpy rank-2 float32 array, numpy rank-2 
int16 array
+def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, 
sub_seq=None, gap_open_penalty=SCORE_GAP_PENALTY, gap_extend_penalty=1.0, 
end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0, epsilon=1e-7):
+    """Construct the Needleman-Wunsch matrix for the given two sequences 
using the EMBOSS logic.
+
+    The algorithm has been modified to match that of 
U{EMBOSS<http://emboss.sourceforge.net/>} to allow for gap opening and 
extension penalties, as well as end penalties.
+
+
+    @param sequence1:                   The first sequence.
+    @type sequence1:                    str
+    @param sequence2:                   The second sequence.
+    @type sequence2:                    str
+    @keyword sub_matrix:                The substitution matrix to use to 
determine the penalties.
+    @type sub_matrix:                   numpy rank-2 int16 array
+    @keyword sub_seq:                   The one letter code sequence 
corresponding to the substitution matrix indices.
+    @type sub_seq:                      str
+    @keyword gap_open_penalty:          The penalty for introducing gaps, as 
a positive number.
+    @type gap_open_penalty:             float
+    @keyword gap_extend_penalty:        The penalty for extending a gap, as 
a positive number.
+    @type gap_extend_penalty:           float
+    @keyword end_gap_open_penalty:      The optional penalty for opening a 
gap at the end of a sequence.
+    @type end_gap_open_penalty:         float
+    @keyword end_gap_extend_penalty:    The optional penalty for extending a 
gap at the end of a sequence.
+    @type end_gap_extend_penalty:       float
+    @keyword epsilon:                   A value close to zero to determine 
if two numbers are the same, within this precision.
+    @type epsilon:                      float
+    @return:                            The Needleman-Wunsch matrix and 
traceback matrix.
+    @rtype:                             numpy rank-2 float32 array, numpy 
rank-2 int16 array
     """
 
     # Initial scoring matrix.
-    matrix = zeros((len(sequence1)+1, len(sequence2)+1), float32)
-    for i in range(1, len(matrix)):
-        matrix[i, 0] = -gap_penalty*i
-    for j in range(1, len(matrix[0])):
-        matrix[0, j] = -gap_penalty*j
+    M = len(sequence1)
+    N = len(sequence2)
+    matrix = zeros((M, N), float32)
+
+    # Initial gap matrices.
+    gap_matrix_vert = zeros((M, N), float32)
+    gap_matrix_hori = zeros((M, N), float32)
 
     # Initial traceback matrix.
-    traceback_matrix = zeros((len(sequence1), len(sequence2)), int16)
-
-    # Calculate the scores.
-    for i in range(1, len(matrix)):
-        for j in range(1, len(matrix[0])):
-            # Substitution scores when no matrix is provided.
-            if sub_matrix == None:
-                sub_score = SCORE_MISMATCH
-                if sequence1[i-1] == sequence2[j-1]:
-                    sub_score = SCORE_MATCH
-
+    traceback_matrix = zeros((M, N), int16)
+
+    # Set up position [0, 0].
+    matrix[0, 0] = sub_matrix[sub_seq.index(sequence1[0]), 
sub_seq.index(sequence2[0])]
+    gap_matrix_vert[0, 0] = -gap_open_penalty
+    gap_matrix_hori[0, 0] = -gap_open_penalty
+
+    # Set up the first column.
+    for i in range(1, M):
+        # Substitution scores from the matrix.
+        matrix[i, 0] = sub_matrix[sub_seq.index(sequence1[i]), 
sub_seq.index(sequence2[0])]
+
+        # Gap scores.
+        score_gap_open = matrix[i-1, 0] - gap_open_penalty
+        score_gap_extend = gap_matrix_hori[i-1, 0] - gap_extend_penalty
+
+        # Update the vertical gap matrix.
+        if i < M-1:
+            gap_matrix_vert[i, 0] = -gap_open_penalty
+
+        # Update the horizontal gap matrix.
+        gap_matrix_hori[i, 0] = max(score_gap_open, score_gap_extend)
+
+    # Set up the first row.
+    for j in range(1, N):
+        # Substitution scores from the matrix.
+        matrix[0, j] = sub_matrix[sub_seq.index(sequence1[0]), 
sub_seq.index(sequence2[j])]
+
+        # Gap scores.
+        score_gap_open = matrix[0, j-1] - gap_open_penalty
+        score_gap_extend = gap_matrix_vert[0, j-1] - gap_extend_penalty
+
+        # Update the vertical gap matrix.
+        gap_matrix_vert[0, j] = max(score_gap_open, score_gap_extend)
+
+        # Update the horizontal gap matrix.
+        if j < N-1:
+            gap_matrix_hori[0, j] = -gap_open_penalty
+
+    # Fill in the rest of the matrix.
+    for j in range(1, N):
+        for i in range(1, M):
             # Substitution scores from the matrix.
+            sub_score = sub_matrix[sub_seq.index(sequence1[i]), 
sub_seq.index(sequence2[j])]
+
+            # The diagonal score.
+            SCORES[0] = matrix[i-1, j-1]
+
+            # The top score.
+            SCORES[1] = gap_matrix_vert[i-1, j-1]
+
+            # The left score.
+            SCORES[2] = gap_matrix_hori[i-1, j-1]
+
+            # Store the score.
+            matrix[i, j] = SCORES.max() + sub_score
+
+            # Horizontal gap scores.
+            if j == N-1:
+                score_gap_open = matrix[i-1, j] - end_gap_open_penalty
+                score_gap_extend = gap_matrix_hori[i-1, j] - 
end_gap_extend_penalty
             else:
-                sub_score = sub_matrix[sub_seq.index(sequence1[i-1]), 
sub_seq.index(sequence2[j-1])]
-
-            # The diagonal score.
-            SCORES[0] = matrix[i-1, j-1] + sub_score
-
-            # The top score.
-            SCORES[1] = matrix[i-1, j] - gap_penalty
-
-            # The left score.
-            SCORES[2] = matrix[i, j-1] - gap_penalty
-
-            # Store the best score.
-            matrix[i, j] = SCORES.max()
-
-            # Update the traceback matrix.
-            traceback_matrix[i-1, j-1] = SCORES.argmax()
-
-    # Return the matrix.
+                score_gap_open = max(matrix[i-1, j], gap_matrix_vert[i-1, 
j]) - gap_open_penalty
+                score_gap_extend = gap_matrix_hori[i-1, j] - 
gap_extend_penalty
+            gap_matrix_hori[i, j] = max(score_gap_open, score_gap_extend)
+
+            # Vertical gap scores.
+            if i == M-1:
+                score_gap_open = matrix[i, j-1] - end_gap_open_penalty
+                score_gap_extend = gap_matrix_vert[i, j-1] - 
end_gap_extend_penalty
+            else:
+                score_gap_open = max(matrix[i, j-1], gap_matrix_hori[i, 
j-1]) - gap_open_penalty
+                score_gap_extend = gap_matrix_vert[i, j-1] - 
gap_extend_penalty
+            gap_matrix_vert[i, j] = max(score_gap_open, score_gap_extend)
+
+    # Determine the best traceback path.
+    j = N - 1
+    i = M - 1
+    last_move = 0
+    while j >= 0 and i >= 0:
+        # The current indices.
+        curr_i = i
+        curr_j = j
+
+        # Choose the correct gap extension penalties.
+        left_gap_extend_penalty = gap_extend_penalty
+        top_gap_extend_penalty = gap_extend_penalty
+        if i == 0 or i == M-1:
+            left_gap_extend_penalty = end_gap_extend_penalty
+        if j == 0 or j == N-1:
+            top_gap_extend_penalty = end_gap_extend_penalty
+
+        # Extending the gap to the left.
+        if last_move == TRACEBACK_LEFT and abs(left_gap_extend_penalty - 
(gap_matrix_vert[i, j] - gap_matrix_vert[i, j+1])) < epsilon:
+            traceback_matrix[i, j] = TRACEBACK_LEFT
+            j -= 1
+
+        # Extending the gap to the top.
+        elif last_move== TRACEBACK_TOP and abs(top_gap_extend_penalty - 
(gap_matrix_hori[i, j] - gap_matrix_hori[i+1, j])) < epsilon:
+            traceback_matrix[i, j] = TRACEBACK_TOP
+            i -= 1
+
+        # Match.
+        elif matrix[i, j] >= gap_matrix_vert[i, j] and matrix[i, j] >= 
gap_matrix_hori[i, j]:
+            # Add another gap to the left.
+            if last_move == TRACEBACK_LEFT and abs(matrix[i, j] - 
gap_matrix_vert[i, j]) < epsilon:
+                traceback_matrix[i, j] = TRACEBACK_LEFT
+                j -= 1
+
+            # Add another gap to the top.
+            elif last_move == TRACEBACK_TOP and abs(matrix[i, j] - 
gap_matrix_hori[i, j]) < epsilon:
+                traceback_matrix[i, j] = TRACEBACK_TOP
+                i -= 1
+
+            # Take the match.
+            else:
+                traceback_matrix[i, j] = 0
+                i -= 1
+                j -= 1
+
+        # First gap to the left.
+        elif gap_matrix_vert[i, j] >= gap_matrix_hori[i, j] and j >= 0:
+            traceback_matrix[i, j] = TRACEBACK_LEFT
+            j -= 1
+
+        # First gap to the top.
+        elif i >= 0:
+            traceback_matrix[i, j] = TRACEBACK_TOP
+            i -= 1
+
+        # Store the last move.
+        last_move = traceback_matrix[curr_i, curr_j]
+
+    # Return the matrices.
     return matrix, traceback_matrix
r27265 - in /trunk/lib/sequence_alignment: align_protein.py needleman_wunsch.py

Header

Content

Related Messages