r27257 - /trunk/lib/sequence_alignment/needleman_wunsch.py -- January 21, 2015

Author: bugman
Date: Wed Jan 21 15:36:43 2015
New Revision: 27257

URL: http://svn.gna.org/viewcvs/relax?rev=27257&view=rev
Log:
Modification of the Needleman-Wunsch sequence alignment algorithm 
implementation.

This is in the lib.sequence_alignment.needleman_wunsch functions.  Scoring 
matrices are now
supported, as well as a user supplied non-integer gap penalty.  The algorithm 
for walking through
the traceback matrix has been fixed for a bug under certain conditions.


Modified:
    trunk/lib/sequence_alignment/needleman_wunsch.py

Modified: trunk/lib/sequence_alignment/needleman_wunsch.py
URL: 
http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/needleman_wunsch.py?rev=27257&r1=27256&r2=27257&view=diff
==============================================================================
--- trunk/lib/sequence_alignment/needleman_wunsch.py    (original)
+++ trunk/lib/sequence_alignment/needleman_wunsch.py    Wed Jan 21 15:36:43 
2015
@@ -23,7 +23,7 @@
 """Functions for implementing the Needleman-Wunsch sequence alignment 
algorithm."""
 
 # Python module imports.
-from numpy import int16, zeros
+from numpy import float32, int16, zeros
 
 # Default scores.
 SCORE_MATCH = 1
@@ -37,18 +37,24 @@
 TRACEBACK_LEFT = 2
 
 
-def needleman_wunsch_align(sequence1, sequence2):
+def needleman_wunsch_align(sequence1, sequence2, sub_matrix=None, 
sub_seq=None, gap_penalty=SCORE_GAP_PENALTY):
     """Align two sequences using the Needleman-Wunsch algorithm.
 
     This is implemented as described in the U{Wikipedia article on the 
Needleman-Wunsch algorithm 
<https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}.
 
 
-    @param sequence1:   The first sequence.
-    @type sequence1:    str
-    @param sequence2:   The second sequence.
-    @type sequence2:    str
-    @return:            The two alignment strings and the gap matrix.
-    @rtype:             str, str, numpy rank-2 int array
+    @param sequence1:           The first sequence.
+    @type sequence1:            str
+    @param sequence2:           The second sequence.
+    @type sequence2:            str
+    @keyword sub_matrix:        The substitution matrix to use to determine 
the penalties.
+    @type sub_matrix:           numpy rank-2 int array
+    @keyword sub_seq:           The one letter code sequence corresponding 
to the substitution matrix indices.
+    @type sub_seq:              str
+    @keyword gap_penalty:       The penalty for introducing gaps, as a 
positive number.
+    @type gap_penalty:          float
+    @return:                    The two alignment strings and the gap matrix.
+    @rtype:                     str, str, numpy rank-2 int array
     """
 
     # The sequence lengths.
@@ -56,14 +62,18 @@
     N = len(sequence2)
 
     # Calculate the scoring and traceback matrices.
-    matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2)
+    matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2, 
sub_matrix=sub_matrix, sub_seq=sub_seq, gap_penalty=gap_penalty)
 
     # Generate the alignment.
     i = M - 1
     j = N - 1
     alignment1 = ""
     alignment2 = ""
-    while i >= 0 or j >= 0:
+    while 1:
+        # Termination.
+        if i < 0 or j < 0:
+            break
+
         # Diagonal.
         if traceback_matrix[i, j] == TRACEBACK_DIAG:
             alignment1 += sequence1[i]
@@ -99,23 +109,29 @@
     return align1, align2, gaps
 
 
-def needleman_wunsch_matrix(sequence1, sequence2):
+def needleman_wunsch_matrix(sequence1, sequence2, sub_matrix=None, 
sub_seq=None, gap_penalty=SCORE_GAP_PENALTY):
     """Construct the Needleman-Wunsch matrix for the given two sequences.
 
-    @param sequence1:   The first sequence.
-    @type sequence1:    str
-    @param sequence2:   The second sequence.
-    @type sequence2:    str
-    @return:            The Needleman-Wunsch matrix and traceback matrix.
-    @rtype:             numpy rank-2 int array, numpy rank-2 int array
+    @param sequence1:           The first sequence.
+    @type sequence1:            str
+    @param sequence2:           The second sequence.
+    @type sequence2:            str
+    @keyword sub_matrix:        The substitution matrix to use to determine 
the penalties.
+    @type sub_matrix:           numpy rank-2 int16 array
+    @keyword sub_seq:           The one letter code sequence corresponding 
to the substitution matrix indices.
+    @type sub_seq:              str
+    @keyword gap_penalty:       The penalty for introducing gaps, as a 
positive number.
+    @type gap_penalty:          float
+    @return:                    The Needleman-Wunsch matrix and traceback 
matrix.
+    @rtype:                     numpy rank-2 float32 array, numpy rank-2 
int16 array
     """
 
     # Initial scoring matrix.
-    matrix = zeros((len(sequence1)+1, len(sequence2)+1), int16)
-    for i in range(len(matrix)):
-        matrix[i, 0] = -i
-    for j in range(len(matrix[0])):
-        matrix[0, j] = -j
+    matrix = zeros((len(sequence1)+1, len(sequence2)+1), float32)
+    for i in range(1, len(matrix)):
+        matrix[i, 0] = -gap_penalty*i
+    for j in range(1, len(matrix[0])):
+        matrix[0, j] = -gap_penalty*j
 
     # Initial traceback matrix.
     traceback_matrix = zeros((len(sequence1), len(sequence2)), int16)
@@ -123,19 +139,24 @@
     # Calculate the scores.
     for i in range(1, len(matrix)):
         for j in range(1, len(matrix[0])):
-            # Substitution scores.
-            sub_score = SCORE_MISMATCH
-            if sequence1[i-1] == sequence2[j-1]:
-                sub_score = SCORE_MATCH
+            # Substitution scores when no matrix is provided.
+            if sub_matrix == None:
+                sub_score = SCORE_MISMATCH
+                if sequence1[i-1] == sequence2[j-1]:
+                    sub_score = SCORE_MATCH
+
+            # Substitution scores from the matrix.
+            else:
+                sub_score = sub_matrix[sub_seq.index(sequence1[i-1]), 
sub_seq.index(sequence2[j-1])]
 
             # The diagonal score.
             SCORES[0] = matrix[i-1, j-1] + sub_score
 
             # The top score.
-            SCORES[1] = matrix[i-1, j] + SCORE_GAP_PENALTY
+            SCORES[1] = matrix[i-1, j] - gap_penalty
 
             # The left score.
-            SCORES[2] = matrix[i, j-1] + SCORE_GAP_PENALTY
+            SCORES[2] = matrix[i, j-1] - gap_penalty
 
             # Store the best score.
             matrix[i, j] = SCORES.max()
r27257 - /trunk/lib/sequence_alignment/needleman_wunsch.py

Header

Content

Related Messages