mailr27252 - in /trunk/lib/sequence_alignment: __init__.py needleman_wunsch.py


Others Months | Index by Date | Thread Index
>>   [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Header


Content

Posted by edward on January 21, 2015 - 11:37:
Author: bugman
Date: Wed Jan 21 11:37:37 2015
New Revision: 27252

URL: http://svn.gna.org/viewcvs/relax?rev=27252&view=rev
Log:
Implementation of the Needleman-Wunsch sequence alignment algorithm.

This is located in the lib.sequence_alignment.needleman_wunsch module.  This 
is implemented as
described in the Wikipedia article 
https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm.


Added:
    trunk/lib/sequence_alignment/needleman_wunsch.py
Modified:
    trunk/lib/sequence_alignment/__init__.py

Modified: trunk/lib/sequence_alignment/__init__.py
URL: 
http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/__init__.py?rev=27252&r1=27251&r2=27252&view=diff
==============================================================================
--- trunk/lib/sequence_alignment/__init__.py    (original)
+++ trunk/lib/sequence_alignment/__init__.py    Wed Jan 21 11:37:37 2015
@@ -23,4 +23,5 @@
 """The relax-lib sequence alignment package - a library of functions for 
aligning proteins, DNA, RNA or other molecules."""
 
 __all__ = [
+    'needleman_wunsch'
 ]

Added: trunk/lib/sequence_alignment/needleman_wunsch.py
URL: 
http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/needleman_wunsch.py?rev=27252&view=auto
==============================================================================
--- trunk/lib/sequence_alignment/needleman_wunsch.py    (added)
+++ trunk/lib/sequence_alignment/needleman_wunsch.py    Wed Jan 21 11:37:37 
2015
@@ -0,0 +1,147 @@
+###############################################################################
+#                                                                            
 #
+# Copyright (C) 2015 Edward d'Auvergne                                       
 #
+#                                                                            
 #
+# This file is part of the program relax (http://www.nmr-relax.com).         
 #
+#                                                                            
 #
+# This program is free software: you can redistribute it and/or modify       
 #
+# it under the terms of the GNU General Public License as published by       
 #
+# the Free Software Foundation, either version 3 of the License, or          
 #
+# (at your option) any later version.                                        
 #
+#                                                                            
 #
+# This program is distributed in the hope that it will be useful,            
 #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of             
 #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              
 #
+# GNU General Public License for more details.                               
 #
+#                                                                            
 #
+# You should have received a copy of the GNU General Public License          
 #
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.      
 #
+#                                                                            
 #
+###############################################################################
+
+# Module docstring.
+"""Functions for implementing the Needleman-Wunsch sequence alignment 
algorithm."""
+
+# Python module imports.
+from numpy import int16, zeros
+
+# Default scores.
+SCORE_MATCH = 1
+SCORE_MISMATCH = -1
+SCORE_GAP_PENALTY = -1
+SCORES = zeros(3, int16)
+
+# Indices.
+TRACEBACK_DIAG = 0
+TRACEBACK_TOP = 1
+TRACEBACK_LEFT = 2
+
+
+def needleman_wunsch_align(sequence1, sequence2):
+    """Align two sequences using the Needleman-Wunsch algorithm.
+
+    This is implemented as described in the U{Wikipedia article on the 
Needleman-Wunsch algorithm 
<https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm>}.
+
+
+    @param sequence1:   The first sequence.
+    @type sequence1:    str
+    @param sequence2:   The second sequence.
+    @type sequence2:    str
+    @return:            The two alignment strings and the gap matrix.
+    @rtype:             str, str, numpy rank-2 int array
+    """
+
+    # The sequence lengths.
+    M = len(sequence1)
+    N = len(sequence2)
+
+    # Calculate the scoring and traceback matrices.
+    matrix, traceback_matrix = needleman_wunsch_matrix(sequence1, sequence2)
+
+    # Generate the alignment.
+    i = M - 1
+    j = N - 1
+    alignment1 = ""
+    alignment2 = ""
+    while i >= 0 or j >= 0:
+        # Diagonal.
+        if traceback_matrix[i, j] == TRACEBACK_DIAG:
+            alignment1 += sequence1[i]
+            alignment2 += sequence2[j]
+            i -= 1
+            j -= 1
+
+        # Top.
+        elif traceback_matrix[i, j] == TRACEBACK_TOP:
+            alignment1 += sequence1[i]
+            alignment2 += '-'
+            i -= 1
+
+        # Left.
+        elif traceback_matrix[i, j] == TRACEBACK_LEFT:
+            alignment1 += '-'
+            alignment2 += sequence2[j]
+            j -= 1
+
+    # Reverse the alignments.
+    align1 = alignment1[::-1]
+    align2 = alignment2[::-1]
+
+    # Gap representation.
+    gaps = zeros((2, len(align1)), int16)
+    for l in range(len(align1)):
+        if align1[l] == '-':
+            gaps[0, l] = 1
+        if align2[l] == '-':
+            gaps[1, l] = 1
+
+    # Return the alignments and gap matrix.
+    return align1, align2, gaps
+
+
+def needleman_wunsch_matrix(sequence1, sequence2):
+    """Construct the Needleman-Wunsch matrix for the given two sequences.
+
+    @param sequence1:   The first sequence.
+    @type sequence1:    str
+    @param sequence2:   The second sequence.
+    @type sequence2:    str
+    @return:            The Needleman-Wunsch matrix and traceback matrix.
+    @rtype:             numpy rank-2 int array, numpy rank-2 int array
+    """
+
+    # Initial scoring matrix.
+    matrix = zeros((len(sequence1)+1, len(sequence2)+1), int16)
+    for i in range(len(matrix)):
+        matrix[i, 0] = -i
+    for j in range(len(matrix[0])):
+        matrix[0, j] = -j
+
+    # Initial traceback matrix.
+    traceback_matrix = zeros((len(sequence1), len(sequence2)), int16)
+
+    # Calculate the scores.
+    for i in range(1, len(matrix)):
+        for j in range(1, len(matrix[0])):
+            # Substitution scores.
+            sub_score = SCORE_MISMATCH
+            if sequence1[i-1] == sequence2[j-1]:
+                sub_score = SCORE_MATCH
+
+            # The diagonal score.
+            SCORES[0] = matrix[i-1, j-1] + sub_score
+
+            # The top score.
+            SCORES[1] = matrix[i-1, j] + SCORE_GAP_PENALTY
+
+            # The left score.
+            SCORES[2] = matrix[i, j-1] + SCORE_GAP_PENALTY
+
+            # Store the best score.
+            matrix[i, j] = SCORES.max()
+
+            # Update the traceback matrix.
+            traceback_matrix[i-1, j-1] = SCORES.argmax()
+
+    # Return the matrix.
+    return matrix, traceback_matrix




Related Messages


Powered by MHonArc, Updated Wed Jan 21 11:40:02 2015