Author: bugman Date: Wed Jan 21 15:40:56 2015 New Revision: 27258 URL: http://svn.gna.org/viewcvs/relax?rev=27258&view=rev Log: Created the lib.sequence_alignment.align_protein module for the sequence alignment of proteins. This general module currently implements the align_pairwise() function for the pairwise alignment of protein sequences. It provides the infrastructure for specifying gap starting and extension penalties, choosing the alignment algorithm (currently only the Needleman-Wunsch sequence alignment algorithm as 'NW70'), and choosing the substitution matrix (currently only BLOSUM62). The function provides lots of printouts for user feedback. Added: trunk/lib/sequence_alignment/align_protein.py Modified: trunk/lib/sequence_alignment/__init__.py Modified: trunk/lib/sequence_alignment/__init__.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/__init__.py?rev=27258&r1=27257&r2=27258&view=diff ============================================================================== --- trunk/lib/sequence_alignment/__init__.py (original) +++ trunk/lib/sequence_alignment/__init__.py Wed Jan 21 15:40:56 2015 @@ -23,6 +23,7 @@ """The relax-lib sequence alignment package - a library of functions for aligning proteins, DNA, RNA or other molecules.""" __all__ = [ + 'align_protein', 'needleman_wunsch', 'substitution_matrices' ] Added: trunk/lib/sequence_alignment/align_protein.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/align_protein.py?rev=27258&view=auto ============================================================================== --- trunk/lib/sequence_alignment/align_protein.py (added) +++ trunk/lib/sequence_alignment/align_protein.py Wed Jan 21 15:40:56 2015 @@ -0,0 +1,98 @@ +############################################################################### +# # +# Copyright (C) 2015 Edward d'Auvergne # +# # +# This file is part of the program relax (http://www.nmr-relax.com). # +# # +# This program is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with this program. If not, see <http://www.gnu.org/licenses/>. # +# # +############################################################################### + +# Module docstring. +"""General sequence alignment functions.""" + +# Python module imports. +from string import upper +import sys +from warnings import warn + +# relax module imports. +from lib.errors import RelaxError +from lib.sequence_alignment.needleman_wunsch import needleman_wunsch_align +from lib.sequence_alignment.substitution_matrices import BLOSUM62, BLOSUM62_SEQ +from lib.warnings import RelaxWarning + + +def align_pairwise(sequence1, sequence2, algorithm='NW70', matrix='BLOSUM62', gap_penalty=1.0, extend_penalty=1.0): + """Align two protein sequences. + + @param sequence1: The first protein sequence as one letter codes. + @type sequence1: str + @param sequence2: The second protein sequence as one letter codes. + @type sequence2: str + @keyword algorithm: The alignment algorithm to use. + @type algorithm: str + @keyword matrix: The substitution matrix to use. + @type matrix: str + @keyword gap_penalty: The penalty for introducing gaps, as a positive number. + @type gap_penalty: float + @keyword extend_penalty: The penalty for extending a gap, as a positive number. + @type extend_penalty: float + """ + + # Checks. + allowed_algor = ['NW70'] + if algorithm not in allowed_algor: + raise RelaxError("The sequence alignment algorithm '%s' is unknown, it must be one of %s." % (algorithm, allowed_algor)) + allowed_matrices = ['BLOSUM62'] + if matrix not in allowed_matrices: + raise RelaxError("The substitution matrix '%s' is unknown, it must be one of %s." % (matrix, allowed_matrices)) + + # Capitalise the sequences. + sequence1 = upper(sequence1) + sequence2 = upper(sequence2) + + # Turn off the extension penalty for algorithms not supporting it. + if extend_penalty != None and algorithm in ['NW70']: + warn(RelaxWarning("The gap extension penalty is not supported by the Needleman-Wunsch sequence alignment algorithm.")) + extend_penalty = None + + # Initial printout. + sys.stdout.write("\nPairwise protein alignment.\n") + sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix)) + sys.stdout.write("%-30s %s\n" % ("Gap penalty:", gap_penalty)) + sys.stdout.write("%-30s %s\n" % ("Extend penalty:", extend_penalty)) + sys.stdout.write("\n%-30s %s\n" % ("Input sequence 1:", sequence1)) + sys.stdout.write("%-30s %s\n" % ("Input sequence 2:", sequence2)) + + # Select the substitution matrix. + if matrix == 'BLOSUM62': + sub_matrix = BLOSUM62 + sub_seq = BLOSUM62_SEQ + + # The alignment. + if algorithm == 'NW70': + align1, align2, gaps = needleman_wunsch_align(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_penalty=gap_penalty) + + # Final printout. + sys.stdout.write("\n%-30s %s\n" % ("Aligned sequence 1:", align1)) + sys.stdout.write("%-30s %s\n" % ("Aligned sequence 2:", align2)) + sys.stdout.write("%-30s " % "") + for i in range(len(align1)): + if align1[i] == align2[i]: + sys.stdout.write("*") + else: + sys.stdout.write(" ") + sys.stdout.write("\n\n") +