Author: bugman Date: Thu Jan 29 13:37:46 2015 New Revision: 27350 URL: http://svn.gna.org/viewcvs/relax?rev=27350&view=rev Log: Initial lib.sequence_alignment.msa.central_star() function. This was moved from lib.sequence_alignment.align_protein.align_multiple_from_pairwise(). Added: trunk/lib/sequence_alignment/msa.py - copied, changed from r27349, trunk/lib/sequence_alignment/align_protein.py Modified: trunk/lib/sequence_alignment/__init__.py trunk/lib/sequence_alignment/align_protein.py Modified: trunk/lib/sequence_alignment/__init__.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/__init__.py?rev=27350&r1=27349&r2=27350&view=diff ============================================================================== --- trunk/lib/sequence_alignment/__init__.py (original) +++ trunk/lib/sequence_alignment/__init__.py Thu Jan 29 13:37:46 2015 @@ -24,6 +24,7 @@ __all__ = [ 'align_protein', + 'msa', 'needleman_wunsch', 'substitution_matrices' ] Modified: trunk/lib/sequence_alignment/align_protein.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/align_protein.py?rev=27350&r1=27349&r2=27350&view=diff ============================================================================== --- trunk/lib/sequence_alignment/align_protein.py (original) +++ trunk/lib/sequence_alignment/align_protein.py Thu Jan 29 13:37:46 2015 @@ -23,106 +23,12 @@ """General sequence alignment functions.""" # Python module imports. -from numpy import int16, zeros import sys # relax module imports. from lib.errors import RelaxError from lib.sequence_alignment.needleman_wunsch import needleman_wunsch_align from lib.sequence_alignment.substitution_matrices import BLOSUM62, BLOSUM62_SEQ, PAM250, PAM250_SEQ - - -def align_multiple_from_pairwise(reference_sequence, sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0): - """Align multiple protein sequences to one reference by fusing multiple pairwise alignments. - - @param reference_sequence: The first protein sequence as one letter codes to use as the reference. - @type reference_sequence: str - @param sequences: The list of remaining protein sequences as one letter codes. - @type sequences: list of str - @keyword algorithm: The pairwise sequence alignment algorithm to use. - @type algorithm: str - @keyword matrix: The substitution matrix to use. - @type matrix: str - @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. - @type gap_open_penalty: float - @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. - @type gap_extend_penalty: float - @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. - @type end_gap_open_penalty: float - @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. - @type end_gap_extend_penalty: float - @return: The two alignment strings and the gap matrix. - @rtype: str, str, numpy rank-2 int array - """ - - # The pairwise alignments. - N = len(sequences) - align1_list = [] - align2_list = [] - gap_list = [] - for i in range(N): - # Pairwise alignment. - align1, align2, gaps = align_pairwise(reference_sequence, sequences[i], algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) - - # Store the results. - align1_list.append(align1) - align2_list.append(align2) - gap_list.append(gaps) - - # Convert all sequence strings to lists. - ref_list = list(reference_sequence) - seq_list = [] - for i in range(N): - seq_list.append(list(sequences[i])) - - # Loop over the alignment elements. - strings = [] - index = -1 - offsets = zeros(N, int16) - while 1: - # Increment the index. - index += 1 - print "\nIndex %i" % index - - # Termination condition. - term = True - for i in range(N): - if index + offsets[i] < len(gap_list[i][0]): - term = False - else: - print " At end in %i" % i - if term: - break - - # A gap in one of the references. - gap = False - for i in range(N): - if index + offsets[i] >= len(gap_list[i][0]) or gap_list[i][0, index]: - print " Gap in %i" % i - offsets[i] += 1 - gap = True - print " New offsets %s" % offsets - - # No reference gap. - if not gap: - print " No ref gap." - continue - - # Insert the gap into the reference list. - print " Inserting gap into ref list at %i" % index - ref_list.insert(index, '-') - - for i in range(N): - seq = ''.join(seq_list[i]) - strings.append(seq) - - ref = ''.join(ref_list) - - print ref_list - print seq_list - - # Return the results. - return [ref] + strings, gap_list def align_pairwise(sequence1, sequence2, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0): Copied: trunk/lib/sequence_alignment/msa.py (from r27349, trunk/lib/sequence_alignment/align_protein.py) URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/msa.py?p2=trunk/lib/sequence_alignment/msa.py&p1=trunk/lib/sequence_alignment/align_protein.py&r1=27349&r2=27350&rev=27350&view=diff ============================================================================== --- trunk/lib/sequence_alignment/align_protein.py (original) +++ trunk/lib/sequence_alignment/msa.py Thu Jan 29 13:37:46 2015 @@ -20,7 +20,7 @@ ############################################################################### # Module docstring. -"""General sequence alignment functions.""" +"""Multiple sequence alignment (MSA) algorithms.""" # Python module imports. from numpy import int16, zeros @@ -32,12 +32,10 @@ from lib.sequence_alignment.substitution_matrices import BLOSUM62, BLOSUM62_SEQ, PAM250, PAM250_SEQ -def align_multiple_from_pairwise(reference_sequence, sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0): +def central_star(sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0): """Align multiple protein sequences to one reference by fusing multiple pairwise alignments. - @param reference_sequence: The first protein sequence as one letter codes to use as the reference. - @type reference_sequence: str - @param sequences: The list of remaining protein sequences as one letter codes. + @param sequences: The list of residue sequences as one letter codes. @type sequences: list of str @keyword algorithm: The pairwise sequence alignment algorithm to use. @type algorithm: str @@ -123,73 +121,3 @@ # Return the results. return [ref] + strings, gap_list - - -def align_pairwise(sequence1, sequence2, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0): - """Align two protein sequences. - - @param sequence1: The first protein sequence as one letter codes. - @type sequence1: str - @param sequence2: The second protein sequence as one letter codes. - @type sequence2: str - @keyword algorithm: The pairwise sequence alignment algorithm to use. - @type algorithm: str - @keyword matrix: The substitution matrix to use. - @type matrix: str - @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. - @type gap_open_penalty: float - @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. - @type gap_extend_penalty: float - @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. - @type end_gap_open_penalty: float - @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. - @type end_gap_extend_penalty: float - @return: The two alignment strings and the gap matrix. - @rtype: str, str, numpy rank-2 int array - """ - - # Checks. - allowed_algor = ['NW70'] - if algorithm not in allowed_algor: - raise RelaxError("The sequence alignment algorithm '%s' is unknown, it must be one of %s." % (algorithm, allowed_algor)) - allowed_matrices = ['BLOSUM62', 'PAM250'] - if matrix not in allowed_matrices: - raise RelaxError("The substitution matrix '%s' is unknown, it must be one of %s." % (matrix, allowed_matrices)) - - # Capitalise the sequences. - sequence1 = sequence1.upper() - sequence2 = sequence2.upper() - - # Initial printout. - sys.stdout.write("\nPairwise protein alignment.\n") - sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix)) - sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty)) - sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty)) - sys.stdout.write("\n%-30s %s\n" % ("Input sequence 1:", sequence1)) - sys.stdout.write("%-30s %s\n" % ("Input sequence 2:", sequence2)) - - # Select the substitution matrix. - if matrix == 'BLOSUM62': - sub_matrix = BLOSUM62 - sub_seq = BLOSUM62_SEQ - elif matrix == 'PAM250': - sub_matrix = PAM250 - sub_seq = PAM250_SEQ - - # The alignment. - if algorithm == 'NW70': - align1, align2, gaps = needleman_wunsch_align(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) - - # Final printout. - sys.stdout.write("\n%-30s %s\n" % ("Aligned sequence 1:", align1)) - sys.stdout.write("%-30s %s\n" % ("Aligned sequence 2:", align2)) - sys.stdout.write("%-30s " % "") - for i in range(len(align1)): - if align1[i] == align2[i]: - sys.stdout.write("*") - else: - sys.stdout.write(" ") - sys.stdout.write("\n\n") - - # Return the results. - return align1, align2, gaps