Author: bugman Date: Thu Jan 29 15:48:34 2015 New Revision: 27356 URL: http://svn.gna.org/viewcvs/relax?rev=27356&view=rev Log: Complete implementation of the central star multiple sequence alignment algorithm. This includes all the four major steps - pairwise alignment between all sequence pairs, finding the central sequence, iteratively aligning the sequences to the gapped central sequence, and introducing gaps in previous alignments during the iterative alignment. The correctness of the implementation is verified by the Test_msa.test_central_star unit test of the _lib._sequence_alignment.test_msa module. Modified: trunk/lib/sequence_alignment/msa.py Modified: trunk/lib/sequence_alignment/msa.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/msa.py?rev=27356&r1=27355&r2=27356&view=diff ============================================================================== --- trunk/lib/sequence_alignment/msa.py (original) +++ trunk/lib/sequence_alignment/msa.py Thu Jan 29 15:48:34 2015 @@ -23,13 +23,11 @@ """Multiple sequence alignment (MSA) algorithms.""" # Python module imports. -from numpy import int16, zeros +from numpy import float64, int16, zeros import sys # relax module imports. -from lib.errors import RelaxError -from lib.sequence_alignment.needleman_wunsch import needleman_wunsch_align -from lib.sequence_alignment.substitution_matrices import BLOSUM62, BLOSUM62_SEQ, PAM250, PAM250_SEQ +from lib.sequence_alignment.align_protein import align_pairwise def central_star(sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0): @@ -49,75 +47,114 @@ @type end_gap_open_penalty: float @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. @type end_gap_extend_penalty: float - @return: The two alignment strings and the gap matrix. - @rtype: str, str, numpy rank-2 int array + @return: The list of alignment strings and the gap matrix. + @rtype: list of str, numpy rank-2 int array """ - # The pairwise alignments. + # Initialise. N = len(sequences) - align1_list = [] - align2_list = [] - gap_list = [] + scores = zeros((N, N), float64) + + # Set up lists of lists for storing all alignment strings. + align1_matrix = [] + align2_matrix = [] for i in range(N): - # Pairwise alignment. - align1, align2, gaps = align_pairwise(reference_sequence, sequences[i], algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) + align1_matrix.append([]) + align2_matrix.append([]) + for j in range(N): + if i == j: + align1_matrix[i].append(sequences[i]) + align2_matrix[i].append(sequences[i]) + else: + align1_matrix[i].append(None) + align2_matrix[i].append(None) - # Store the results. - align1_list.append(align1) - align2_list.append(align2) - gap_list.append(gaps) + # Printout. + sys.stdout.write("Central Star multiple sequence alignment.\n\n") + sys.stdout.write("%-30s %s\n" % ("Pairwise algorithm:", algorithm)) + sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix)) + sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty)) + sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty)) + sys.stdout.write("Initial sequences:\n") + for i in range(N): + sys.stdout.write("%3i %s\n" % (i+1, sequences[i])) - # Convert all sequence strings to lists. - ref_list = list(reference_sequence) - seq_list = [] + # All pairwise alignments. + sys.stdout.write("\nDetermining the scores for all pairwise alignments:\n") for i in range(N): - seq_list.append(list(sequences[i])) + for j in range(i+1, N): + # Align the pair. + sys.stdout.write("%-30s " % ("Sequences %i-%i:" % (i+1, j+1))) + score, align1, align2, gaps = align_pairwise(sequences[i], sequences[j], algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0) + sys.stdout.write("%10.1f\n" % score) - # Loop over the alignment elements. + # Store the score and alignment strings. + scores[i, j] = scores[j, i] = score + align1_matrix[i][j] = align1_matrix[j][i] = align1 + align2_matrix[i][j] = align2_matrix[j][i] = align2 + + # The central sequence. + sys.stdout.write("\nDetermining the central sequence:\n") + sum_scores = scores.sum(0) + Sc_sum_score = 1e100 + Sc_index = 0 + for i in range(N): + if sum_scores[i] < Sc_sum_score: + Sc_sum_score = sum_scores[i] + Sc_index = i + sys.stdout.write("%-30s %10.1f\n" % (("Sum of scores, sequence %i:" % (i+1)), sum_scores[i])) + sys.stdout.write("%-30s %i\n" % ("Central sequence:", Sc_index+1)) + + # Partition the sequences. + Sc = sequences[Sc_index] + Si = [] + for i in range(N): + if i != Sc_index: + Si.append(sequences[i]) + + # Optimal alignments. + sys.stdout.write("\nDetermining the iterative optimal alignments:\n") + Sc_prime = Sc + string_lists = [] + for i in range(N-1): + # Update the string lists. + string_lists.append([]) + + # Perform the pairwise alignment between Sc' and Si, replacing all '-' with 'X'. + score, Sc_prime, Si_prime, gaps = align_pairwise(Sc_prime.replace('-', 'X'), Si[i].replace('-', 'X'), algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0) + sys.stdout.write("\n%-30s %s\n" % ("Sequence Sc':", Sc_prime.replace('X', '-'))) + sys.stdout.write("%-30s %s\n" % (("Sequence S%i':" % (i+1)), Si_prime.replace('X', '-'))) + + # Store the Si alignment. + for j in range(len(Sc_prime)): + string_lists[i].append(Si_prime[j]) + + # Add spaces to the lists for all previous alignments. + else: + # Find gaps in the central sequence. + for j in range(len(Sc_prime)): + if Sc_prime[j] == '-': + # Pad the previous alignments. + for k in range(0, i): + string_lists[k].insert(j, '-') + + # Rebuild the alignment lists and create a gap matrix. strings = [] - index = -1 - offsets = zeros(N, int16) - while 1: - # Increment the index. - index += 1 - print "\nIndex %i" % index + M = len(Sc_prime) + gaps = zeros((N, M), int16) + strings.append(Sc_prime) + for i in range(N-1): + strings.append(''.join(string_lists[i])) + for i in range(N): + strings[i] = strings[i].replace('X', '-') + for j in range(M): + if strings[i][j] == '-': + gaps[i, j] = 1 - # Termination condition. - term = True - for i in range(N): - if index + offsets[i] < len(gap_list[i][0]): - term = False - else: - print " At end in %i" % i - if term: - break - - # A gap in one of the references. - gap = False - for i in range(N): - if index + offsets[i] >= len(gap_list[i][0]) or gap_list[i][0, index]: - print " Gap in %i" % i - offsets[i] += 1 - gap = True - print " New offsets %s" % offsets - - # No reference gap. - if not gap: - print " No ref gap." - continue - - # Insert the gap into the reference list. - print " Inserting gap into ref list at %i" % index - ref_list.insert(index, '-') - + # Final printout. + sys.stdout.write("\nFinal MSA:\n") for i in range(N): - seq = ''.join(seq_list[i]) - strings.append(seq) - - ref = ''.join(ref_list) - - print ref_list - print seq_list + sys.stdout.write("%3i %s\n" % (i+1, strings[i])) # Return the results. - return [ref] + strings, gap_list + return strings, gaps