r27356 - /trunk/lib/sequence_alignment/msa.py -- January 29, 2015

Author: bugman
Date: Thu Jan 29 15:48:34 2015
New Revision: 27356

URL: http://svn.gna.org/viewcvs/relax?rev=27356&view=rev
Log:
Complete implementation of the central star multiple sequence alignment 
algorithm.

This includes all the four major steps - pairwise alignment between all 
sequence pairs, finding the
central sequence, iteratively aligning the sequences to the gapped central 
sequence, and introducing
gaps in previous alignments during the iterative alignment.

The correctness of the implementation is verified by the 
Test_msa.test_central_star unit test of the
_lib._sequence_alignment.test_msa module.


Modified:
    trunk/lib/sequence_alignment/msa.py

Modified: trunk/lib/sequence_alignment/msa.py
URL: 
http://svn.gna.org/viewcvs/relax/trunk/lib/sequence_alignment/msa.py?rev=27356&r1=27355&r2=27356&view=diff
==============================================================================
--- trunk/lib/sequence_alignment/msa.py (original)
+++ trunk/lib/sequence_alignment/msa.py Thu Jan 29 15:48:34 2015
@@ -23,13 +23,11 @@
 """Multiple sequence alignment (MSA) algorithms."""
 
 # Python module imports.
-from numpy import int16, zeros
+from numpy import float64, int16, zeros
 import sys
 
 # relax module imports.
-from lib.errors import RelaxError
-from lib.sequence_alignment.needleman_wunsch import needleman_wunsch_align
-from lib.sequence_alignment.substitution_matrices import BLOSUM62, 
BLOSUM62_SEQ, PAM250, PAM250_SEQ
+from lib.sequence_alignment.align_protein import align_pairwise
 
 
 def central_star(sequences, algorithm='NW70', matrix='BLOSUM62', 
gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, 
end_gap_extend_penalty=0.0):
@@ -49,75 +47,114 @@
     @type end_gap_open_penalty:         float
     @keyword end_gap_extend_penalty:    The optional penalty for extending a 
gap at the end of a sequence.
     @type end_gap_extend_penalty:       float
-    @return:                            The two alignment strings and the 
gap matrix.
-    @rtype:                             str, str, numpy rank-2 int array
+    @return:                            The list of alignment strings and 
the gap matrix.
+    @rtype:                             list of str, numpy rank-2 int array
     """
 
-    # The pairwise alignments.
+    # Initialise.
     N = len(sequences)
-    align1_list = []
-    align2_list = []
-    gap_list = []
+    scores = zeros((N, N), float64)
+
+    # Set up lists of lists for storing all alignment strings.
+    align1_matrix = []
+    align2_matrix = []
     for i in range(N):
-        # Pairwise alignment.
-        align1, align2, gaps = align_pairwise(reference_sequence, 
sequences[i], algorithm=algorithm, matrix=matrix, 
gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, 
end_gap_open_penalty=end_gap_open_penalty, 
end_gap_extend_penalty=end_gap_extend_penalty)
+        align1_matrix.append([])
+        align2_matrix.append([])
+        for j in range(N):
+            if i == j:
+                align1_matrix[i].append(sequences[i])
+                align2_matrix[i].append(sequences[i])
+            else:
+                align1_matrix[i].append(None)
+                align2_matrix[i].append(None)
 
-        # Store the results.
-        align1_list.append(align1)
-        align2_list.append(align2)
-        gap_list.append(gaps)
+    # Printout.
+    sys.stdout.write("Central Star multiple sequence alignment.\n\n")
+    sys.stdout.write("%-30s %s\n" % ("Pairwise algorithm:", algorithm))
+    sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix))
+    sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", 
gap_open_penalty))
+    sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", 
gap_extend_penalty))
+    sys.stdout.write("Initial sequences:\n")
+    for i in range(N):
+        sys.stdout.write("%3i %s\n" % (i+1, sequences[i]))
 
-    # Convert all sequence strings to lists.
-    ref_list = list(reference_sequence)
-    seq_list = []
+    # All pairwise alignments.
+    sys.stdout.write("\nDetermining the scores for all pairwise 
alignments:\n")
     for i in range(N):
-        seq_list.append(list(sequences[i]))
+        for j in range(i+1, N):
+            # Align the pair.
+            sys.stdout.write("%-30s " % ("Sequences %i-%i:" % (i+1, j+1)))
+            score, align1, align2, gaps = align_pairwise(sequences[i], 
sequences[j], algorithm=algorithm, matrix=matrix, 
gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, 
end_gap_open_penalty=end_gap_open_penalty, 
end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0)
+            sys.stdout.write("%10.1f\n" % score)
 
-    # Loop over the alignment elements.
+            # Store the score and alignment strings.
+            scores[i, j] = scores[j, i] = score
+            align1_matrix[i][j] = align1_matrix[j][i] = align1
+            align2_matrix[i][j] = align2_matrix[j][i] = align2
+
+    # The central sequence.
+    sys.stdout.write("\nDetermining the central sequence:\n")
+    sum_scores = scores.sum(0)
+    Sc_sum_score = 1e100
+    Sc_index = 0
+    for i in range(N):
+        if sum_scores[i] < Sc_sum_score:
+            Sc_sum_score = sum_scores[i]
+            Sc_index = i
+        sys.stdout.write("%-30s %10.1f\n" % (("Sum of scores, sequence %i:" 
% (i+1)), sum_scores[i]))
+    sys.stdout.write("%-30s %i\n" % ("Central sequence:", Sc_index+1))
+
+    # Partition the sequences.
+    Sc = sequences[Sc_index]
+    Si = []
+    for i in range(N):
+        if i != Sc_index:
+            Si.append(sequences[i])
+
+    # Optimal alignments.
+    sys.stdout.write("\nDetermining the iterative optimal alignments:\n")
+    Sc_prime = Sc
+    string_lists = []
+    for i in range(N-1):
+        # Update the string lists.
+        string_lists.append([])
+
+        # Perform the pairwise alignment between Sc' and Si, replacing all 
'-' with 'X'.
+        score, Sc_prime, Si_prime, gaps = 
align_pairwise(Sc_prime.replace('-', 'X'), Si[i].replace('-', 'X'), 
algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, 
gap_extend_penalty=gap_extend_penalty, 
end_gap_open_penalty=end_gap_open_penalty, 
end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0)
+        sys.stdout.write("\n%-30s %s\n" % ("Sequence Sc':", 
Sc_prime.replace('X', '-')))
+        sys.stdout.write("%-30s %s\n" % (("Sequence S%i':" % (i+1)), 
Si_prime.replace('X', '-')))
+
+        # Store the Si alignment.
+        for j in range(len(Sc_prime)):
+            string_lists[i].append(Si_prime[j])
+
+        # Add spaces to the lists for all previous alignments.
+        else:
+            # Find gaps in the central sequence.
+            for j in range(len(Sc_prime)):
+                if Sc_prime[j] == '-':
+                    # Pad the previous alignments.
+                    for k in range(0, i):
+                        string_lists[k].insert(j, '-')
+
+    # Rebuild the alignment lists and create a gap matrix.
     strings = []
-    index = -1
-    offsets = zeros(N, int16)
-    while 1:
-        # Increment the index.
-        index += 1
-        print "\nIndex %i" % index
+    M = len(Sc_prime)
+    gaps = zeros((N, M), int16)
+    strings.append(Sc_prime)
+    for i in range(N-1):
+        strings.append(''.join(string_lists[i]))
+    for i in range(N):
+        strings[i] = strings[i].replace('X', '-')
+        for j in range(M):
+            if strings[i][j] == '-':
+                gaps[i, j] = 1
 
-        # Termination condition.
-        term = True
-        for i in range(N):
-            if index + offsets[i] < len(gap_list[i][0]):
-                term = False
-            else:
-                print "    At end in %i" % i
-        if term:
-            break
-
-        # A gap in one of the references.
-        gap = False
-        for i in range(N):
-            if index + offsets[i] >= len(gap_list[i][0]) or gap_list[i][0, 
index]:
-                print "    Gap in %i" % i
-                offsets[i] += 1
-                gap = True
-        print "    New offsets %s" % offsets
-
-        # No reference gap.
-        if not gap:
-            print "    No ref gap."
-            continue
-
-        # Insert the gap into the reference list.
-        print "    Inserting gap into ref list at %i" % index
-        ref_list.insert(index, '-')
-
+    # Final printout.
+    sys.stdout.write("\nFinal MSA:\n")
     for i in range(N):
-        seq = ''.join(seq_list[i])
-        strings.append(seq)
-
-    ref = ''.join(ref_list)
-
-    print ref_list
-    print seq_list
+        sys.stdout.write("%3i %s\n" % (i+1, strings[i]))
 
     # Return the results.
-    return [ref] + strings, gap_list
+    return strings, gaps
r27356 - /trunk/lib/sequence_alignment/msa.py

Header

Content

Related Messages