r27359 - /trunk/lib/structure/internal/coordinates.py -- January 29, 2015

Author: bugman
Date: Thu Jan 29 16:19:17 2015
New Revision: 27359

URL: http://svn.gna.org/viewcvs/relax?rev=27359&view=rev
Log:
The assemble_coord_array() function is now using the central star multiple 
sequence alignment.

This is the function from the lib.structure.internal.coordinates module used 
to assemble common
atomic coordinate information, used by the structure.align, 
structure.atomic_fluctuations,
structure.com, structure.displacement, structure.find_pivot, structure.mean, 
structure.rmsd,
structure.superimpose and structure.web_of_motion user functions.

The non-functional lib.structure.internal.coordinates.common_residues() 
function has been removed as
the lib.sequence_alignment.msa.central_star() function performs this 
functionality correctly.


Modified:
    trunk/lib/structure/internal/coordinates.py

Modified: trunk/lib/structure/internal/coordinates.py
URL: 
http://svn.gna.org/viewcvs/relax/trunk/lib/structure/internal/coordinates.py?rev=27359&r1=27358&r2=27359&view=diff
==============================================================================
--- trunk/lib/structure/internal/coordinates.py (original)
+++ trunk/lib/structure/internal/coordinates.py Thu Jan 29 16:19:17 2015
@@ -27,7 +27,7 @@
 
 # relax module imports.
 from lib.errors import RelaxFault
-from lib.sequence_alignment.align_protein import align_pairwise
+from lib.sequence_alignment.msa import central_star
 
 
 def assemble_coord_array(objects=None, object_names=None, molecules=None, 
models=None, atom_id=None, algorithm='NW70', matrix='BLOSUM62', 
gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, 
end_gap_extend_penalty=0.0, seq_info_flag=False):
@@ -160,21 +160,35 @@
     # The total number of molecules.
     num_mols = len(atom_names)
 
-    # Sequence alignment.
-    if algorithm == 'NW70':
-        print("\nPairwise sequence alignment to the first molecule:\n")
-        gap_matrices = []
-        for mol_index in range(1, num_mols):
-            print("Molecules 1-%i" % (mol_index+1))
-            align1, align2, gaps = align_pairwise(one_letter_codes[0], 
one_letter_codes[mol_index], algorithm=algorithm, matrix=matrix, 
gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, 
end_gap_open_penalty=end_gap_open_penalty, 
end_gap_extend_penalty=end_gap_extend_penalty)
-            gap_matrices.append(gaps)
-
-        # Determine the residues in common.
-        skip = common_residues(gap_matrices=gap_matrices, 
one_letter_codes=one_letter_codes)
-
-    # No alignment, so create an empty residue skipping data structure.
+    # Multiple sequence alignment.
+    if algorithm != None:
+        # Use the central star multiple alignment algorithm.
+        strings, gaps = central_star(one_letter_codes, algorithm=algorithm, 
matrix=matrix, gap_open_penalty=gap_open_penalty, 
gap_extend_penalty=gap_extend_penalty, 
end_gap_open_penalty=end_gap_open_penalty, 
end_gap_extend_penalty=end_gap_extend_penalty)
+
+        # Create the residue skipping data structure. 
+        skip = []
+        for mol_index in range(num_mols):
+            skip.append([])
+            for i in range(len(strings[0])):
+                # No residue in the current sequence.
+                if gaps[mol_index][i]:
+                    continue
+
+                # A gap in one of the other sequences.
+                gap = False
+                for mol_index2 in range(num_mols):
+                    if gaps[mol_index2][i]:
+                        gap = True
+
+                # Skip the residue.
+                if gap:
+                    skip[mol_index].append(1)
+                else:
+                    skip[mol_index].append(0)
+
+    # No alignment.
     else:
-        # Create
+        # Create an empty residue skipping data structure. 
         skip = []
         for mol_index in range(num_mols):
             skip.append([])
@@ -253,145 +267,6 @@
         return coord, ids
 
 
-def common_residues(gap_matrices=None, one_letter_codes=None, seq=False):
-    """Determine the common residues between all the pairwise alignments.
-
-    @keyword gap_matrices:      The list of gap matrices from the pairwise 
alignments.
-    @type gap_matrices:         list of numpy rank-2 arrays.
-    @keyword one_letter_codes:  The list of strings of one letter residue 
codes for each molecule.
-    @type one_letter_codes:     list of str
-    @keyword seq:               A flag which if True will cause the gapped 
sequence strings to be returned.
-    @type seq:                  bool
-    @return:                    The residue skipping data structure and the 
optional gapped sequence strings.  For the skipping structure, the first 
dimension corresponds to the molecule and the second to the residue.  A one 
means the residue should be skipped, whereas zero means keep the residue.
-    @rtype:                     list of list of int
-    """
-
-    # The number of molecules.
-    num_mols = len(gap_matrices) + 1
-
-    # Initialise the residue skipping structures.
-    skip = []
-    skip_counts = zeros(num_mols, int16)
-    res_counts = zeros(num_mols, int16)
-    for mol_index in range(num_mols):
-        res_counts[mol_index] = len(one_letter_codes[mol_index])
-        skip.append([])
-        for j in range(res_counts[mol_index]):
-            skip[mol_index].append(0)
-
-    # Update the residue skipping structures for the first molecule.
-    for mol_index in range(num_mols-1):
-        # Loop over the residues of alignment i.
-        seq_index = -1
-        for j in range(len(gap_matrices[mol_index][0])):
-            # Increment the sequence index.
-            if not gap_matrices[mol_index][0, j]:
-                seq_index += 1
-
-            # A gap in the second sequence, so skip the residue.
-            if gap_matrices[mol_index][1, j]:
-                skip[0][seq_index] = 1
-
-    # Initialise the gapped strings data structure for the first molecule.
-    gapped_strings = ['']
-    string_length = max(res_counts)
-    offsets = zeros((num_mols-1), int16)
-    prev_offsets = zeros((num_mols-1), int16)
-    for seq_index in range(res_counts[0]):
-        # Increment the offsets indices.
-        for mol_index in range(1, num_mols):
-            while gap_matrices[mol_index-1][0, 
seq_index+offsets[mol_index-1]]:
-                offsets[mol_index-1] += 1
-
-        # A gap.
-        for i in range(max(offsets - prev_offsets)):
-            gapped_strings[0] += "-"
-
-        # Missing in one of the other molecule.
-        missing = False
-        for mol_index in range(1, num_mols):
-            if gap_matrices[mol_index-1][1, seq_index+offsets[mol_index-1]]:
-                missing = True
-        if missing:
-            gapped_strings[0] += "-"
-
-        # Keep the residue.
-        else:
-            gapped_strings[0] += one_letter_codes[0][seq_index]
-
-        # Store the old offsets.
-        prev_offsets = offsets * 1
-
-    # Final padding.
-    for j in range(max(res_counts) - res_counts[0] - 1):
-        gapped_strings[0] += "-"
-
-    # Update the first molecule skip counts.
-    skip_counts[0] = sum(skip[0])
-
-    # Update the residue skipping structures for all other molecules.
-    for mol_index in range(1, num_mols):
-        # Loop over the residues of alignment mol_index.
-        seq1_index = -1
-        seq2_index = -1
-        gapped_strings.append('')
-        for j in range(len(gap_matrices[mol_index-1][0])):
-            # Increment the sequence indices.
-            if not gap_matrices[mol_index-1][0, j]:
-                seq1_index += 1
-            if not gap_matrices[mol_index-1][1, j]:
-                seq2_index += 1
-
-            # End condition for the first molecule.
-            if seq1_index >= res_counts[0]:
-                # Skip the rest of the second molecule.
-                for k in range(seq2_index, res_counts[mol_index]):
-                    skip[mol_index][k] = 1
-                    skip_counts[mol_index] += 1
-                    gapped_strings[mol_index] += "-"
-
-                # Terminate the loop.
-                break
-
-            # A gap in the first sequence, so skip the residue.
-            if gap_matrices[mol_index-1][0, j]:
-                skip[mol_index][seq2_index] = 1
-                skip_counts[mol_index] += 1
-                gapped_strings[mol_index] += "-"
-
-            # Already skipped in the first molecule.
-            elif skip[0][seq1_index] and not gap_matrices[mol_index-1][1, j]:
-                skip[mol_index][seq2_index] = 1
-                skip_counts[mol_index] += 1
-                gapped_strings[mol_index] += "-"
-
-            # Skipped in the second molecule.
-            elif gap_matrices[mol_index-1][1, j]:
-                gapped_strings[mol_index] += "-"
-
-            # Print out the one letter code.
-            else:
-                gapped_strings[mol_index] += 
one_letter_codes[mol_index][seq2_index]
-
-    # Printout.
-    print("Shared residues:")
-    for mol_index in range(num_mols):
-        print("Molecule %3i:  %s" % (mol_index, gapped_strings[mol_index]))
-
-    # Sanity checks.
-    total = res_counts[0] - skip_counts[0]
-    for mol_index in range(1, num_mols):
-        if total != res_counts[mol_index] - skip_counts[mol_index]:
-            print("\nThe total shared residue counts between molcule 1 and 
%i of %i and %i respectively do not match." % ((mol_index+1), total, 
res_counts[mol_index] - skip_counts[mol_index]))
-            raise RelaxFault
-
-    # Return the skipping data structure.
-    if seq:
-        return skip, gapped_strings
-    else:
-        return skip
-
-
 def loop_coord_structures(objects=None, molecules=None, models=None, 
atom_id=None):
     """Generator function for looping over all internal structural objects, 
models and molecules.
r27359 - /trunk/lib/structure/internal/coordinates.py

Header

Content

Related Messages