Author: bugman Date: Thu Jan 29 16:19:17 2015 New Revision: 27359 URL: http://svn.gna.org/viewcvs/relax?rev=27359&view=rev Log: The assemble_coord_array() function is now using the central star multiple sequence alignment. This is the function from the lib.structure.internal.coordinates module used to assemble common atomic coordinate information, used by the structure.align, structure.atomic_fluctuations, structure.com, structure.displacement, structure.find_pivot, structure.mean, structure.rmsd, structure.superimpose and structure.web_of_motion user functions. The non-functional lib.structure.internal.coordinates.common_residues() function has been removed as the lib.sequence_alignment.msa.central_star() function performs this functionality correctly. Modified: trunk/lib/structure/internal/coordinates.py Modified: trunk/lib/structure/internal/coordinates.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/structure/internal/coordinates.py?rev=27359&r1=27358&r2=27359&view=diff ============================================================================== --- trunk/lib/structure/internal/coordinates.py (original) +++ trunk/lib/structure/internal/coordinates.py Thu Jan 29 16:19:17 2015 @@ -27,7 +27,7 @@ # relax module imports. from lib.errors import RelaxFault -from lib.sequence_alignment.align_protein import align_pairwise +from lib.sequence_alignment.msa import central_star def assemble_coord_array(objects=None, object_names=None, molecules=None, models=None, atom_id=None, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0, seq_info_flag=False): @@ -160,21 +160,35 @@ # The total number of molecules. num_mols = len(atom_names) - # Sequence alignment. - if algorithm == 'NW70': - print("\nPairwise sequence alignment to the first molecule:\n") - gap_matrices = [] - for mol_index in range(1, num_mols): - print("Molecules 1-%i" % (mol_index+1)) - align1, align2, gaps = align_pairwise(one_letter_codes[0], one_letter_codes[mol_index], algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) - gap_matrices.append(gaps) - - # Determine the residues in common. - skip = common_residues(gap_matrices=gap_matrices, one_letter_codes=one_letter_codes) - - # No alignment, so create an empty residue skipping data structure. + # Multiple sequence alignment. + if algorithm != None: + # Use the central star multiple alignment algorithm. + strings, gaps = central_star(one_letter_codes, algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) + + # Create the residue skipping data structure. + skip = [] + for mol_index in range(num_mols): + skip.append([]) + for i in range(len(strings[0])): + # No residue in the current sequence. + if gaps[mol_index][i]: + continue + + # A gap in one of the other sequences. + gap = False + for mol_index2 in range(num_mols): + if gaps[mol_index2][i]: + gap = True + + # Skip the residue. + if gap: + skip[mol_index].append(1) + else: + skip[mol_index].append(0) + + # No alignment. else: - # Create + # Create an empty residue skipping data structure. skip = [] for mol_index in range(num_mols): skip.append([]) @@ -253,145 +267,6 @@ return coord, ids -def common_residues(gap_matrices=None, one_letter_codes=None, seq=False): - """Determine the common residues between all the pairwise alignments. - - @keyword gap_matrices: The list of gap matrices from the pairwise alignments. - @type gap_matrices: list of numpy rank-2 arrays. - @keyword one_letter_codes: The list of strings of one letter residue codes for each molecule. - @type one_letter_codes: list of str - @keyword seq: A flag which if True will cause the gapped sequence strings to be returned. - @type seq: bool - @return: The residue skipping data structure and the optional gapped sequence strings. For the skipping structure, the first dimension corresponds to the molecule and the second to the residue. A one means the residue should be skipped, whereas zero means keep the residue. - @rtype: list of list of int - """ - - # The number of molecules. - num_mols = len(gap_matrices) + 1 - - # Initialise the residue skipping structures. - skip = [] - skip_counts = zeros(num_mols, int16) - res_counts = zeros(num_mols, int16) - for mol_index in range(num_mols): - res_counts[mol_index] = len(one_letter_codes[mol_index]) - skip.append([]) - for j in range(res_counts[mol_index]): - skip[mol_index].append(0) - - # Update the residue skipping structures for the first molecule. - for mol_index in range(num_mols-1): - # Loop over the residues of alignment i. - seq_index = -1 - for j in range(len(gap_matrices[mol_index][0])): - # Increment the sequence index. - if not gap_matrices[mol_index][0, j]: - seq_index += 1 - - # A gap in the second sequence, so skip the residue. - if gap_matrices[mol_index][1, j]: - skip[0][seq_index] = 1 - - # Initialise the gapped strings data structure for the first molecule. - gapped_strings = [''] - string_length = max(res_counts) - offsets = zeros((num_mols-1), int16) - prev_offsets = zeros((num_mols-1), int16) - for seq_index in range(res_counts[0]): - # Increment the offsets indices. - for mol_index in range(1, num_mols): - while gap_matrices[mol_index-1][0, seq_index+offsets[mol_index-1]]: - offsets[mol_index-1] += 1 - - # A gap. - for i in range(max(offsets - prev_offsets)): - gapped_strings[0] += "-" - - # Missing in one of the other molecule. - missing = False - for mol_index in range(1, num_mols): - if gap_matrices[mol_index-1][1, seq_index+offsets[mol_index-1]]: - missing = True - if missing: - gapped_strings[0] += "-" - - # Keep the residue. - else: - gapped_strings[0] += one_letter_codes[0][seq_index] - - # Store the old offsets. - prev_offsets = offsets * 1 - - # Final padding. - for j in range(max(res_counts) - res_counts[0] - 1): - gapped_strings[0] += "-" - - # Update the first molecule skip counts. - skip_counts[0] = sum(skip[0]) - - # Update the residue skipping structures for all other molecules. - for mol_index in range(1, num_mols): - # Loop over the residues of alignment mol_index. - seq1_index = -1 - seq2_index = -1 - gapped_strings.append('') - for j in range(len(gap_matrices[mol_index-1][0])): - # Increment the sequence indices. - if not gap_matrices[mol_index-1][0, j]: - seq1_index += 1 - if not gap_matrices[mol_index-1][1, j]: - seq2_index += 1 - - # End condition for the first molecule. - if seq1_index >= res_counts[0]: - # Skip the rest of the second molecule. - for k in range(seq2_index, res_counts[mol_index]): - skip[mol_index][k] = 1 - skip_counts[mol_index] += 1 - gapped_strings[mol_index] += "-" - - # Terminate the loop. - break - - # A gap in the first sequence, so skip the residue. - if gap_matrices[mol_index-1][0, j]: - skip[mol_index][seq2_index] = 1 - skip_counts[mol_index] += 1 - gapped_strings[mol_index] += "-" - - # Already skipped in the first molecule. - elif skip[0][seq1_index] and not gap_matrices[mol_index-1][1, j]: - skip[mol_index][seq2_index] = 1 - skip_counts[mol_index] += 1 - gapped_strings[mol_index] += "-" - - # Skipped in the second molecule. - elif gap_matrices[mol_index-1][1, j]: - gapped_strings[mol_index] += "-" - - # Print out the one letter code. - else: - gapped_strings[mol_index] += one_letter_codes[mol_index][seq2_index] - - # Printout. - print("Shared residues:") - for mol_index in range(num_mols): - print("Molecule %3i: %s" % (mol_index, gapped_strings[mol_index])) - - # Sanity checks. - total = res_counts[0] - skip_counts[0] - for mol_index in range(1, num_mols): - if total != res_counts[mol_index] - skip_counts[mol_index]: - print("\nThe total shared residue counts between molcule 1 and %i of %i and %i respectively do not match." % ((mol_index+1), total, res_counts[mol_index] - skip_counts[mol_index])) - raise RelaxFault - - # Return the skipping data structure. - if seq: - return skip, gapped_strings - else: - return skip - - def loop_coord_structures(objects=None, molecules=None, models=None, atom_id=None): """Generator function for looping over all internal structural objects, models and molecules.