lib.sequence_alignment.msa

1 ############################################################################### 2 # # 3 # Copyright (C) 2015 Edward d'Auvergne # 4 # # 5 # This file is part of the program relax (http://www.nmr-relax.com). # 6 # # 7 # This program is free software: you can redistribute it and/or modify # 8 # it under the terms of the GNU General Public License as published by # 9 # the Free Software Foundation, either version 3 of the License, or # 10 # (at your option) any later version. # 11 # # 12 # This program is distributed in the hope that it will be useful, # 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of # 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15 # GNU General Public License for more details. # 16 # # 17 # You should have received a copy of the GNU General Public License # 18 # along with this program. If not, see <http://www.gnu.org/licenses/>. # 19 # # 20 ############################################################################### 21 22 # Module docstring. 23 """Multiple sequence alignment (MSA) algorithms.""" 24 25 # Python module imports. 26 from numpy import float64, int16, zeros 27 import sys 28 29 # relax module imports. 30 from lib.errors import RelaxError 31 from lib.sequence_alignment.align_protein import align_pairwise 32 33

34 -def central_star(sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0):

35 """Align multiple protein sequences to one reference by fusing multiple pairwise alignments. 36 37 @param sequences: The list of residue sequences as one letter codes. 38 @type sequences: list of str 39 @keyword algorithm: The pairwise sequence alignment algorithm to use. 40 @type algorithm: str 41 @keyword matrix: The substitution matrix to use. 42 @type matrix: str 43 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. 44 @type gap_open_penalty: float 45 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. 46 @type gap_extend_penalty: float 47 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. 48 @type end_gap_open_penalty: float 49 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. 50 @type end_gap_extend_penalty: float 51 @return: The list of alignment strings and the gap matrix. 52 @rtype: list of str, numpy rank-2 int array 53 """ 54 55 # Initialise. 56 N = len(sequences) 57 scores = zeros((N, N), float64) 58 59 # Set up lists of lists for storing all alignment strings. 60 align1_matrix = [] 61 align2_matrix = [] 62 for i in range(N): 63 align1_matrix.append([]) 64 align2_matrix.append([]) 65 for j in range(N): 66 if i == j: 67 align1_matrix[i].append(sequences[i]) 68 align2_matrix[i].append(sequences[i]) 69 else: 70 align1_matrix[i].append(None) 71 align2_matrix[i].append(None) 72 73 # Printout. 74 sys.stdout.write("\nCentral Star multiple sequence alignment.\n\n") 75 sys.stdout.write("%-30s %s\n" % ("Pairwise algorithm:", algorithm)) 76 sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix)) 77 sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty)) 78 sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty)) 79 sys.stdout.write("Initial sequences:\n") 80 for i in range(N): 81 sys.stdout.write("%3i %s\n" % (i+1, sequences[i])) 82 83 # All pairwise alignments. 84 sys.stdout.write("\nDetermining the scores for all pairwise alignments:\n") 85 for i in range(N): 86 for j in range(i+1, N): 87 # Align the pair. 88 sys.stdout.write("%-30s " % ("Sequences %i-%i:" % (i+1, j+1))) 89 score, align1, align2, gaps = align_pairwise(sequences[i], sequences[j], algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0) 90 sys.stdout.write("%10.1f\n" % score) 91 92 # Store the score and alignment strings. 93 scores[i, j] = scores[j, i] = score 94 align1_matrix[i][j] = align1_matrix[j][i] = align1 95 align2_matrix[i][j] = align2_matrix[j][i] = align2 96 97 # The central sequence. 98 sys.stdout.write("\nDetermining the central sequence:\n") 99 sum_scores = scores.sum(0) 100 Sc_sum_score = 1e100 101 Sc_index = 0 102 for i in range(N): 103 if sum_scores[i] < Sc_sum_score: 104 Sc_sum_score = sum_scores[i] 105 Sc_index = i 106 sys.stdout.write("%-30s %10.1f\n" % (("Sum of scores, sequence %i:" % (i+1)), sum_scores[i])) 107 sys.stdout.write("%-30s %i\n" % ("Central sequence:", Sc_index+1)) 108 109 # Partition the sequences. 110 Sc = sequences[Sc_index] 111 Si = [] 112 for i in range(N): 113 if i != Sc_index: 114 Si.append(sequences[i]) 115 116 # Optimal alignments. 117 sys.stdout.write("\nDetermining the iterative optimal alignments:\n") 118 Sc_prime = Sc 119 string_lists = [] 120 for i in range(N-1): 121 # Update the string lists. 122 string_lists.append([]) 123 124 # Perform the pairwise alignment between Sc' and Si, replacing all '-' with 'X'. 125 score, Sc_prime, Si_prime, gaps = align_pairwise(Sc_prime.replace('-', 'X'), Si[i].replace('-', 'X'), algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0) 126 sys.stdout.write("\n%-30s %s\n" % ("Sequence Sc':", Sc_prime.replace('X', '-'))) 127 sys.stdout.write("%-30s %s\n" % (("Sequence S%i':" % (i+1)), Si_prime.replace('X', '-'))) 128 129 # Store the Si alignment. 130 for j in range(len(Sc_prime)): 131 string_lists[i].append(Si_prime[j]) 132 133 # Add spaces to the lists for all previous alignments. 134 else: 135 # Find gaps in the central sequence. 136 for j in range(len(Sc_prime)): 137 if Sc_prime[j] == '-': 138 # Pad the previous alignments. 139 for k in range(0, i): 140 string_lists[k].insert(j, '-') 141 142 # Rebuild the alignment lists and create a gap matrix. 143 strings = [] 144 M = len(Sc_prime) 145 strings.append(Sc_prime) 146 for i in range(N-1): 147 strings.append(''.join(string_lists[i])) 148 for i in range(N): 149 strings[i] = strings[i].replace('X', '-') 150 151 # Restore the original sequence ordering. 152 string = strings.pop(0) 153 strings.insert(Sc_index, string) 154 155 # Create the gap matrix. 156 gaps = zeros((N, M), int16) 157 for i in range(N): 158 for j in range(M): 159 if strings[i][j] == '-': 160 gaps[i, j] = 1 161 162 # Final printout. 163 sys.stdout.write("\nFinal MSA:\n") 164 for i in range(N): 165 sys.stdout.write("%3i %s\n" % (i+1, strings[i])) 166 167 # Return the results. 168 return strings, gaps

169 170

171 -def msa_general(sequences, residue_numbers=None, msa_algorithm='Central Star', pairwise_algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0):

172 """General interface for multiple sequence alignments (MSA). 173 174 This can be used to select between the following MSA algorithms: 175 176 - 'Central Star', to use the central_star() function. 177 - 'residue number', to use the msa_residue_numbers() function. 178 179 180 @param sequences: The list of residue sequences as one letter codes. 181 @type sequences: list of str 182 @keyword residue_numbers: The list of residue numbers for each sequence. 183 @type residue_numbers: list of list of int 184 @keyword msa_algorithm: The multiple sequence alignment (MSA) algorithm to use. 185 @type msa_algorithm: str 186 @keyword pairwise_algorithm: The pairwise sequence alignment algorithm to use. 187 @type pairwise_algorithm: str 188 @keyword matrix: The substitution matrix to use. 189 @type matrix: str 190 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number. 191 @type gap_open_penalty: float 192 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number. 193 @type gap_extend_penalty: float 194 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence. 195 @type end_gap_open_penalty: float 196 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence. 197 @type end_gap_extend_penalty: float 198 @return: The list of alignment strings and the gap matrix. 199 @rtype: list of str, numpy rank-2 int array 200 """ 201 202 # Check the MSA algorithm. 203 allowed_msa = ['Central Star', 'residue number'] 204 if msa_algorithm not in allowed_msa: 205 raise RelaxError("The MSA algorithm must be one of %s." % allowed_msa) 206 207 # Check the penalty arguments. 208 if gap_open_penalty != None: 209 if gap_open_penalty < 0.0: 210 raise RelaxError("The gap opening penalty %s must be a positive number." % gap_open_penalty) 211 if gap_extend_penalty != None: 212 if gap_extend_penalty < 0.0: 213 raise RelaxError("The gap extension penalty %s must be a positive number." % gap_extend_penalty) 214 if end_gap_open_penalty != None: 215 if end_gap_open_penalty < 0.0: 216 raise RelaxError("The end gap opening penalty %s must be a positive number." % end_gap_open_penalty) 217 if end_gap_extend_penalty != None: 218 if end_gap_extend_penalty < 0.0: 219 raise RelaxError("The end gap extension penalty %s must be a positive number." % end_gap_extend_penalty) 220 221 # Use the central star multiple alignment algorithm. 222 if msa_algorithm == 'Central Star': 223 strings, gaps = central_star(sequences, algorithm=pairwise_algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) 224 225 # Alignment by residue number. 226 elif msa_algorithm == 'residue number': 227 strings, gaps = msa_residue_numbers(sequences, residue_numbers=residue_numbers) 228 229 # Return the alignment strings and gap matrix. 230 return strings, gaps

231 232

233 -def msa_residue_numbers(sequences, residue_numbers=None):

234 """Align multiple sequences based on the residue numbering. 235 236 @param sequences: The list of residue sequences as one letter codes. 237 @type sequences: list of str 238 @keyword residue_numbers: The list of residue numbers for each sequence. 239 @type residue_numbers: list of list of int 240 @return: The list of alignment strings and the gap matrix. 241 @rtype: list of str, numpy rank-2 int array 242 """ 243 244 # Initialise. 245 N = len(sequences) 246 strings = [] 247 for i in range(N): 248 strings.append('') 249 250 # Printout. 251 sys.stdout.write("\nResidue number based multiple sequence alignment.\n\n") 252 sys.stdout.write("Initial sequences:\n") 253 for i in range(N): 254 sys.stdout.write("%3i %s\n" % (i+1, sequences[i])) 255 256 # The maximum and minimum residue numbers. 257 res_min = 1e100 258 res_max = -1e100 259 for i in range(N): 260 if min(residue_numbers[i]) < res_min: 261 res_min = min(residue_numbers[i]) 262 if max(residue_numbers[i]) > res_max: 263 res_max = max(residue_numbers[i]) 264 265 # The total number of residues. 266 M = res_max - res_min + 1 267 268 # Loop over the molecules and residues and determine if the residue is present. 269 for i in range(N): 270 for res_num in range(res_min, res_max+1): 271 # The residue is present. 272 if res_num in residue_numbers[i]: 273 index = residue_numbers[i].index(res_num) 274 strings[i] += sequences[i][index] 275 276 # A gap. 277 else: 278 strings[i] += '-' 279 280 # Create the gap matrix. 281 gaps = zeros((N, M), int16) 282 for i in range(N): 283 for j in range(M): 284 if strings[i][j] == '-': 285 gaps[i, j] = 1 286 287 # Final printout. 288 sys.stdout.write("\nFinal MSA:\n") 289 for i in range(N): 290 sys.stdout.write("%3i %s\n" % (i+1, strings[i])) 291 292 # Return the results. 293 return strings, gaps

294 295

296 -def msa_residue_skipping(strings=None, gaps=None):

297 """Create the residue skipping data structure. 298 299 @keyword strings: The list of alignment strings. 300 @type strings: list of str 301 @keyword gaps: The gap matrix. 302 @type gaps: numpy rank-2 int array 303 @return: The residue skipping data structure. The first dimension is the molecule and the second is the residue. As opposed to zero, a value of one means the residue should skipped. 304 @rtype: list of lists of int 305 # 306 """ 307 308 # initialise. 309 skip = [] 310 num_mols = len(strings) 311 312 # Loop over each molecule. 313 for mol_index in range(num_mols): 314 skip.append([]) 315 for i in range(len(strings[0])): 316 # Create the empty residue skipping data structure. 317 if strings == None: 318 skip[mol_index].append(0) 319 continue 320 321 # No residue in the current sequence. 322 if gaps[mol_index][i]: 323 continue 324 325 # A gap in one of the other sequences. 326 gap = False 327 for mol_index2 in range(num_mols): 328 if gaps[mol_index2][i]: 329 gap = True 330 331 # Skip the residue. 332 if gap: 333 skip[mol_index].append(1) 334 else: 335 skip[mol_index].append(0) 336 337 # Return the data structure. 338 return skip

339

Source Code for Module lib.sequence_alignment.msa