1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22   
 23  """Multiple sequence alignment (MSA) algorithms.""" 
 24   
 25   
 26  from numpy import float64, int16, zeros 
 27  import sys 
 28   
 29   
 30  from lib.errors import RelaxError 
 31  from lib.sequence_alignment.align_protein import align_pairwise 
 32   
 33   
 34 -def central_star(sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0): 
  35      """Align multiple protein sequences to one reference by fusing multiple pairwise alignments. 
 36   
 37      @param sequences:                   The list of residue sequences as one letter codes. 
 38      @type sequences:                    list of str 
 39      @keyword algorithm:                 The pairwise sequence alignment algorithm to use. 
 40      @type algorithm:                    str 
 41      @keyword matrix:                    The substitution matrix to use. 
 42      @type matrix:                       str 
 43      @keyword gap_open_penalty:          The penalty for introducing gaps, as a positive number. 
 44      @type gap_open_penalty:             float 
 45      @keyword gap_extend_penalty:        The penalty for extending a gap, as a positive number. 
 46      @type gap_extend_penalty:           float 
 47      @keyword end_gap_open_penalty:      The optional penalty for opening a gap at the end of a sequence. 
 48      @type end_gap_open_penalty:         float 
 49      @keyword end_gap_extend_penalty:    The optional penalty for extending a gap at the end of a sequence. 
 50      @type end_gap_extend_penalty:       float 
 51      @return:                            The list of alignment strings and the gap matrix. 
 52      @rtype:                             list of str, numpy rank-2 int array 
 53      """ 
 54   
 55       
 56      N = len(sequences) 
 57      scores = zeros((N, N), float64) 
 58   
 59       
 60      align1_matrix = [] 
 61      align2_matrix = [] 
 62      for i in range(N): 
 63          align1_matrix.append([]) 
 64          align2_matrix.append([]) 
 65          for j in range(N): 
 66              if i == j: 
 67                  align1_matrix[i].append(sequences[i]) 
 68                  align2_matrix[i].append(sequences[i]) 
 69              else: 
 70                  align1_matrix[i].append(None) 
 71                  align2_matrix[i].append(None) 
 72   
 73       
 74      sys.stdout.write("\nCentral Star multiple sequence alignment.\n\n") 
 75      sys.stdout.write("%-30s %s\n" % ("Pairwise algorithm:", algorithm)) 
 76      sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix)) 
 77      sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty)) 
 78      sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty)) 
 79      sys.stdout.write("Initial sequences:\n") 
 80      for i in range(N): 
 81          sys.stdout.write("%3i %s\n" % (i+1, sequences[i])) 
 82   
 83       
 84      sys.stdout.write("\nDetermining the scores for all pairwise alignments:\n") 
 85      for i in range(N): 
 86          for j in range(i+1, N): 
 87               
 88              sys.stdout.write("%-30s " % ("Sequences %i-%i:" % (i+1, j+1))) 
 89              score, align1, align2, gaps = align_pairwise(sequences[i], sequences[j], algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0) 
 90              sys.stdout.write("%10.1f\n" % score) 
 91   
 92               
 93              scores[i, j] = scores[j, i] = score 
 94              align1_matrix[i][j] = align1_matrix[j][i] = align1 
 95              align2_matrix[i][j] = align2_matrix[j][i] = align2 
 96   
 97       
 98      sys.stdout.write("\nDetermining the central sequence:\n") 
 99      sum_scores = scores.sum(0) 
100      Sc_sum_score = 1e100 
101      Sc_index = 0 
102      for i in range(N): 
103          if sum_scores[i] < Sc_sum_score: 
104              Sc_sum_score = sum_scores[i] 
105              Sc_index = i 
106          sys.stdout.write("%-30s %10.1f\n" % (("Sum of scores, sequence %i:" % (i+1)), sum_scores[i])) 
107      sys.stdout.write("%-30s %i\n" % ("Central sequence:", Sc_index+1)) 
108   
109       
110      Sc = sequences[Sc_index] 
111      Si = [] 
112      for i in range(N): 
113          if i != Sc_index: 
114              Si.append(sequences[i]) 
115   
116       
117      sys.stdout.write("\nDetermining the iterative optimal alignments:\n") 
118      Sc_prime = Sc 
119      string_lists = [] 
120      for i in range(N-1): 
121           
122          string_lists.append([]) 
123   
124           
125          score, Sc_prime, Si_prime, gaps = align_pairwise(Sc_prime.replace('-', 'X'), Si[i].replace('-', 'X'), algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0) 
126          sys.stdout.write("\n%-30s %s\n" % ("Sequence Sc':", Sc_prime.replace('X', '-'))) 
127          sys.stdout.write("%-30s %s\n" % (("Sequence S%i':" % (i+1)), Si_prime.replace('X', '-'))) 
128   
129           
130          for j in range(len(Sc_prime)): 
131              string_lists[i].append(Si_prime[j]) 
132   
133           
134          else: 
135               
136              for j in range(len(Sc_prime)): 
137                  if Sc_prime[j] == '-': 
138                       
139                      for k in range(0, i): 
140                          string_lists[k].insert(j, '-') 
141   
142       
143      strings = [] 
144      M = len(Sc_prime) 
145      strings.append(Sc_prime) 
146      for i in range(N-1): 
147          strings.append(''.join(string_lists[i])) 
148      for i in range(N): 
149          strings[i] = strings[i].replace('X', '-') 
150   
151       
152      string = strings.pop(0) 
153      strings.insert(Sc_index, string) 
154   
155       
156      gaps = zeros((N, M), int16) 
157      for i in range(N): 
158          for j in range(M): 
159              if strings[i][j] == '-': 
160                  gaps[i, j] = 1 
161   
162       
163      sys.stdout.write("\nFinal MSA:\n") 
164      for i in range(N): 
165          sys.stdout.write("%3i %s\n" % (i+1, strings[i])) 
166   
167       
168      return strings, gaps 
 169   
170   
171 -def msa_general(sequences, residue_numbers=None, msa_algorithm='Central Star', pairwise_algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0): 
 172      """General interface for multiple sequence alignments (MSA). 
173   
174      This can be used to select between the following MSA algorithms: 
175   
176          - 'Central Star', to use the central_star() function. 
177          - 'residue number', to use the msa_residue_numbers() function. 
178   
179   
180      @param sequences:                   The list of residue sequences as one letter codes. 
181      @type sequences:                    list of str 
182      @keyword residue_numbers:           The list of residue numbers for each sequence. 
183      @type residue_numbers:              list of list of int 
184      @keyword msa_algorithm:             The multiple sequence alignment (MSA) algorithm to use. 
185      @type msa_algorithm:                str 
186      @keyword pairwise_algorithm:        The pairwise sequence alignment algorithm to use. 
187      @type pairwise_algorithm:           str 
188      @keyword matrix:                    The substitution matrix to use. 
189      @type matrix:                       str 
190      @keyword gap_open_penalty:          The penalty for introducing gaps, as a positive number. 
191      @type gap_open_penalty:             float 
192      @keyword gap_extend_penalty:        The penalty for extending a gap, as a positive number. 
193      @type gap_extend_penalty:           float 
194      @keyword end_gap_open_penalty:      The optional penalty for opening a gap at the end of a sequence. 
195      @type end_gap_open_penalty:         float 
196      @keyword end_gap_extend_penalty:    The optional penalty for extending a gap at the end of a sequence. 
197      @type end_gap_extend_penalty:       float 
198      @return:                            The list of alignment strings and the gap matrix. 
199      @rtype:                             list of str, numpy rank-2 int array 
200      """ 
201   
202       
203      allowed_msa = ['Central Star', 'residue number'] 
204      if msa_algorithm not in allowed_msa: 
205          raise RelaxError("The MSA algorithm must be one of %s." % allowed_msa) 
206   
207       
208      if gap_open_penalty != None: 
209          if gap_open_penalty < 0.0: 
210              raise RelaxError("The gap opening penalty %s must be a positive number." % gap_open_penalty) 
211      if gap_extend_penalty != None: 
212          if gap_extend_penalty < 0.0: 
213              raise RelaxError("The gap extension penalty %s must be a positive number." % gap_extend_penalty) 
214      if end_gap_open_penalty != None: 
215          if end_gap_open_penalty < 0.0: 
216              raise RelaxError("The end gap opening penalty %s must be a positive number." % end_gap_open_penalty) 
217      if end_gap_extend_penalty != None: 
218          if end_gap_extend_penalty < 0.0: 
219              raise RelaxError("The end gap extension penalty %s must be a positive number." % end_gap_extend_penalty) 
220   
221       
222      if msa_algorithm == 'Central Star': 
223          strings, gaps = central_star(sequences, algorithm=pairwise_algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty) 
224   
225       
226      elif msa_algorithm == 'residue number': 
227          strings, gaps = msa_residue_numbers(sequences, residue_numbers=residue_numbers) 
228   
229       
230      return strings, gaps 
 231   
232   
234      """Align multiple sequences based on the residue numbering. 
235   
236      @param sequences:                   The list of residue sequences as one letter codes. 
237      @type sequences:                    list of str 
238      @keyword residue_numbers:           The list of residue numbers for each sequence. 
239      @type residue_numbers:              list of list of int 
240      @return:                            The list of alignment strings and the gap matrix. 
241      @rtype:                             list of str, numpy rank-2 int array 
242      """ 
243   
244       
245      N = len(sequences) 
246      strings = [] 
247      for i in range(N): 
248          strings.append('') 
249   
250       
251      sys.stdout.write("\nResidue number based multiple sequence alignment.\n\n") 
252      sys.stdout.write("Initial sequences:\n") 
253      for i in range(N): 
254          sys.stdout.write("%3i %s\n" % (i+1, sequences[i])) 
255   
256       
257      res_min = 1e100 
258      res_max = -1e100 
259      for i in range(N): 
260          if min(residue_numbers[i]) < res_min: 
261              res_min = min(residue_numbers[i]) 
262          if max(residue_numbers[i]) > res_max: 
263              res_max = max(residue_numbers[i]) 
264   
265       
266      M = res_max - res_min + 1 
267   
268       
269      for i in range(N): 
270          for res_num in range(res_min, res_max+1): 
271               
272              if res_num in residue_numbers[i]: 
273                  index = residue_numbers[i].index(res_num) 
274                  strings[i] += sequences[i][index] 
275   
276               
277              else: 
278                  strings[i] += '-' 
279   
280       
281      gaps = zeros((N, M), int16) 
282      for i in range(N): 
283          for j in range(M): 
284              if strings[i][j] == '-': 
285                  gaps[i, j] = 1 
286   
287       
288      sys.stdout.write("\nFinal MSA:\n") 
289      for i in range(N): 
290          sys.stdout.write("%3i %s\n" % (i+1, strings[i])) 
291   
292       
293      return strings, gaps 
 294   
295   
297      """Create the residue skipping data structure.  
298   
299      @keyword strings:   The list of alignment strings. 
300      @type strings:      list of str 
301      @keyword gaps:      The gap matrix. 
302      @type gaps:         numpy rank-2 int array 
303      @return:            The residue skipping data structure.  The first dimension is the molecule and the second is the residue.  As opposed to zero, a value of one means the residue should skipped. 
304      @rtype:             list of lists of int 
305      #  
306      """ 
307   
308       
309      skip = [] 
310      num_mols = len(strings) 
311   
312       
313      for mol_index in range(num_mols): 
314          skip.append([]) 
315          for i in range(len(strings[0])): 
316               
317              if strings == None: 
318                  skip[mol_index].append(0) 
319                  continue 
320   
321               
322              if gaps[mol_index][i]: 
323                  continue 
324   
325               
326              gap = False 
327              for mol_index2 in range(num_mols): 
328                  if gaps[mol_index2][i]: 
329                      gap = True 
330   
331               
332              if gap: 
333                  skip[mol_index].append(1) 
334              else: 
335                  skip[mol_index].append(0) 
336   
337       
338      return skip 
 339