1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """Multiple sequence alignment (MSA) algorithms."""
24
25
26 from numpy import float64, int16, zeros
27 import sys
28
29
30 from lib.errors import RelaxError
31 from lib.sequence_alignment.align_protein import align_pairwise
32
33
34 -def central_star(sequences, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0):
35 """Align multiple protein sequences to one reference by fusing multiple pairwise alignments.
36
37 @param sequences: The list of residue sequences as one letter codes.
38 @type sequences: list of str
39 @keyword algorithm: The pairwise sequence alignment algorithm to use.
40 @type algorithm: str
41 @keyword matrix: The substitution matrix to use.
42 @type matrix: str
43 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number.
44 @type gap_open_penalty: float
45 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number.
46 @type gap_extend_penalty: float
47 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence.
48 @type end_gap_open_penalty: float
49 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence.
50 @type end_gap_extend_penalty: float
51 @return: The list of alignment strings and the gap matrix.
52 @rtype: list of str, numpy rank-2 int array
53 """
54
55
56 N = len(sequences)
57 scores = zeros((N, N), float64)
58
59
60 align1_matrix = []
61 align2_matrix = []
62 for i in range(N):
63 align1_matrix.append([])
64 align2_matrix.append([])
65 for j in range(N):
66 if i == j:
67 align1_matrix[i].append(sequences[i])
68 align2_matrix[i].append(sequences[i])
69 else:
70 align1_matrix[i].append(None)
71 align2_matrix[i].append(None)
72
73
74 sys.stdout.write("\nCentral Star multiple sequence alignment.\n\n")
75 sys.stdout.write("%-30s %s\n" % ("Pairwise algorithm:", algorithm))
76 sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix))
77 sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty))
78 sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty))
79 sys.stdout.write("Initial sequences:\n")
80 for i in range(N):
81 sys.stdout.write("%3i %s\n" % (i+1, sequences[i]))
82
83
84 sys.stdout.write("\nDetermining the scores for all pairwise alignments:\n")
85 for i in range(N):
86 for j in range(i+1, N):
87
88 sys.stdout.write("%-30s " % ("Sequences %i-%i:" % (i+1, j+1)))
89 score, align1, align2, gaps = align_pairwise(sequences[i], sequences[j], algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0)
90 sys.stdout.write("%10.1f\n" % score)
91
92
93 scores[i, j] = scores[j, i] = score
94 align1_matrix[i][j] = align1_matrix[j][i] = align1
95 align2_matrix[i][j] = align2_matrix[j][i] = align2
96
97
98 sys.stdout.write("\nDetermining the central sequence:\n")
99 sum_scores = scores.sum(0)
100 Sc_sum_score = 1e100
101 Sc_index = 0
102 for i in range(N):
103 if sum_scores[i] < Sc_sum_score:
104 Sc_sum_score = sum_scores[i]
105 Sc_index = i
106 sys.stdout.write("%-30s %10.1f\n" % (("Sum of scores, sequence %i:" % (i+1)), sum_scores[i]))
107 sys.stdout.write("%-30s %i\n" % ("Central sequence:", Sc_index+1))
108
109
110 Sc = sequences[Sc_index]
111 Si = []
112 for i in range(N):
113 if i != Sc_index:
114 Si.append(sequences[i])
115
116
117 sys.stdout.write("\nDetermining the iterative optimal alignments:\n")
118 Sc_prime = Sc
119 string_lists = []
120 for i in range(N-1):
121
122 string_lists.append([])
123
124
125 score, Sc_prime, Si_prime, gaps = align_pairwise(Sc_prime.replace('-', 'X'), Si[i].replace('-', 'X'), algorithm=algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty, verbosity=0)
126 sys.stdout.write("\n%-30s %s\n" % ("Sequence Sc':", Sc_prime.replace('X', '-')))
127 sys.stdout.write("%-30s %s\n" % (("Sequence S%i':" % (i+1)), Si_prime.replace('X', '-')))
128
129
130 for j in range(len(Sc_prime)):
131 string_lists[i].append(Si_prime[j])
132
133
134 else:
135
136 for j in range(len(Sc_prime)):
137 if Sc_prime[j] == '-':
138
139 for k in range(0, i):
140 string_lists[k].insert(j, '-')
141
142
143 strings = []
144 M = len(Sc_prime)
145 strings.append(Sc_prime)
146 for i in range(N-1):
147 strings.append(''.join(string_lists[i]))
148 for i in range(N):
149 strings[i] = strings[i].replace('X', '-')
150
151
152 string = strings.pop(0)
153 strings.insert(Sc_index, string)
154
155
156 gaps = zeros((N, M), int16)
157 for i in range(N):
158 for j in range(M):
159 if strings[i][j] == '-':
160 gaps[i, j] = 1
161
162
163 sys.stdout.write("\nFinal MSA:\n")
164 for i in range(N):
165 sys.stdout.write("%3i %s\n" % (i+1, strings[i]))
166
167
168 return strings, gaps
169
170
171 -def msa_general(sequences, residue_numbers=None, msa_algorithm='Central Star', pairwise_algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0):
172 """General interface for multiple sequence alignments (MSA).
173
174 This can be used to select between the following MSA algorithms:
175
176 - 'Central Star', to use the central_star() function.
177 - 'residue number', to use the msa_residue_numbers() function.
178
179
180 @param sequences: The list of residue sequences as one letter codes.
181 @type sequences: list of str
182 @keyword residue_numbers: The list of residue numbers for each sequence.
183 @type residue_numbers: list of list of int
184 @keyword msa_algorithm: The multiple sequence alignment (MSA) algorithm to use.
185 @type msa_algorithm: str
186 @keyword pairwise_algorithm: The pairwise sequence alignment algorithm to use.
187 @type pairwise_algorithm: str
188 @keyword matrix: The substitution matrix to use.
189 @type matrix: str
190 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number.
191 @type gap_open_penalty: float
192 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number.
193 @type gap_extend_penalty: float
194 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence.
195 @type end_gap_open_penalty: float
196 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence.
197 @type end_gap_extend_penalty: float
198 @return: The list of alignment strings and the gap matrix.
199 @rtype: list of str, numpy rank-2 int array
200 """
201
202
203 allowed_msa = ['Central Star', 'residue number']
204 if msa_algorithm not in allowed_msa:
205 raise RelaxError("The MSA algorithm must be one of %s." % allowed_msa)
206
207
208 if gap_open_penalty != None:
209 if gap_open_penalty < 0.0:
210 raise RelaxError("The gap opening penalty %s must be a positive number." % gap_open_penalty)
211 if gap_extend_penalty != None:
212 if gap_extend_penalty < 0.0:
213 raise RelaxError("The gap extension penalty %s must be a positive number." % gap_extend_penalty)
214 if end_gap_open_penalty != None:
215 if end_gap_open_penalty < 0.0:
216 raise RelaxError("The end gap opening penalty %s must be a positive number." % end_gap_open_penalty)
217 if end_gap_extend_penalty != None:
218 if end_gap_extend_penalty < 0.0:
219 raise RelaxError("The end gap extension penalty %s must be a positive number." % end_gap_extend_penalty)
220
221
222 if msa_algorithm == 'Central Star':
223 strings, gaps = central_star(sequences, algorithm=pairwise_algorithm, matrix=matrix, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty)
224
225
226 elif msa_algorithm == 'residue number':
227 strings, gaps = msa_residue_numbers(sequences, residue_numbers=residue_numbers)
228
229
230 return strings, gaps
231
232
234 """Align multiple sequences based on the residue numbering.
235
236 @param sequences: The list of residue sequences as one letter codes.
237 @type sequences: list of str
238 @keyword residue_numbers: The list of residue numbers for each sequence.
239 @type residue_numbers: list of list of int
240 @return: The list of alignment strings and the gap matrix.
241 @rtype: list of str, numpy rank-2 int array
242 """
243
244
245 N = len(sequences)
246 strings = []
247 for i in range(N):
248 strings.append('')
249
250
251 sys.stdout.write("\nResidue number based multiple sequence alignment.\n\n")
252 sys.stdout.write("Initial sequences:\n")
253 for i in range(N):
254 sys.stdout.write("%3i %s\n" % (i+1, sequences[i]))
255
256
257 res_min = 1e100
258 res_max = -1e100
259 for i in range(N):
260 if min(residue_numbers[i]) < res_min:
261 res_min = min(residue_numbers[i])
262 if max(residue_numbers[i]) > res_max:
263 res_max = max(residue_numbers[i])
264
265
266 M = res_max - res_min + 1
267
268
269 for i in range(N):
270 for res_num in range(res_min, res_max+1):
271
272 if res_num in residue_numbers[i]:
273 index = residue_numbers[i].index(res_num)
274 strings[i] += sequences[i][index]
275
276
277 else:
278 strings[i] += '-'
279
280
281 gaps = zeros((N, M), int16)
282 for i in range(N):
283 for j in range(M):
284 if strings[i][j] == '-':
285 gaps[i, j] = 1
286
287
288 sys.stdout.write("\nFinal MSA:\n")
289 for i in range(N):
290 sys.stdout.write("%3i %s\n" % (i+1, strings[i]))
291
292
293 return strings, gaps
294
295
297 """Create the residue skipping data structure.
298
299 @keyword strings: The list of alignment strings.
300 @type strings: list of str
301 @keyword gaps: The gap matrix.
302 @type gaps: numpy rank-2 int array
303 @return: The residue skipping data structure. The first dimension is the molecule and the second is the residue. As opposed to zero, a value of one means the residue should skipped.
304 @rtype: list of lists of int
305 #
306 """
307
308
309 skip = []
310 num_mols = len(strings)
311
312
313 for mol_index in range(num_mols):
314 skip.append([])
315 for i in range(len(strings[0])):
316
317 if strings == None:
318 skip[mol_index].append(0)
319 continue
320
321
322 if gaps[mol_index][i]:
323 continue
324
325
326 gap = False
327 for mol_index2 in range(num_mols):
328 if gaps[mol_index2][i]:
329 gap = True
330
331
332 if gap:
333 skip[mol_index].append(1)
334 else:
335 skip[mol_index].append(0)
336
337
338 return skip
339