1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """General sequence alignment functions."""
24
25
26 import sys
27
28
29 from lib.errors import RelaxError
30 from lib.sequence_alignment.needleman_wunsch import needleman_wunsch_align
31 from lib.sequence_alignment.substitution_matrices import BLOSUM62, BLOSUM62_SEQ, PAM250, PAM250_SEQ
32
33
34 -def align_pairwise(sequence1, sequence2, algorithm='NW70', matrix='BLOSUM62', gap_open_penalty=1.0, gap_extend_penalty=1.0, end_gap_open_penalty=0.0, end_gap_extend_penalty=0.0, verbosity=1):
35 """Align two protein sequences.
36
37 @param sequence1: The first protein sequence as one letter codes.
38 @type sequence1: str
39 @param sequence2: The second protein sequence as one letter codes.
40 @type sequence2: str
41 @keyword algorithm: The pairwise sequence alignment algorithm to use.
42 @type algorithm: str
43 @keyword matrix: The substitution matrix to use.
44 @type matrix: str
45 @keyword gap_open_penalty: The penalty for introducing gaps, as a positive number.
46 @type gap_open_penalty: float
47 @keyword gap_extend_penalty: The penalty for extending a gap, as a positive number.
48 @type gap_extend_penalty: float
49 @keyword end_gap_open_penalty: The optional penalty for opening a gap at the end of a sequence.
50 @type end_gap_open_penalty: float
51 @keyword end_gap_extend_penalty: The optional penalty for extending a gap at the end of a sequence.
52 @type end_gap_extend_penalty: float
53 @keyword verbosity: The level of verbosity. Setting this to zero silences all printouts.
54 @type verbosity: int
55 @return: The alignment score, two alignment strings and the gap matrix.
56 @rtype: float, str, str, numpy rank-2 int array
57 """
58
59
60 allowed_algor = ['NW70']
61 if algorithm not in allowed_algor:
62 raise RelaxError("The sequence alignment algorithm '%s' is unknown, it must be one of %s." % (algorithm, allowed_algor))
63 allowed_matrices = ['BLOSUM62', 'PAM250']
64 if matrix not in allowed_matrices:
65 raise RelaxError("The substitution matrix '%s' is unknown, it must be one of %s." % (matrix, allowed_matrices))
66
67
68 sequence1 = sequence1.upper()
69 sequence2 = sequence2.upper()
70
71
72 if verbosity:
73 sys.stdout.write("\nPairwise protein alignment.\n")
74 sys.stdout.write("%-30s %s\n" % ("Substitution matrix:", matrix))
75 sys.stdout.write("%-30s %s\n" % ("Gap opening penalty:", gap_open_penalty))
76 sys.stdout.write("%-30s %s\n" % ("Gap extend penalty:", gap_extend_penalty))
77 sys.stdout.write("\n%-30s %s\n" % ("Input sequence 1:", sequence1))
78 sys.stdout.write("%-30s %s\n" % ("Input sequence 2:", sequence2))
79
80
81 if matrix == 'BLOSUM62':
82 sub_matrix = BLOSUM62
83 sub_seq = BLOSUM62_SEQ
84 elif matrix == 'PAM250':
85 sub_matrix = PAM250
86 sub_seq = PAM250_SEQ
87
88
89 if algorithm == 'NW70':
90 score, align1, align2, gaps = needleman_wunsch_align(sequence1, sequence2, sub_matrix=sub_matrix, sub_seq=sub_seq, gap_open_penalty=gap_open_penalty, gap_extend_penalty=gap_extend_penalty, end_gap_open_penalty=end_gap_open_penalty, end_gap_extend_penalty=end_gap_extend_penalty)
91
92
93 if verbosity:
94 sys.stdout.write("\n%-30s %s\n" % ("Aligned sequence 1:", align1))
95 sys.stdout.write("%-30s %s\n" % ("Aligned sequence 2:", align2))
96 sys.stdout.write("%-30s " % "")
97 for i in range(len(align1)):
98 if align1[i] == align2[i]:
99 sys.stdout.write("*")
100 else:
101 sys.stdout.write(" ")
102 sys.stdout.write("\n\n")
103
104
105 return score, align1, align2, gaps
106