-
Notifications
You must be signed in to change notification settings - Fork 1
/
20080523a.py
203 lines (189 loc) · 7.2 KB
/
20080523a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""Get the maximum likelihood rate for each column of a nucleotide alignment.
"""
from StringIO import StringIO
from scipy import optimize
from SnippetUtil import HandlingError
import SnippetUtil
import Newick
import Fasta
import RateMatrix
import PhyLikelihood
import Stockholm
import Discretizer
import Form
import FormOut
def get_stockholm_string(tree, alignment, mle_rates):
"""
@param alignment: a nucleotide alignment
@param mle_rates: a mle rate for each column
@return: a string that represents a file in stockholm format
"""
# define the bins
max_categories = 10
discretizer = Discretizer.Discretizer(mle_rates, max_categories)
boundaries = discretizer.get_boundaries()
# determine to which bin each rate belongs
categorized_rates = []
for rate in mle_rates:
for i, (low, high) in enumerate(boundaries):
if low <= rate <= high:
categorized_rates.append(i)
continue
# define the annotation string
annotation_name = 'MLE_rate'
annotation_value = ''.join(str(x) for x in categorized_rates)
# define the stockholm object
stockholm = Stockholm.Stockholm()
stockholm.tree = tree
stockholm.alignment = alignment
# add comments and annotations to the stockholm object
stockholm.add_comment('MLE_rate ranges:')
for i, b in enumerate(boundaries):
stockholm.add_comment(str(i) + ': ' + str(b))
stockholm.add_column_annotation(annotation_name, annotation_value)
return str(stockholm)
class Objective:
"""
This is a function object that evaluates the likelihood of various rates.
"""
def __init__(self, tree, rate_matrix):
"""
@param tree: a phylogenetic tree with branch lengths and annotated tips
@param rate_matrix: a rate matrix used to calculate column likelihoods
"""
self.tree = tree
self.rate_matrix = rate_matrix
def __call__(self, rate):
"""
Return the negative likelihood of a column.
The negative likelihood is computed using
the tree, matrix, and rate.
@param rate: the rate of the rate matrix
@return: the negative likelihood of the column
"""
if not rate:
inf = float('inf')
neginf = float('-inf')
states = [tip.state for tip in self.tree.gen_tips()]
if len(set(states)) == 1:
likelihood = 1
else:
likelihood = 0
else:
self.rate_matrix.set_rate(rate)
likelihood = RateMatrix.get_likelihood(self.tree, self.rate_matrix)
return -likelihood
def get_mle_rates(tree, alignment, rate_matrix):
"""
@param tree: a tree with branch lengths
@param alignment: a nucleotide alignment
@param rate_matrix: a nucleotide rate matrix object
@return: a list giving the maximum likelihood rate for each column
"""
# define the objective function
objective_function = Objective(tree, rate_matrix)
# create the cache so each unique column is evaluated only once
column_to_mle_rate = {}
# look for maximum likelihood rates
mle_rates = []
for column in alignment.columns:
column_tuple = tuple(column)
if column_tuple in column_to_mle_rate:
mle_rate = column_to_mle_rate[column_tuple]
else:
if len(set(column)) == 1:
# If the column is homogeneous
# then we know that the mle rate is zero.
mle_rate = 0
else:
# redecorate the tree with nucleotide states at the tips
name_to_state = dict(zip(alignment.headers, column))
for tip in tree.gen_tips():
tip.state = name_to_state[tip.name]
# Get the maximum likelihood rate for the column
# using a golden section search.
# The bracket is just a suggestion.
bracket = (0.51, 2.01)
mle_rate = optimize.golden(
objective_function, brack=bracket)
column_to_mle_rate[column_tuple] = mle_rate
mle_rates.append(mle_rate)
return mle_rates
def get_form():
"""
@return: the body of a form
"""
# define the default tree string
tree_string = Newick.brown_example_tree
tree = Newick.parse(tree_string, Newick.NewickTree)
formatted_tree_string = Newick.get_narrow_newick_string(tree, 60)
# define the default alignment string
default_alignment_string = Fasta.brown_example_alignment.strip()
# define the default nucleotide distribution weights
default_nt_lines = [
'A : 1',
'C : 1',
'G : 1',
'T : 1']
# define the form objects
form_objects = [
Form.MultiLine('tree', 'newick tree', formatted_tree_string),
Form.MultiLine('alignment', 'nucleotide alignment',
default_alignment_string),
Form.MultiLine('weights', 'nucleotide weights',
'\n'.join(default_nt_lines)),
Form.Float('kappa', 'transition transversion rate ratio',
2, low_inclusive=0)]
return form_objects
def get_form_out():
return FormOut.Stockholm()
def get_response_content(fs):
# get the tree
tree = Newick.parse(fs.tree, Newick.NewickTree)
tree.assert_valid()
# get the nucleotide distribution
distribution = SnippetUtil.get_distribution(
fs.weights, 'nucleotide', list('ACGT'))
# get the nucleotide alignment
try:
alignment = Fasta.Alignment(StringIO(fs.alignment))
alignment.force_nucleotide()
except Fasta.AlignmentError as e:
raise HandlingError(e)
# get the rate matrix defined by the nucleotide distribution and kappa
row_major_rate_matrix = RateMatrix.get_unscaled_hky85_rate_matrix(
distribution, fs.kappa).get_row_major_rate_matrix()
rate_matrix = RateMatrix.FastRateMatrix(
row_major_rate_matrix, list('ACGT'))
rate_matrix.normalize()
# get the mle rates
mle_rates = get_mle_rates(tree, alignment, rate_matrix)
# return the response
return get_stockholm_string(tree, alignment, mle_rates) + '\n'
def main():
# create the alignment object
print 'creating the alignment...'
alignment_string = Fasta.brown_example_alignment.strip()
alignment = Fasta.Alignment(StringIO(alignment_string))
# create a tree object
print 'creating the tree...'
tree_string = Newick.brown_example_tree
tree = Newick.parse(tree_string, Newick.NewickTree)
# create a rate matrix object
print 'creating the rate matrix object...'
distribution = {'A': .25, 'C': .25, 'G': .25, 'T': .25}
kappa = 2.0
row_major_rate_matrix = RateMatrix.get_unscaled_hky85_rate_matrix(
distribution, kappa).get_row_major_rate_matrix()
rate_matrix = RateMatrix.FastRateMatrix(
row_major_rate_matrix, list('ACGT'))
rate_matrix.normalize()
# get the mle_rates
print 'getting the mle rates...'
mle_rates = get_mle_rates(tree, alignment, rate_matrix)
print 'mle rates:'
print mle_rates
print 'stockholm string:'
print get_stockholm_string(tree, alignment, mle_rates)
if __name__ == '__main__':
main()