1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 """Module for selecting the best model."""
25
26
27 from math import log
28
29
30 from pipes import get_type, has_pipe, pipe_names, switch
31 from relax_errors import RelaxError, RelaxPipeError
32 from specific_fns.setup import get_specific_fn
33
34
def aic(chi2, k, n):
    """Akaike's Information Criteria (AIC).

    The formula is::

        AIC = chi2 + 2k


    @param chi2:    The minimised chi-squared value.
    @type chi2:     float
    @param k:       The number of parameters in the model.
    @type k:        int
    @param n:       The dimension of the relaxation data set.
    @type n:        int
    @return:        The AIC value.
    @rtype:         float
    """

    # The n argument is unused by plain AIC; it is accepted only so that aic(),
    # aicc(), and bic() share a single call signature (see select()).
    return chi2 + 2.0*k
54
55
def aicc(chi2, k, n):
    """Small sample size corrected AIC.

    The formula is::

                           2k(k + 1)
        AICc = chi2 + 2k + ---------
                           n - k - 1


    @param chi2:    The minimised chi-squared value.
    @type chi2:     float
    @param k:       The number of parameters in the model.
    @type k:        int
    @param n:       The dimension of the relaxation data set.
    @type n:        int
    @return:        The AIC value.
    @rtype:         float
    @raise RelaxError:  If n <= k + 1, where the correction term is a division
                        by zero (n == k+1) or goes negative (n < k+1).
    """

    # The correction term is only valid when n > k + 1.
    if n > (k+1):
        return chi2 + 2.0*k + 2.0*k*(k + 1.0) / (n - k - 1.0)
    # n == k + 1 gives a division by zero in the correction term.
    elif n == (k+1):
        raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s.  This situation causes a fatal division by zero as:\n    AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))
    # n < k + 1 gives a negative, hence meaningless, correction term.
    elif n < (k+1):
        raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s.  This situation produces a negative, and hence nonsense, AICc score as:\n    AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))
82
83
def bic(chi2, k, n):
    """Bayesian or Schwarz Information Criteria.

    The formula is::

        BIC = chi2 + k ln n


    @param chi2:    The minimised chi-squared value.
    @type chi2:     float
    @param k:       The number of parameters in the model.
    @type k:        int
    @param n:       The dimension of the relaxation data set.
    @type n:        int
    @return:        The BIC value.
    @rtype:         float
    """

    return chi2 + k * log(n)
103
104
def select(method=None, modsel_pipe=None, pipes=None):
    """Model selection function.

    @keyword method:        The model selection method.  This can currently be one of:
                                - 'AIC', Akaike's Information Criteria.
                                - 'AICc', Small sample size corrected AIC.
                                - 'BIC', Bayesian or Schwarz Information Criteria.
                                - 'CV', Single-item-out cross-validation.
                            None of the other model selection techniques are currently supported.
    @type method:           str
    @keyword modsel_pipe:   The name of the new data pipe to be created by copying of the selected
                            data pipe.
    @type modsel_pipe:      str
    @keyword pipes:         A list of the data pipes to use in the model selection.
    @type pipes:            list of str
    """

    # The destination pipe must not already exist.
    if has_pipe(modsel_pipe):
        raise RelaxPipeError(modsel_pipe)

    # Default to every data pipe in the data store.
    if pipes == None:
        pipes = pipe_names()

    # Map the method string onto the information criterion function.
    if method == 'AIC':
        print("AIC model selection.")
        formula = aic
    elif method == 'AICc':
        print("AICc model selection.")
        formula = aicc
    elif method == 'BIC':
        print("BIC model selection.")
        formula = bic
    elif method == 'CV':
        print("CV model selection.")
        # NOTE(review): CV is announced but then rejected - it is recognised yet
        # not currently implemented, so it falls through to the same error.
        raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.")
    else:
        raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.")

    # Model selection is impossible with nothing to select from.
    if len(pipes) == 0:
        raise RelaxError("No data pipes are available for use in model selection.")

    # Per-pipe specific function tables, keyed by pipe name.
    model_loop = {}
    model_type = {}
    duplicate_data = {}
    model_statistics = {}
    skip_function = {}
    modsel_pipe_exists = False

    # Cross-validation layout: pipes is a list of lists of pipe names.
    if isinstance(pipes[0], list):

        # An empty inner array cannot be processed.
        if len(pipes[0]) == 0:
            raise RelaxError("No pipes are available for use in model selection in the array " + repr(pipes[0]) + ".")

        # Fetch the specific functions for every pipe in every array.
        for i in xrange(len(pipes)):
            for j in xrange(len(pipes[i])):
                model_loop[pipes[i][j]] = get_specific_fn('model_loop', get_type(pipes[i][j]))
                model_type[pipes[i][j]] = get_specific_fn('model_type', get_type(pipes[i][j]))
                duplicate_data[pipes[i][j]] = get_specific_fn('duplicate_data', get_type(pipes[i][j]))
                model_statistics[pipes[i][j]] = get_specific_fn('model_stats', get_type(pipes[i][j]))
                skip_function[pipes[i][j]] = get_specific_fn('skip_function', get_type(pipes[i][j]))

        # All pipes must share the same model looping function, which then
        # replaces the dict so it can be called directly below.
        for i in xrange(len(pipes)):
            for j in xrange(len(pipes[i])):
                if model_loop[pipes[0][j]] != model_loop[pipes[i][j]]:
                    raise RelaxError("The models for each data pipes should be the same.")
        model_loop = model_loop[pipes[0][0]]

        # NOTE(review): pipes[0] is a list in this branch - confirm that
        # get_type() accepts it (the flat branch below passes a pipe name).
        model_desc = get_specific_fn('model_desc', get_type(pipes[0]))

        # Global statistics are needed if any pipe holds a global model.
        global_flag = False
        for i in xrange(len(pipes)):
            for j in xrange(len(pipes[i])):
                if model_type[pipes[i][j]]() == 'global':
                    global_flag = True

    # Standard layout: pipes is a flat list of pipe names.
    else:

        # Fetch the specific functions for every pipe.
        for i in xrange(len(pipes)):
            model_loop[pipes[i]] = get_specific_fn('model_loop', get_type(pipes[i]))
            model_type[pipes[i]] = get_specific_fn('model_type', get_type(pipes[i]))
            duplicate_data[pipes[i]] = get_specific_fn('duplicate_data', get_type(pipes[i]))
            model_statistics[pipes[i]] = get_specific_fn('model_stats', get_type(pipes[i]))
            skip_function[pipes[i]] = get_specific_fn('skip_function', get_type(pipes[i]))

        # Replace the dict with the first pipe's looping function.
        model_loop = model_loop[pipes[0]]

        # The model description function.
        model_desc = get_specific_fn('model_desc', get_type(pipes[0]))

        # Global statistics are needed if any pipe holds a global model.
        global_flag = False
        for j in xrange(len(pipes)):
            if model_type[pipes[j]]() == 'global':
                global_flag = True

    # Loop over every model and pick the pipe with the lowest criterion.
    for model_info in model_loop():

        # Print a header for this model.
        print(("\n" + model_desc(model_info)))
        print(("%-20s %-20s %-20s %-20s %-20s" % ("Data pipe", "Num_params_(k)", "Num_data_sets_(n)", "Chi2", "Criterion")))

        # Best pipe found so far (1e300 acts as +infinity).
        best_model = None
        best_crit = 1e300

        # Score every candidate pipe (or pipe array for CV).
        for j in xrange(len(pipes)):

            # Cross-validation: the criterion is the chi2 averaged over the
            # inner array of pipes.
            if method == 'CV':
                sum_crit = 0.0

                for m in xrange(len(pipes[j])):
                    pipe = pipes[j][m]

                    # Make this pipe current for the specific functions.
                    switch(pipe)

                    # Skip pipes without relevant data for this model.
                    if skip_function[pipe](model_info):
                        continue

                    # k, n, and chi2 deliberately survive the loop - the
                    # print below reports the last pipe's statistics.
                    k, n, chi2 = model_statistics[pipe](model_info)

                    # Missing statistics are silently skipped.
                    if k == None or n == None or chi2 == None:
                        continue

                    sum_crit = sum_crit + chi2

                # Average chi-squared value.
                crit = sum_crit / float(len(pipes[j]))

            # Information criteria: apply the selected formula to this pipe.
            else:
                pipe = pipes[j]

                # Make this pipe current for the specific functions.
                switch(pipe)

                # Skip pipes without relevant data for this model.
                if skip_function[pipe](model_info):
                    continue

                # Model statistics.
                k, n, chi2 = model_statistics[pipe](model_info, global_stats=global_flag)

                # Missing statistics are silently skipped.
                if k == None or n == None or chi2 == None:
                    continue

                # The criterion value (k and n are cast as the formulae are
                # written for floats).
                crit = formula(chi2, float(k), float(n))

            # Report this candidate.
            print(("%-20s %-20i %-20i %-20.5f %-20.5f" % (pipe, k, n, chi2, crit)))

            # Keep the lowest-criterion pipe.
            if crit < best_crit:
                best_model = pipe
                best_crit = crit

        # Duplicate the winning pipe's data into the model selection pipe.
        if best_model != None:
            print(("The model from the data pipe " + repr(best_model) + " has been selected."))

            # Switch to the selected pipe before duplicating.
            switch(best_model)

            duplicate_data[best_model](best_model, modsel_pipe, model_info, global_stats=global_flag, verbose=False)

            # The destination pipe now exists.
            modsel_pipe_exists = True

        else:
            print("No model has been selected.")

    # Leave the user on the newly created pipe, if any model was selected.
    if modsel_pipe_exists:
        switch(modsel_pipe)
310