1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 """Module for selecting the best model."""
25
26
27 from math import log
28
29
30 import generic_fns.pipes
31 from generic_fns.pipes import get_type, has_pipe, pipe_names, switch
32 from relax_errors import RelaxError, RelaxPipeError
33 from specific_fns.setup import get_specific_fn
34
35
def aic(chi2, k, n):
    """Akaike's Information Criteria (AIC).

    The formula is::

        AIC = chi2 + 2k


    @param chi2:    The minimised chi-squared value.
    @type chi2:     float
    @param k:       The number of parameters in the model.
    @type k:        int
    @param n:       The dimension of the relaxation data set.
    @type n:        int
    @return:        The AIC value.
    @rtype:         float
    """

    # The n argument is unused by AIC; it is accepted only so that all
    # criteria functions (aic, aicc, bic) share an identical signature.
    return chi2 + 2.0*k
55
56
def aicc(chi2, k, n):
    """Small sample size corrected AIC.

    The formula is::

                           2k(k + 1)
        AICc = chi2 + 2k + ---------
                           n - k - 1


    @param chi2:    The minimised chi-squared value.
    @type chi2:     float
    @param k:       The number of parameters in the model.
    @type k:        int
    @param n:       The dimension of the relaxation data set.
    @type n:        int
    @return:        The AICc value.
    @rtype:         float
    @raises RelaxError: If n <= k + 1, where the correction term either
                        divides by zero or becomes negative.
    """

    # The correction term is only valid when the denominator n - k - 1 is
    # positive.
    if n > (k+1):
        return chi2 + 2.0*k + 2.0*k*(k + 1.0) / (n - k - 1.0)

    # Fatal division by zero.
    elif n == (k+1):
        raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s.  This situation causes a fatal division by zero as:\n    AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))

    # A negative, hence meaningless, score.
    elif n < (k+1):
        raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s.  This situation produces a negative, and hence nonsense, AICc score as:\n    AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))
83
84
def bic(chi2, k, n):
    """Bayesian or Schwarz Information Criteria.

    The formula is::

        BIC = chi2 + k ln n


    @param chi2:    The minimised chi-squared value.
    @type chi2:     float
    @param k:       The number of parameters in the model.
    @type k:        int
    @param n:       The dimension of the relaxation data set.
    @type n:        int
    @return:        The BIC value.
    @rtype:         float
    """

    return chi2 + k * log(n)
104
105
def select(method=None, modsel_pipe=None, bundle=None, pipes=None):
    """Model selection function.

    @keyword method:        The model selection method.  This can currently be one of:
                                - 'AIC', Akaike's Information Criteria.
                                - 'AICc', Small sample size corrected AIC.
                                - 'BIC', Bayesian or Schwarz Information Criteria.
                                - 'CV', Single-item-out cross-validation.
                            None of the other model selection techniques are currently supported.
    @type method:           str
    @keyword modsel_pipe:   The name of the new data pipe to be created by copying of the selected data pipe.
    @type modsel_pipe:      str
    @keyword bundle:        The optional data pipe bundle to associate the newly created pipe with.
    @type bundle:           str or None
    @keyword pipes:         A list of the data pipes to use in the model selection.
    @type pipes:            list of str
    """

    # The new data pipe must not already exist.
    if has_pipe(modsel_pipe):
        raise RelaxPipeError(modsel_pipe)

    # Use all data pipes by default.
    if pipes is None:
        pipes = pipe_names()

    # Select the model selection criterion function.
    if method == 'AIC':
        print("AIC model selection.")
        formula = aic
    elif method == 'AICc':
        print("AICc model selection.")
        formula = aicc
    elif method == 'BIC':
        print("BIC model selection.")
        formula = bic
    elif method == 'CV':
        print("CV model selection.")
        # NOTE(review): CV is advertised in the docstring but has never been
        # wired up - this branch raises just like the catch-all below, which
        # makes the CV handling further down dead code.
        raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.")
    else:
        raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.")

    # No data pipes to select from.
    if len(pipes) == 0:
        raise RelaxError("No data pipes are available for use in model selection.")

    # Initialise the specific-function maps (keyed by pipe name) and state.
    model_loop = {}
    model_type = {}
    duplicate_data = {}
    model_statistics = {}
    skip_function = {}
    modsel_pipe_exists = False

    # Cross validation setup: 'pipes' is a list of lists of pipe names.
    if isinstance(pipes[0], list):
        # An empty array of pipes.
        if len(pipes[0]) == 0:
            raise RelaxError("No pipes are available for use in model selection in the array " + repr(pipes[0]) + ".")

        # Fetch the analysis-specific functions for every pipe.
        for i in range(len(pipes)):
            for j in range(len(pipes[i])):
                model_loop[pipes[i][j]] = get_specific_fn('model_loop', get_type(pipes[i][j]))
                model_type[pipes[i][j]] = get_specific_fn('model_type', get_type(pipes[i][j]))
                duplicate_data[pipes[i][j]] = get_specific_fn('duplicate_data', get_type(pipes[i][j]))
                model_statistics[pipes[i][j]] = get_specific_fn('model_stats', get_type(pipes[i][j]))
                skip_function[pipes[i][j]] = get_specific_fn('skip_function', get_type(pipes[i][j]))

        # All data pipes must share the same model loop.
        for i in range(len(pipes)):
            for j in range(len(pipes[i])):
                if model_loop[pipes[0][j]] != model_loop[pipes[i][j]]:
                    raise RelaxError("The models for each data pipes should be the same.")

        # Collapse the map down to the single shared model loop function.
        model_loop = model_loop[pipes[0][0]]

        # The model description function.
        # NOTE(review): pipes[0] is a list here, so get_type() receives a list
        # rather than a pipe name.  Unreachable in practice because the CV
        # branch raises above - verify if CV support is ever completed.
        model_desc = get_specific_fn('model_desc', get_type(pipes[0]))

        # Global flag: set if any pipe's model type is 'global'.
        global_flag = False
        for i in range(len(pipes)):
            for j in range(len(pipes[i])):
                if model_type[pipes[i][j]]() == 'global':
                    global_flag = True

    # Standard setup: 'pipes' is a flat list of pipe names.
    else:
        # Fetch the analysis-specific functions for every pipe.
        for i in range(len(pipes)):
            model_loop[pipes[i]] = get_specific_fn('model_loop', get_type(pipes[i]))
            model_type[pipes[i]] = get_specific_fn('model_type', get_type(pipes[i]))
            duplicate_data[pipes[i]] = get_specific_fn('duplicate_data', get_type(pipes[i]))
            model_statistics[pipes[i]] = get_specific_fn('model_stats', get_type(pipes[i]))
            skip_function[pipes[i]] = get_specific_fn('skip_function', get_type(pipes[i]))

        # Collapse the map down to the first pipe's model loop function.
        model_loop = model_loop[pipes[0]]

        # The model description function.
        model_desc = get_specific_fn('model_desc', get_type(pipes[0]))

        # Global flag: set if any pipe's model type is 'global'.
        global_flag = False
        for j in range(len(pipes)):
            if model_type[pipes[j]]() == 'global':
                global_flag = True

    # Loop over the models, selecting the best scoring pipe for each.
    for model_info in model_loop():
        # Header print out.
        print(("\n" + model_desc(model_info)))
        print(("%-20s %-20s %-20s %-20s %-20s" % ("Data pipe", "Num_params_(k)", "Num_data_sets_(n)", "Chi2", "Criterion")))

        # Start with no model selected and an effectively infinite criterion.
        best_model = None
        best_crit = 1e300

        # Loop over the pipes.
        for j in range(len(pipes)):
            # Single-item-out cross validation (dead code - CV raises above).
            if method == 'CV':
                # Sum of chi-squared values over the CV sub-pipes.
                sum_crit = 0.0

                # Loop over the sub-pipes (index renamed from 'k' to avoid
                # being shadowed by the statistics unpack below).
                for m in range(len(pipes[j])):
                    # The current data pipe.
                    pipe = pipes[j][m]

                    # Switch to this pipe.
                    switch(pipe)

                    # Skip pipes the specific analysis excludes.
                    if skip_function[pipe](model_info):
                        continue

                    # Get the model statistics.
                    k, n, chi2 = model_statistics[pipe](model_info)

                    # Missing statistics.
                    if k is None or n is None or chi2 is None:
                        continue

                    # Accumulate the chi-squared value.
                    sum_crit = sum_crit + chi2

                # The average chi-squared value is the criterion.
                crit = sum_crit / float(len(pipes[j]))

            # All other model selection techniques.
            else:
                # The current data pipe.
                pipe = pipes[j]

                # Switch to this pipe.
                switch(pipe)

                # Skip pipes the specific analysis excludes.
                if skip_function[pipe](model_info):
                    continue

                # Get the model statistics.
                k, n, chi2 = model_statistics[pipe](model_info, global_stats=global_flag)

                # Missing statistics.
                if k is None or n is None or chi2 is None:
                    continue

                # Calculate the criterion value.
                crit = formula(chi2, float(k), float(n))

            # Print out the statistics for this pipe.
            print(("%-20s %-20i %-20i %-20.5f %-20.5f" % (pipe, k, n, chi2, crit)))

            # Keep the best (lowest criterion) model.
            if crit < best_crit:
                best_model = pipe
                best_crit = crit

        # Duplicate the data from the selected pipe into the new pipe.
        if best_model is not None:
            print(("The model from the data pipe " + repr(best_model) + " has been selected."))

            # Switch to the selected data pipe.
            switch(best_model)

            # Copy the data into the model selection pipe.
            duplicate_data[best_model](best_model, modsel_pipe, model_info, global_stats=global_flag, verbose=False)

            # The new pipe now exists.
            modsel_pipe_exists = True

        else:
            print("No model has been selected.")

    # Switch to the newly created model selection pipe.
    if modsel_pipe_exists:
        switch(modsel_pipe)

    # Bundle the new pipe.
    if bundle:
        generic_fns.pipes.bundle(bundle=bundle, pipe=modsel_pipe)
316