1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """Module for selecting the best model."""
24
25
26 from math import log
27
28
29 import generic_fns.pipes
30 from generic_fns.pipes import get_type, has_pipe, pipe_names, switch
31 from relax_errors import RelaxError, RelaxPipeError
32 from specific_fns.setup import get_specific_fn
33
34
36 """Akaike's Information Criteria (AIC).
37
38 The formula is::
39
40 AIC = chi2 + 2k
41
42
43 @param chi2: The minimised chi-squared value.
44 @type chi2: float
45 @param k: The number of parameters in the model.
46 @type k: int
47 @param n: The dimension of the relaxation data set.
48 @type n: int
49 @return: The AIC value.
50 @rtype: float
51 """
52
53 return chi2 + 2.0*k
54
55
56 -def aicc(chi2, k, n):
57 """Small sample size corrected AIC.
58
59 The formula is::
60
61 2k(k + 1)
62 AICc = chi2 + 2k + ---------
63 n - k - 1
64
65
66 @param chi2: The minimised chi-squared value.
67 @type chi2: float
68 @param k: The number of parameters in the model.
69 @type k: int
70 @param n: The dimension of the relaxation data set.
71 @type n: int
72 @return: The AIC value.
73 @rtype: float
74 """
75
76 if n > (k+1):
77 return chi2 + 2.0*k + 2.0*k*(k + 1.0) / (n - k - 1.0)
78 elif n == (k+1):
79 raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s. This situation causes a fatal division by zero as:\n AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))
80 elif n < (k+1):
81 raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s. This situation produces a negative, and hence nonsense, AICc score as:\n AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))
82
83
85 """Bayesian or Schwarz Information Criteria.
86
87 The formula is::
88
89 BIC = chi2 + k ln n
90
91
92 @param chi2: The minimised chi-squared value.
93 @type chi2: float
94 @param k: The number of parameters in the model.
95 @type k: int
96 @param n: The dimension of the relaxation data set.
97 @type n: int
98 @return: The AIC value.
99 @rtype: float
100 """
101
102 return chi2 + k * log(n)
103
104
105 -def select(method=None, modsel_pipe=None, bundle=None, pipes=None):
106 """Model selection function.
107
108 @keyword method: The model selection method. This can currently be one of:
109 - 'AIC', Akaike's Information Criteria.
110 - 'AICc', Small sample size corrected AIC.
111 - 'BIC', Bayesian or Schwarz Information Criteria.
112 - 'CV', Single-item-out cross-validation.
113 None of the other model selection techniques are currently supported.
114 @type method: str
115 @keyword modsel_pipe: The name of the new data pipe to be created by copying of the selected data pipe.
116 @type modsel_pipe: str
117 @keyword bundle: The optional data pipe bundle to associate the newly created pipe with.
118 @type bundle: str or None
119 @keyword pipes: A list of the data pipes to use in the model selection.
120 @type pipes: list of str
121 """
122
123
124 if has_pipe(modsel_pipe):
125 raise RelaxPipeError(modsel_pipe)
126
127
128 if pipes == None:
129
130 pipes = pipe_names()
131
132
133 if method == 'AIC':
134 print("AIC model selection.")
135 formula = aic
136 elif method == 'AICc':
137 print("AICc model selection.")
138 formula = aicc
139 elif method == 'BIC':
140 print("BIC model selection.")
141 formula = bic
142 elif method == 'CV':
143 print("CV model selection.")
144 raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.")
145 else:
146 raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.")
147
148
149 if len(pipes) == 0:
150 raise RelaxError("No data pipes are available for use in model selection.")
151
152
153 function_type = {}
154 model_loop = {}
155 model_type = {}
156 duplicate_data = {}
157 model_statistics = {}
158 skip_function = {}
159 modsel_pipe_exists = False
160
161
162 if isinstance(pipes[0], list):
163
164 if len(pipes[0]) == 0:
165 raise RelaxError("No pipes are available for use in model selection in the array " + repr(pipes[0]) + ".")
166
167
168 for i in range(len(pipes)):
169 for j in range(len(pipes[i])):
170
171 model_loop[pipes[i][j]] = get_specific_fn('model_loop', get_type(pipes[i][j]))
172 model_type[pipes[i][j]] = get_specific_fn('model_type', get_type(pipes[i][j]))
173 duplicate_data[pipes[i][j]] = get_specific_fn('duplicate_data', get_type(pipes[i][j]))
174 model_statistics[pipes[i][j]] = get_specific_fn('model_stats', get_type(pipes[i][j]))
175 skip_function[pipes[i][j]] = get_specific_fn('skip_function', get_type(pipes[i][j]))
176
177
178 for i in range(len(pipes)):
179 for j in range(len(pipes[i])):
180 if model_loop[pipes[0][j]] != model_loop[pipes[i][j]]:
181 raise RelaxError("The models for each data pipes should be the same.")
182 model_loop = model_loop[pipes[0][0]]
183
184
185 model_desc = get_specific_fn('model_desc', get_type(pipes[0]))
186
187
188 global_flag = False
189 for i in range(len(pipes)):
190 for j in range(len(pipes[i])):
191 if model_type[pipes[i][j]]() == 'global':
192 global_flag = True
193
194
195 else:
196
197 for i in range(len(pipes)):
198
199 model_loop[pipes[i]] = get_specific_fn('model_loop', get_type(pipes[i]))
200 model_type[pipes[i]] = get_specific_fn('model_type', get_type(pipes[i]))
201 duplicate_data[pipes[i]] = get_specific_fn('duplicate_data', get_type(pipes[i]))
202 model_statistics[pipes[i]] = get_specific_fn('model_stats', get_type(pipes[i]))
203 skip_function[pipes[i]] = get_specific_fn('skip_function', get_type(pipes[i]))
204
205 model_loop = model_loop[pipes[0]]
206
207
208 model_desc = get_specific_fn('model_desc', get_type(pipes[0]))
209
210
211 global_flag = False
212 for j in range(len(pipes)):
213 if model_type[pipes[j]]() == 'global':
214 global_flag = True
215
216
217
218 for model_info in model_loop():
219
220 print("\n" + model_desc(model_info))
221 print("%-20s %-20s %-20s %-20s %-20s" % ("Data pipe", "Num_params_(k)", "Num_data_sets_(n)", "Chi2", "Criterion"))
222
223
224 best_model = None
225 best_crit = 1e300
226
227
228 for j in range(len(pipes)):
229
230 if method == 'CV':
231
232 sum_crit = 0.0
233
234
235 for k in range(len(pipes[j])):
236
237 pipe = pipes[j][k]
238
239
240 switch(pipe)
241
242
243 if skip_function[pipe](model_info):
244 continue
245
246
247 k, n, chi2 = model_statistics[pipe](model_info)
248
249
250 if k == None or n == None or chi2 == None:
251 continue
252
253
254 sum_crit = sum_crit + chi2
255
256
257 crit = sum_crit / float(len(pipes[j]))
258
259
260 else:
261
262 pipe = pipes[j]
263
264
265 switch(pipe)
266
267
268 if skip_function[pipe](model_info):
269 continue
270
271
272 k, n, chi2 = model_statistics[pipe](model_info, global_stats=global_flag)
273
274
275 if k == None or n == None or chi2 == None:
276 continue
277
278
279 crit = formula(chi2, float(k), float(n))
280
281
282 print("%-20s %-20i %-20i %-20.5f %-20.5f" % (pipe, k, n, chi2, crit))
283
284
285 if crit < best_crit:
286 best_model = pipe
287 best_crit = crit
288
289
290 if best_model != None:
291
292 print("The model from the data pipe " + repr(best_model) + " has been selected.")
293
294
295 switch(best_model)
296
297
298 duplicate_data[best_model](best_model, modsel_pipe, model_info, global_stats=global_flag, verbose=False)
299
300
301 modsel_pipe_exists = True
302
303
304 else:
305
306 print("No model has been selected.")
307
308
309 if modsel_pipe_exists:
310 switch(modsel_pipe)
311
312
313 if bundle:
314 generic_fns.pipes.bundle(bundle=bundle, pipe=modsel_pipe)
315