1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """Module for selecting the best model."""
24
25
26 from math import log
27 import sys
28
29
30 import generic_fns.pipes
31 from generic_fns.pipes import get_type, has_pipe, pipe_names, switch
32 from relax_errors import RelaxError, RelaxPipeError
33 from relax_io import write_data
34 from specific_fns.setup import get_specific_fn
35
36
38 """Akaike's Information Criteria (AIC).
39
40 The formula is::
41
42 AIC = chi2 + 2k
43
44
45 @param chi2: The minimised chi-squared value.
46 @type chi2: float
47 @param k: The number of parameters in the model.
48 @type k: int
49 @param n: The dimension of the relaxation data set.
50 @type n: int
51 @return: The AIC value.
52 @rtype: float
53 """
54
55 return chi2 + 2.0*k
56
57
58 -def aicc(chi2, k, n):
59 """Small sample size corrected AIC.
60
61 The formula is::
62
63 2k(k + 1)
64 AICc = chi2 + 2k + ---------
65 n - k - 1
66
67
68 @param chi2: The minimised chi-squared value.
69 @type chi2: float
70 @param k: The number of parameters in the model.
71 @type k: int
72 @param n: The dimension of the relaxation data set.
73 @type n: int
74 @return: The AIC value.
75 @rtype: float
76 """
77
78 if n > (k+1):
79 return chi2 + 2.0*k + 2.0*k*(k + 1.0) / (n - k - 1.0)
80 elif n == (k+1):
81 raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s. This situation causes a fatal division by zero as:\n AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))
82 elif n < (k+1):
83 raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s. This situation produces a negative, and hence nonsense, AICc score as:\n AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))
84
85
87 """Bayesian or Schwarz Information Criteria.
88
89 The formula is::
90
91 BIC = chi2 + k ln n
92
93
94 @param chi2: The minimised chi-squared value.
95 @type chi2: float
96 @param k: The number of parameters in the model.
97 @type k: int
98 @param n: The dimension of the relaxation data set.
99 @type n: int
100 @return: The AIC value.
101 @rtype: float
102 """
103
104 return chi2 + k * log(n)
105
106
107 -def select(method=None, modsel_pipe=None, bundle=None, pipes=None):
108 """Model selection function.
109
110 @keyword method: The model selection method. This can currently be one of:
111 - 'AIC', Akaike's Information Criteria.
112 - 'AICc', Small sample size corrected AIC.
113 - 'BIC', Bayesian or Schwarz Information Criteria.
114 - 'CV', Single-item-out cross-validation.
115 None of the other model selection techniques are currently supported.
116 @type method: str
117 @keyword modsel_pipe: The name of the new data pipe to be created by copying of the selected data pipe.
118 @type modsel_pipe: str
119 @keyword bundle: The optional data pipe bundle to associate the newly created pipe with.
120 @type bundle: str or None
121 @keyword pipes: A list of the data pipes to use in the model selection.
122 @type pipes: list of str
123 """
124
125
126 if has_pipe(modsel_pipe):
127 raise RelaxPipeError(modsel_pipe)
128
129
130 if pipes == None:
131
132 pipes = pipe_names()
133
134
135 if method == 'AIC':
136 print("AIC model selection.")
137 formula = aic
138 elif method == 'AICc':
139 print("AICc model selection.")
140 formula = aicc
141 elif method == 'BIC':
142 print("BIC model selection.")
143 formula = bic
144 elif method == 'CV':
145 print("CV model selection.")
146 raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.")
147 else:
148 raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.")
149
150
151 if len(pipes) == 0:
152 raise RelaxError("No data pipes are available for use in model selection.")
153
154
155 function_type = {}
156 model_loop = {}
157 model_type = {}
158 duplicate_data = {}
159 model_statistics = {}
160 skip_function = {}
161 modsel_pipe_exists = False
162
163
164 if isinstance(pipes[0], list):
165
166 if len(pipes[0]) == 0:
167 raise RelaxError("No pipes are available for use in model selection in the array " + repr(pipes[0]) + ".")
168
169
170 for i in range(len(pipes)):
171 for j in range(len(pipes[i])):
172
173 model_loop[pipes[i][j]] = get_specific_fn('model_loop', get_type(pipes[i][j]))
174 model_type[pipes[i][j]] = get_specific_fn('model_type', get_type(pipes[i][j]))
175 duplicate_data[pipes[i][j]] = get_specific_fn('duplicate_data', get_type(pipes[i][j]))
176 model_statistics[pipes[i][j]] = get_specific_fn('model_stats', get_type(pipes[i][j]))
177 skip_function[pipes[i][j]] = get_specific_fn('skip_function', get_type(pipes[i][j]))
178
179
180 for i in range(len(pipes)):
181 for j in range(len(pipes[i])):
182 if model_loop[pipes[0][j]] != model_loop[pipes[i][j]]:
183 raise RelaxError("The models for each data pipes should be the same.")
184 model_loop = model_loop[pipes[0][0]]
185
186
187 model_desc = get_specific_fn('model_desc', get_type(pipes[0]))
188
189
190 global_flag = False
191 for i in range(len(pipes)):
192 for j in range(len(pipes[i])):
193 if model_type[pipes[i][j]]() == 'global':
194 global_flag = True
195
196
197 else:
198
199 for i in range(len(pipes)):
200
201 model_loop[pipes[i]] = get_specific_fn('model_loop', get_type(pipes[i]))
202 model_type[pipes[i]] = get_specific_fn('model_type', get_type(pipes[i]))
203 duplicate_data[pipes[i]] = get_specific_fn('duplicate_data', get_type(pipes[i]))
204 model_statistics[pipes[i]] = get_specific_fn('model_stats', get_type(pipes[i]))
205 skip_function[pipes[i]] = get_specific_fn('skip_function', get_type(pipes[i]))
206
207 model_loop = model_loop[pipes[0]]
208
209
210 model_desc = get_specific_fn('model_desc', get_type(pipes[0]))
211
212
213 global_flag = False
214 for j in range(len(pipes)):
215 if model_type[pipes[j]]() == 'global':
216 global_flag = True
217
218
219
220 for model_info in model_loop():
221
222 print("\n")
223 desc = model_desc(model_info)
224 if desc:
225 print(desc)
226
227
228 best_model = None
229 best_crit = 1e300
230 data = []
231
232
233 for j in range(len(pipes)):
234
235 if method == 'CV':
236
237 sum_crit = 0.0
238
239
240 for k in range(len(pipes[j])):
241
242 pipe = pipes[j][k]
243
244
245 switch(pipe)
246
247
248 if skip_function[pipe](model_info):
249 continue
250
251
252 k, n, chi2 = model_statistics[pipe](model_info)
253
254
255 if k == None or n == None or chi2 == None:
256 continue
257
258
259 sum_crit = sum_crit + chi2
260
261
262 crit = sum_crit / float(len(pipes[j]))
263
264
265 else:
266
267 pipe = pipes[j]
268
269
270 switch(pipe)
271
272
273 if skip_function[pipe](model_info):
274 continue
275
276
277 k, n, chi2 = model_statistics[pipe](model_info, global_stats=global_flag)
278
279
280 if k == None or n == None or chi2 == None:
281 continue
282
283
284 crit = formula(chi2, float(k), float(n))
285
286
287 data.append([pipe, repr(k), repr(n), "%.5f" % chi2, "%.5f" % crit])
288
289
290 if crit < best_crit:
291 best_model = pipe
292 best_crit = crit
293
294
295 write_data(out=sys.stdout, headings=["Data pipe", "Num_params_(k)", "Num_data_sets_(n)", "Chi2", "Criterion"], data=data)
296
297
298 if best_model != None:
299
300 print("The model from the data pipe " + repr(best_model) + " has been selected.")
301
302
303 switch(best_model)
304
305
306 duplicate_data[best_model](best_model, modsel_pipe, model_info, global_stats=global_flag, verbose=False)
307
308
309 modsel_pipe_exists = True
310
311
312 else:
313
314 print("No model has been selected.")
315
316
317 if modsel_pipe_exists:
318 switch(modsel_pipe)
319
320
321 if bundle:
322 generic_fns.pipes.bundle(bundle=bundle, pipe=modsel_pipe)
323