Package generic_fns :: Module model_selection
[hide private]
[frames] | no frames]

Source Code for Module generic_fns.model_selection

  1  ############################################################################### 
  2  #                                                                             # 
  3  # Copyright (C) 2003-2012 Edward d'Auvergne                                   # 
  4  #                                                                             # 
  5  # This file is part of the program relax (http://www.nmr-relax.com).          # 
  6  #                                                                             # 
  7  # This program is free software: you can redistribute it and/or modify        # 
  8  # it under the terms of the GNU General Public License as published by        # 
  9  # the Free Software Foundation, either version 3 of the License, or           # 
 10  # (at your option) any later version.                                         # 
 11  #                                                                             # 
 12  # This program is distributed in the hope that it will be useful,             # 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of              # 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               # 
 15  # GNU General Public License for more details.                                # 
 16  #                                                                             # 
 17  # You should have received a copy of the GNU General Public License           # 
 18  # along with this program.  If not, see <http://www.gnu.org/licenses/>.       # 
 19  #                                                                             # 
 20  ############################################################################### 
 21   
 22  # Module docstring. 
 23  """Module for selecting the best model.""" 
 24   
 25  # Python module imports. 
 26  from math import log 
 27  import sys 
 28   
 29  # relax module imports. 
 30  import generic_fns.pipes 
 31  from generic_fns.pipes import get_type, has_pipe, pipe_names, switch 
 32  from relax_errors import RelaxError, RelaxPipeError 
 33  from relax_io import write_data 
 34  from specific_fns.setup import get_specific_fn 
 35   
 36   
37 -def aic(chi2, k, n):
38 """Akaike's Information Criteria (AIC). 39 40 The formula is:: 41 42 AIC = chi2 + 2k 43 44 45 @param chi2: The minimised chi-squared value. 46 @type chi2: float 47 @param k: The number of parameters in the model. 48 @type k: int 49 @param n: The dimension of the relaxation data set. 50 @type n: int 51 @return: The AIC value. 52 @rtype: float 53 """ 54 55 return chi2 + 2.0*k
56 57
58 -def aicc(chi2, k, n):
59 """Small sample size corrected AIC. 60 61 The formula is:: 62 63 2k(k + 1) 64 AICc = chi2 + 2k + --------- 65 n - k - 1 66 67 68 @param chi2: The minimised chi-squared value. 69 @type chi2: float 70 @param k: The number of parameters in the model. 71 @type k: int 72 @param n: The dimension of the relaxation data set. 73 @type n: int 74 @return: The AIC value. 75 @rtype: float 76 """ 77 78 if n > (k+1): 79 return chi2 + 2.0*k + 2.0*k*(k + 1.0) / (n - k - 1.0) 80 elif n == (k+1): 81 raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s. This situation causes a fatal division by zero as:\n AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k)) 82 elif n < (k+1): 83 raise RelaxError("The size of the dataset, n=%s, is too small for this model of size k=%s. This situation produces a negative, and hence nonsense, AICc score as:\n AICc = chi2 + 2k + 2k*(k + 1) / (n - k - 1).\n\nPlease use AIC model selection instead." % (n, k))
84 85
86 -def bic(chi2, k, n):
87 """Bayesian or Schwarz Information Criteria. 88 89 The formula is:: 90 91 BIC = chi2 + k ln n 92 93 94 @param chi2: The minimised chi-squared value. 95 @type chi2: float 96 @param k: The number of parameters in the model. 97 @type k: int 98 @param n: The dimension of the relaxation data set. 99 @type n: int 100 @return: The AIC value. 101 @rtype: float 102 """ 103 104 return chi2 + k * log(n)
105 106
107 -def select(method=None, modsel_pipe=None, bundle=None, pipes=None):
108 """Model selection function. 109 110 @keyword method: The model selection method. This can currently be one of: 111 - 'AIC', Akaike's Information Criteria. 112 - 'AICc', Small sample size corrected AIC. 113 - 'BIC', Bayesian or Schwarz Information Criteria. 114 - 'CV', Single-item-out cross-validation. 115 None of the other model selection techniques are currently supported. 116 @type method: str 117 @keyword modsel_pipe: The name of the new data pipe to be created by copying of the selected data pipe. 118 @type modsel_pipe: str 119 @keyword bundle: The optional data pipe bundle to associate the newly created pipe with. 120 @type bundle: str or None 121 @keyword pipes: A list of the data pipes to use in the model selection. 122 @type pipes: list of str 123 """ 124 125 # Test if the pipe already exists. 126 if has_pipe(modsel_pipe): 127 raise RelaxPipeError(modsel_pipe) 128 129 # Use all pipes. 130 if pipes == None: 131 # Get all data pipe names from the relax data store. 132 pipes = pipe_names() 133 134 # Select the model selection technique. 135 if method == 'AIC': 136 print("AIC model selection.") 137 formula = aic 138 elif method == 'AICc': 139 print("AICc model selection.") 140 formula = aicc 141 elif method == 'BIC': 142 print("BIC model selection.") 143 formula = bic 144 elif method == 'CV': 145 print("CV model selection.") 146 raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.") 147 else: 148 raise RelaxError("The model selection technique " + repr(method) + " is not currently supported.") 149 150 # No pipes. 151 if len(pipes) == 0: 152 raise RelaxError("No data pipes are available for use in model selection.") 153 154 # Initialise. 155 function_type = {} 156 model_loop = {} 157 model_type = {} 158 duplicate_data = {} 159 model_statistics = {} 160 skip_function = {} 161 modsel_pipe_exists = False 162 163 # Cross validation setup. 164 if isinstance(pipes[0], list): 165 # No pipes. 166 if len(pipes[0]) == 0: 167 raise RelaxError("No pipes are available for use in model selection in the array " + repr(pipes[0]) + ".") 168 169 # Loop over the data pipes. 170 for i in range(len(pipes)): 171 for j in range(len(pipes[i])): 172 # Specific functions. 173 model_loop[pipes[i][j]] = get_specific_fn('model_loop', get_type(pipes[i][j])) 174 model_type[pipes[i][j]] = get_specific_fn('model_type', get_type(pipes[i][j])) 175 duplicate_data[pipes[i][j]] = get_specific_fn('duplicate_data', get_type(pipes[i][j])) 176 model_statistics[pipes[i][j]] = get_specific_fn('model_stats', get_type(pipes[i][j])) 177 skip_function[pipes[i][j]] = get_specific_fn('skip_function', get_type(pipes[i][j])) 178 179 # The model loop should be the same for all data pipes! 180 for i in range(len(pipes)): 181 for j in range(len(pipes[i])): 182 if model_loop[pipes[0][j]] != model_loop[pipes[i][j]]: 183 raise RelaxError("The models for each data pipes should be the same.") 184 model_loop = model_loop[pipes[0][0]] 185 186 # The model description. 187 model_desc = get_specific_fn('model_desc', get_type(pipes[0])) 188 189 # Global vs. local models. 190 global_flag = False 191 for i in range(len(pipes)): 192 for j in range(len(pipes[i])): 193 if model_type[pipes[i][j]]() == 'global': 194 global_flag = True 195 196 # All other model selection setup. 197 else: 198 # Loop over the data pipes. 199 for i in range(len(pipes)): 200 # Specific functions. 201 model_loop[pipes[i]] = get_specific_fn('model_loop', get_type(pipes[i])) 202 model_type[pipes[i]] = get_specific_fn('model_type', get_type(pipes[i])) 203 duplicate_data[pipes[i]] = get_specific_fn('duplicate_data', get_type(pipes[i])) 204 model_statistics[pipes[i]] = get_specific_fn('model_stats', get_type(pipes[i])) 205 skip_function[pipes[i]] = get_specific_fn('skip_function', get_type(pipes[i])) 206 207 model_loop = model_loop[pipes[0]] 208 209 # The model description. 210 model_desc = get_specific_fn('model_desc', get_type(pipes[0])) 211 212 # Global vs. local models. 213 global_flag = False 214 for j in range(len(pipes)): 215 if model_type[pipes[j]]() == 'global': 216 global_flag = True 217 218 219 # Loop over the base models. 220 for model_info in model_loop(): 221 # Print out. 222 print("\n") 223 desc = model_desc(model_info) 224 if desc: 225 print(desc) 226 227 # Initial model. 228 best_model = None 229 best_crit = 1e300 230 data = [] 231 232 # Loop over the pipes. 233 for j in range(len(pipes)): 234 # Single-item-out cross validation. 235 if method == 'CV': 236 # Sum of chi-squared values. 237 sum_crit = 0.0 238 239 # Loop over the validation samples and sum the chi-squared values. 240 for k in range(len(pipes[j])): 241 # Alias the data pipe name. 242 pipe = pipes[j][k] 243 244 # Switch to this pipe. 245 switch(pipe) 246 247 # Skip function. 248 if skip_function[pipe](model_info): 249 continue 250 251 # Get the model statistics. 252 k, n, chi2 = model_statistics[pipe](model_info) 253 254 # Missing data sets. 255 if k == None or n == None or chi2 == None: 256 continue 257 258 # Chi2 sum. 259 sum_crit = sum_crit + chi2 260 261 # Cross-validation criterion (average chi-squared value). 262 crit = sum_crit / float(len(pipes[j])) 263 264 # Other model selection methods. 265 else: 266 # Reassign the pipe. 267 pipe = pipes[j] 268 269 # Switch to this pipe. 270 switch(pipe) 271 272 # Skip function. 273 if skip_function[pipe](model_info): 274 continue 275 276 # Get the model statistics. 277 k, n, chi2 = model_statistics[pipe](model_info, global_stats=global_flag) 278 279 # Missing data sets. 280 if k == None or n == None or chi2 == None: 281 continue 282 283 # Calculate the criterion value. 284 crit = formula(chi2, float(k), float(n)) 285 286 # Store the values for a later printout. 287 data.append([pipe, repr(k), repr(n), "%.5f" % chi2, "%.5f" % crit]) 288 289 # Select model. 290 if crit < best_crit: 291 best_model = pipe 292 best_crit = crit 293 294 # Write out the table. 295 write_data(out=sys.stdout, headings=["Data pipe", "Num_params_(k)", "Num_data_sets_(n)", "Chi2", "Criterion"], data=data) 296 297 # Duplicate the data from the 'best_model' to the model selection data pipe. 298 if best_model != None: 299 # Print out of selected model. 300 print("The model from the data pipe " + repr(best_model) + " has been selected.") 301 302 # Switch to the selected data pipe. 303 switch(best_model) 304 305 # Duplicate. 306 duplicate_data[best_model](best_model, modsel_pipe, model_info, global_stats=global_flag, verbose=False) 307 308 # Model selection pipe now exists. 309 modsel_pipe_exists = True 310 311 # No model selected. 312 else: 313 # Print out of selected model. 314 print("No model has been selected.") 315 316 # Switch to the model selection pipe. 317 if modsel_pipe_exists: 318 switch(modsel_pipe) 319 320 # Bundle the data pipe. 321 if bundle: 322 generic_fns.pipes.bundle(bundle=bundle, pipe=modsel_pipe)
323