generic_fns.model

28 - def __init__(self, relax):

29 """Class containing functions specific to model selection.""" 30 31 self.relax = relax

32 33

34 - def select(self, method=None, modsel_run=None, runs=None):

35 """Model selection function.""" 36 37 # Test if the model selection run exists. 38 if not modsel_run in self.relax.data.run_names: 39 raise RelaxNoRunError, modsel_run 40 41 # The runs argument. 42 if runs == None: 43 # Use the runs from 'self.relax.data.run_names'. 44 self.runs = deepcopy(self.relax.data.run_names) 45 46 # Remove the model selection run name if it is in the list. 47 if modsel_run in self.runs: 48 self.runs.remove(modsel_run) 49 else: 50 self.runs = runs 51 52 # Select the model selection technique. 53 if method == 'AIC': 54 print "AIC model selection." 55 self.formula = self.aic 56 elif method == 'AICc': 57 print "AICc model selection." 58 self.formula = self.aicc 59 elif method == 'BIC': 60 print "BIC model selection." 61 self.formula = self.bic 62 elif method == 'CV': 63 print "CV model selection." 64 raise RelaxError, "The model selection technique " + `method` + " is not currently supported." 65 else: 66 raise RelaxError, "The model selection technique " + `method` + " is not currently supported." 67 68 # No runs. 69 if len(self.runs) == 0: 70 raise RelaxError, "No runs are availible for use in model selection." 71 72 # Initialise. 73 self.first_run = None 74 self.function_type = {} 75 self.duplicate_data = {} 76 self.count_num_instances = {} 77 self.model_statistics = {} 78 self.skip_function = {} 79 80 # Cross validation setup. 81 if type(self.runs[0]) == list: 82 # No runs. 83 if len(run) == 0: 84 raise RelaxError, "No runs are availible for use in model selection in the array " + `run` + "." 85 86 # Loop over the runs. 87 for i in xrange(len(self.runs)): 88 for j in xrange(len(self.runs[i])): 89 # The run name. 90 run = self.runs[i][j] 91 92 # Function type. 93 self.function_type[run] = self.relax.data.run_types[self.relax.data.run_names.index(run)] 94 95 # Store the first non-hybrid run. 96 if not self.first_run and self.function_type[run] != 'hybrid': 97 self.first_run = run 98 99 # Specific duplicate data, number of instances, and model statistics functions. 100 self.duplicate_data[run] = self.relax.specific_setup.setup('duplicate_data', self.function_type[run]) 101 self.count_num_instances[run] = self.relax.specific_setup.setup('num_instances', self.function_type[run]) 102 self.model_statistics[run] = self.relax.specific_setup.setup('model_stats', self.function_type[run]) 103 self.skip_function[run] = self.relax.specific_setup.setup('skip_function', self.function_type[run]) 104 105 # Run various tests. 106 self.tests(run) 107 108 # All other model selection setup. 109 else: 110 # Loop over the runs. 111 for i in xrange(len(self.runs)): 112 # The run name. 113 run = self.runs[i] 114 115 # Function type. 116 self.function_type[run] = self.relax.data.run_types[self.relax.data.run_names.index(run)] 117 118 # Store the first non-hybrid run. 119 if not self.first_run and self.function_type[run] != 'hybrid': 120 self.first_run = run 121 122 # Specific duplicate data, number of instances, and model statistics functions. 123 self.duplicate_data[run] = self.relax.specific_setup.setup('duplicate_data', self.function_type[run]) 124 self.count_num_instances[run] = self.relax.specific_setup.setup('num_instances', self.function_type[run]) 125 self.model_statistics[run] = self.relax.specific_setup.setup('model_stats', self.function_type[run]) 126 self.skip_function[run] = self.relax.specific_setup.setup('skip_function', self.function_type[run]) 127 128 # Run various tests. 129 self.tests(run) 130 131 132 # Number of instances. If the number is not the same for each run, then the minimum number will give the specific function self.model_statistics the 133 # opportunity to consolidate the instances to the minimum number if possible. 134 self.min_instances = 1e99 135 self.num_instances = [] 136 for i in xrange(len(self.runs)): 137 # An array of arrays - for cross validation model selection. 138 if type(self.runs[i]) == list: 139 self.num_instances.append([]) 140 141 # Loop over the nested array. 142 for j in xrange(len(self.runs[i])): 143 # Number of instances. 144 num = self.count_num_instances[self.runs[i][j]](self.runs[i][j]) 145 self.num_instances[i].append(num) 146 147 # Minimum. 148 if num < self.min_instances: 149 self.min_instances = num 150 151 # All other model selection techniques. 152 else: 153 # Number of instances. 154 num = self.count_num_instances[self.runs[i]](self.runs[i]) 155 self.num_instances.append(num) 156 157 # Minimum. 158 if num < self.min_instances: 159 self.min_instances = num 160 161 162 # Loop over the number of instances. 163 for i in xrange(self.min_instances): 164 # Print out. 165 print "\nInstance " + `i` + ".\n" 166 print "%-20s %-20s %-20s %-20s %-20s" % ("Run", "Num_params_(k)", "Num_data_sets_(n)", "Chi2", "Criterion") 167 168 # Initial model. 169 best_model = None 170 best_crit = 1e300 171 172 # Loop over the runs. 173 for j in xrange(len(self.runs)): 174 # Single-item-out cross validation. 175 if method == 'CV': 176 # Sum of chi-squared values. 177 sum_crit = 0.0 178 179 # Loop over the validation samples and sum the chi-squared values. 180 for k in xrange(len(self.runs[j])): 181 # Reassign the run. 182 run = self.runs[j][k] 183 184 # Skip function. 185 if self.skip_function[run](run=run, instance=i): 186 continue 187 188 # Get the model statistics. 189 k, n, chi2 = self.model_statistics[run](run=run, instance=i, min_instances=self.min_instances) 190 191 # Missing data sets. 192 if k == None or n == None or chi2 == None: 193 continue 194 195 # Chi2 sum. 196 sum_crit = sum_crit + chi2 197 198 # Cross-validation criterion (average chi-squared value). 199 crit = sum_crit / float(len(self.runs[j])) 200 201 # Other model selection methods. 202 else: 203 # Reassign the run. 204 run = self.runs[j] 205 206 # Skip function. 207 if self.skip_function[run](run=run, instance=i, min_instances=self.min_instances, num_instances=self.num_instances[j]): 208 continue 209 210 # Global stats. 211 if self.num_instances[j] > self.min_instances or self.num_instances[j] == 1: 212 global_stats = 1 213 else: 214 global_stats = 0 215 216 # Get the model statistics. 217 k, n, chi2 = self.model_statistics[run](run=run, instance=i, global_stats=global_stats) 218 219 # Missing data sets. 220 if k == None or n == None or chi2 == None: 221 continue 222 223 # Calculate the criterion value. 224 crit = self.formula(chi2, float(k), float(n)) 225 226 # Print out. 227 print "%-20s %-20i %-20i %-20.5f %-20.5f" % (run, k, n, chi2, crit) 228 229 # Select model. 230 if crit < best_crit: 231 best_model = run 232 best_crit = crit 233 234 # Print out of selected model. 235 print "\nThe model from the run " + `best_model` + " has been selected." 236 237 # Duplicate the data from the 'best_model' to the model selection run 'modsel_run'. 238 if best_model != None: 239 self.duplicate_data[best_model](new_run=modsel_run, old_run=best_model, instance=i, global_stats=global_stats)

240 241

242 - def aic(self, chi2, k, n):

243 """Akaike's Information Criteria (AIC). 244 245 The formula is: 246 247 AIC = chi2 + 2k 248 249 where: 250 chi2 is the minimised chi-squared value. 251 k is the number of parameters in the model. 252 """ 253 254 return chi2 + 2.0*k

255 256

257 - def aicc(self, chi2, k, n):

258 """Small sample size corrected AIC. 259 260 The formula is: 261 262 2k(k + 1) 263 AICc = chi2 + 2k + --------- 264 n - k - 1 265 266 where: 267 chi2 is the minimised chi-squared value. 268 k is the number of parameters in the model. 269 n is the dimension of the relaxation data set. 270 """ 271 272 return chi2 + 2.0*k + 2.0*k*(k + 1.0) / (n - k - 1.0)

273 274

275 - def bic(self, chi2, k, n):

276 """Bayesian or Schwarz Information Criteria. 277 278 The formula is: 279 280 BIC = chi2 + k ln n 281 282 where: 283 chi2 - is the minimised chi-squared value. 284 k - is the number of parameters in the model. 285 n is the dimension of the relaxation data set. 286 """ 287 288 return chi2 + k * log(n)

289 290

291 - def tests(self, run):

292 """Function containing tests the given run.""" 293 294 # Test if the run exists. 295 if not run in self.relax.data.run_names: 296 raise RelaxNoRunError, run 297 298 # Find the index of the run. 299 index = self.relax.data.run_names.index(run) 300 301 # Test if the function type is the same as 'self.function_type' (skip the test if self.function_type is a hybrid). 302 #if self.function_type != 'hybrid' and self.relax.data.run_types[index] != self.function_type: 303 if self.relax.data.run_types[index] != self.function_type[run]: 304 raise RelaxError, "The runs supplied are not all of the same function type." 305 306 # Skip all tests if the model is a hybrid. 307 if self.relax.data.run_types[index] == 'hybrid': 308 return 309 310 # Test if sequence data is loaded. 311 if not self.relax.data.res.has_key(run): 312 raise RelaxNoSequenceError, run 313 314 # Sequence lengths. 315 if len(self.relax.data.res[self.first_run]) != len(self.relax.data.res[run]): 316 raise RelaxDiffSeqError, (self.first_run, run) 317 318 # Loop over the sequence and test that the residue number and residue name are the same. 319 for i in xrange(len(self.relax.data.res[self.first_run])): 320 # Residue number. 321 if self.relax.data.res[self.first_run][i].num != self.relax.data.res[run][i].num: 322 raise RelaxDiffSeqError, (self.first_run, run) 323 324 # Residue name. 325 if self.relax.data.res[self.first_run][i].name != self.relax.data.res[run][i].name: 326 raise RelaxDiffSeqError, (self.first_run, run)

Source Code for Module generic_fns.model_selection