generic_fns.model

28 - def __init__(self, relax):

29 """Class containing functions specific to model selection.""" 30 31 self.relax = relax

32 33

34 - def select(self, method=None, modsel_run=None, runs=None):

35 """Model selection function.""" 36 37 # Test if the model selection run exists. 38 if not modsel_run in self.relax.data.run_names: 39 raise RelaxNoRunError, modsel_run 40 41 # The runs argument. 42 if runs == None: 43 # Use the runs from 'self.relax.data.run_names'. 44 self.runs = deepcopy(self.relax.data.run_names) 45 46 # Remove the model selection run name if it is in the list. 47 if modsel_run in self.runs: 48 self.runs.remove(modsel_run) 49 else: 50 self.runs = runs 51 52 # Select the model selection technique. 53 if method == 'AIC': 54 print "AIC model selection." 55 self.formula = self.aic 56 elif method == 'AICc': 57 print "AICc model selection." 58 self.formula = self.aicc 59 elif method == 'BIC': 60 print "BIC model selection." 61 self.formula = self.bic 62 elif method == 'CV': 63 print "CV model selection." 64 raise RelaxError, "The model selection technique " + `method` + " is not currently supported." 65 else: 66 raise RelaxError, "The model selection technique " + `method` + " is not currently supported." 67 68 # No runs. 69 if len(self.runs) == 0: 70 raise RelaxError, "No runs are availible for use in model selection." 71 72 # Store the first run. 73 if type(self.runs[0]) == list: 74 self.first_run = self.runs[0][0] 75 else: 76 self.first_run = self.runs[0] 77 78 # Function type. 79 self.function_type = self.relax.data.run_types[self.relax.data.run_names.index(self.first_run)] 80 81 # Specific duplicate data, number of instances, and model statistics functions. 82 self.duplicate_data = self.relax.specific_setup.setup('duplicate_data', self.function_type) 83 self.count_num_instances = self.relax.specific_setup.setup('num_instances', self.function_type) 84 self.model_statistics = self.relax.specific_setup.setup('model_stats', self.function_type) 85 self.skip_function = self.relax.specific_setup.setup('skip_function', self.function_type) 86 87 # Number of instances. If the number is not the same for each run, then the minimum number will give the specific function self.model_statistics the 88 # opportunity to consolidate the instances to the minimum number if possible. 89 self.min_instances = 1e99 90 self.num_instances = [] 91 for i in xrange(len(self.runs)): 92 # An array of arrays - for cross validation model selection. 93 if type(self.runs[i]) == list: 94 self.num_instances.append([]) 95 96 # Loop over the nested array. 97 for j in xrange(len(self.runs[i])): 98 # Number of instances. 99 num = self.count_num_instances(self.runs[i][j]) 100 self.num_instances[i].append(num) 101 102 # Minimum. 103 if num < self.min_instances: 104 self.min_instances = num 105 106 else: 107 # Number of instances. 108 num = self.count_num_instances(self.runs[i]) 109 self.num_instances.append(num) 110 111 # Minimum. 112 if num < self.min_instances: 113 self.min_instances = num 114 115 # Tests for all runs. 116 for run in self.runs: 117 # An array of arrays - for cross validation model selection. 118 if type(run) == list: 119 # No runs. 120 if len(run) == 0: 121 raise RelaxError, "No runs are availible for use in model selection in the array " + `run` + "." 122 123 # Loop over the nested array. 124 for run2 in run: 125 # Run various tests. 126 self.tests(run2) 127 128 # runs is a normal array. 129 else: 130 # Run various tests. 131 self.tests(run) 132 133 134 # Loop over the number of instances. 135 for i in xrange(self.min_instances): 136 # Print out. 137 print "\nInstance " + `i` + ".\n" 138 print "%-20s %-20s %-20s %-20s %-20s" % ("Run", "Num_params_(k)", "Num_data_sets_(n)", "Chi2", "Criterion") 139 140 # Initial model. 141 best_model = None 142 best_crit = float('inf') 143 144 # Loop over the runs. 145 for j in xrange(len(self.runs)): 146 # Single-item-out cross validation. 147 if method == 'CV': 148 # Reassign the run. 149 run = self.runs[j][0] 150 151 # Sum of chi-squared values. 152 sum_crit = 0.0 153 154 # Loop over the validation samples and sum the chi-squared values. 155 for k in xrange(len(self.runs[j])): 156 # Skip function. 157 if self.skip_function(run=self.runs[j][k], instance=i): 158 continue 159 160 # Get the model statistics. 161 k, n, chi2 = self.model_statistics(run=self.runs[j][k], instance=i, min_instances=self.min_instances, num_instances=self.num_instances[j][k]) 162 163 # Missing data sets. 164 if k == None or n == None or chi2 == None: 165 continue 166 167 # Chi2 sum. 168 sum_crit = sum_crit + chi2 169 170 # Cross-validation criterion (average chi-squared value). 171 crit = sum_crit / float(len(self.runs[j])) 172 173 # Other model selection methods. 174 else: 175 # Reassign the run. 176 run = self.runs[j] 177 178 # Skip function. 179 if self.skip_function(run=run, instance=i, min_instances=self.min_instances, num_instances=self.num_instances[j]): 180 continue 181 182 # Get the model statistics. 183 k, n, chi2 = self.model_statistics(run=run, instance=i, min_instances=self.min_instances, num_instances=self.num_instances[j]) 184 185 # Missing data sets. 186 if k == None or n == None or chi2 == None: 187 continue 188 189 # Calculate the criterion value. 190 crit = self.formula(chi2, float(k), float(n)) 191 192 # Print out. 193 print "%-20s %-20i %-20i %-20.5f %-20.5f" % (run, k, n, chi2, crit) 194 195 # Select model. 196 if crit < best_crit: 197 best_model = run 198 best_crit = crit 199 200 # Print out of selected model. 201 print "\nThe model from the run " + `best_model` + " has been selected." 202 203 # Duplicate the data from the 'best_model' to the model selection run 'modsel_run'. 204 if best_model != None: 205 self.duplicate_data(new_run=modsel_run, old_run=best_model, instance=i)

206 207

208 - def aic(self, chi2, k, n):

209 """Akaike's Information Criteria (AIC). 210 211 The formula is: 212 213 AIC = chi2 + 2k 214 215 where: 216 chi2 is the minimised chi-squared value. 217 k is the number of parameters in the model. 218 """ 219 220 return chi2 + 2.0*k

221 222

223 - def aicc(self, chi2, k, n):

224 """Small sample size corrected AIC. 225 226 The formula is: 227 228 2k(k + 1) 229 AICc = chi2 + 2k + --------- 230 n - k - 1 231 232 where: 233 chi2 is the minimised chi-squared value. 234 k is the number of parameters in the model. 235 n is the dimension of the relaxation data set. 236 """ 237 238 return chi2 + 2.0*k + 2.0*k*(k + 1.0) / (n - k - 1.0)

239 240

241 - def bic(self, chi2, k, n):

242 """Bayesian or Schwarz Information Criteria. 243 244 The formula is: 245 246 BIC = chi2 + k ln n 247 248 where: 249 chi2 - is the minimised chi-squared value. 250 k - is the number of parameters in the model. 251 n is the dimension of the relaxation data set. 252 """ 253 254 return chi2 + k * log(n)

255 256

257 - def tests(self, run):

258 """Function containing tests the given run.""" 259 260 # Test if the run exists. 261 if not run in self.relax.data.run_names: 262 raise RelaxNoRunError, run 263 264 # Find the index of the run. 265 index = self.relax.data.run_names.index(run) 266 267 # Test if the function type is the same as 'self.function_type'. 268 if self.relax.data.run_types[index] != self.function_type: 269 raise RelaxError, "The runs supplied are not all of the same function type." 270 271 # Test if sequence data is loaded. 272 if not self.relax.data.res.has_key(run): 273 raise RelaxNoSequenceError, run 274 275 # Sequence lengths. 276 if len(self.relax.data.res[self.first_run]) != len(self.relax.data.res[run]): 277 raise RelaxDiffSeqError, (self.first_run, run) 278 279 # Loop over the sequence and test that the residue number and residue name are the same. 280 for i in xrange(len(self.relax.data.res[self.first_run])): 281 # Residue number. 282 if self.relax.data.res[self.first_run][i].num != self.relax.data.res[run][i].num: 283 raise RelaxDiffSeqError, (self.first_run, run) 284 285 # Residue name. 286 if self.relax.data.res[self.first_run][i].name != self.relax.data.res[run][i].name: 287 raise RelaxDiffSeqError, (self.first_run, run)

Source Code for Module generic_fns.model_selection