lib.statistics

1 ############################################################################### 2 # # 3 # Copyright (C) 2013,2015 Edward d'Auvergne # 4 # Copyright (C) 2014 Troels E. Linnet # 5 # # 6 # This file is part of the program relax (http://www.nmr-relax.com). # 7 # # 8 # This program is free software: you can redistribute it and/or modify # 9 # it under the terms of the GNU General Public License as published by # 10 # the Free Software Foundation, either version 3 of the License, or # 11 # (at your option) any later version. # 12 # # 13 # This program is distributed in the hope that it will be useful, # 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of # 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 16 # GNU General Public License for more details. # 17 # # 18 # You should have received a copy of the GNU General Public License # 19 # along with this program. If not, see <http://www.gnu.org/licenses/>. # 20 # # 21 ############################################################################### 22 23 # Module docstring. 24 """Module for calculating simple statistics.""" 25 26 # Python module imports. 27 from math import exp 28 from numpy import absolute, array, diag, dot, eye, float64, log, multiply, transpose 29 from numpy.linalg import inv, qr 30 31 # Python module imports. 32 from math import exp, pi, sqrt 33 34

35 -def bucket(values=None, lower=0.0, upper=200.0, inc=100, verbose=False):

36 """Generate a discrete probability distribution for the given values. 37 38 @keyword values: The list of values to convert. 39 @type values: list of float 40 @keyword lower: The lower bound of the distribution. 41 @type lower: float 42 @keyword upper: The upper bound of the distribution. 43 @type upper: float 44 @keyword inc: The number of discrete increments for the distribution between the lower and upper bounds. 45 @type inc: int 46 @keyword verbose: A flag which if True will enable printouts. 47 @type verbose: bool 48 @return: The discrete probability distribution. 49 @rtype: list of lists of float 50 """ 51 52 # The bin width. 53 bin_width = (upper - lower)/float(inc) 54 55 # Init the dist object. 56 dist = [] 57 for i in range(inc): 58 dist.append([bin_width*i+lower, 0]) 59 60 # Loop over the values. 61 for val in values: 62 # The bin. 63 bin = int((val - lower)/bin_width) 64 65 # Outside of the limits. 66 if bin < 0 or bin >= inc: 67 if verbose: 68 print("Outside of the limits: '%s'" % val) 69 continue 70 71 # Increment the count. 72 dist[bin][1] = dist[bin][1] + 1 73 74 # Convert the counts to frequencies. 75 total_pr = 0.0 76 for i in range(inc): 77 dist[i][1] = dist[i][1] / float(len(values)) 78 total_pr = total_pr + dist[i][1] 79 80 # Printout. 81 if verbose: 82 print("Total Pr: %s" % total_pr) 83 84 # Return the dist. 85 return dist

86 87

88 -def gaussian(x=None, mu=0.0, sigma=1.0):

89 """Calculate the probability for a Gaussian probability distribution for a given x value. 90 91 @keyword x: The x value to calculate the probability for. 92 @type x: float 93 @keyword mu: The mean of the distribution. 94 @type mu: float 95 @keyword sigma: The standard deviation of the distribution. 96 @type sigma: float 97 @return: The probability corresponding to x. 98 @rtype: float 99 """ 100 101 # Calculate and return the probability. 102 return exp(-(x-mu)**2 / (2.0*sigma**2)) / (sigma * sqrt(2.0 * pi))

103 104

105 -def geometric_mean(values=None):

106 """Calculate the geometric mean for the given values. 107 108 @keyword values: The list of values to calculate the geometric mean of. 109 @type values: list of float 110 @return: The geometric mean. 111 @rtype: float 112 """ 113 114 # Init. 115 n = len(values) 116 values = array(values, float64) 117 118 # The log of all values. 119 log_vals = log(values) 120 121 # Calculate the sum of the log values for all points. 122 Xsum = log_vals.sum() 123 124 # Average. 125 Xav = Xsum / float(n) 126 127 # Return the geometric mean. 128 return exp(Xav)

129 130

131 -def geometric_std(values=None, mean=None):

132 """Calculate the geometric standard deviation for the given values. 133 134 @keyword values: The list of values to calculate the geometric mean of. 135 @type values: list of float 136 @keyword mean: The pre-calculated geometric mean. If not supplied, the value will be calculated. 137 @type mean: float 138 @return: The geometric mean. 139 @rtype: float 140 """ 141 142 # Init. 143 n = len(values) 144 values = array(values, float64) 145 146 # The geometric mean. 147 if mean == None: 148 mean = geometric_mean(values=values) 149 150 # Divide the values by the geometric mean, take the log, and square everything. 151 factor = (log(values / mean))**2 152 153 # The square root of the sum divided by n. 154 factor2 = sqrt(factor.sum() / n) 155 156 # Return the geometric std. 157 return exp(factor2)

158 159

160 -def std(values=None, skip=None, dof=1):

161 """Calculate the standard deviation of the given values, skipping values if asked. 162 163 @keyword values: The list of values to calculate the standard deviation of. 164 @type values: list of float 165 @keyword skip: An optional list of booleans specifying if a value should be skipped. The length of this list must match the values. An element of True will cause the corresponding value to not be included in the calculation. 166 @type skip: list of bool or None. 167 @keyword dof: The degrees of freedom, whereby the standard deviation is multipled by 1/(N - dof). 168 @type dof: int 169 @return: The standard deviation. 170 @rtype: float 171 """ 172 173 # The total number of points. 174 n = 0 175 for i in range(len(values)): 176 # Skip deselected values. 177 if skip != None and not skip[i]: 178 continue 179 180 # Increment n. 181 n = n + 1 182 183 # Calculate the sum of the values for all points. 184 Xsum = 0.0 185 for i in range(len(values)): 186 # Skip deselected values. 187 if skip != None and not skip[i]: 188 continue 189 190 # Sum. 191 Xsum = Xsum + values[i] 192 193 # Calculate the mean value for all points. 194 if n == 0: 195 Xav = 0.0 196 else: 197 Xav = Xsum / float(n) 198 199 # Calculate the sum part of the standard deviation. 200 sd = 0.0 201 for i in range(len(values)): 202 # Skip deselected values. 203 if skip != None and not skip[i]: 204 continue 205 206 # Sum. 207 sd = sd + (values[i] - Xav)**2 208 209 # Calculate the standard deviation. 210 if n <= 1: 211 sd = 0.0 212 else: 213 sd = sqrt(sd / (float(n) - float(dof))) 214 215 # Return the SD. 216 return sd

217 218

219 -def multifit_covar(J=None, epsrel=0.0, weights=None):

220 """This is the implementation of the multifit covariance. 221 222 This is inspired from GNU Scientific Library (GSL). 223 224 This function uses the Jacobian matrix J to compute the covariance matrix of the best-fit parameters, covar. 225 226 The parameter 'epsrel' is used to remove linear-dependent columns when J is rank deficient. 227 228 The weighting matrix 'W', is a square symmetric matrix. For independent measurements, this is a diagonal matrix. Larger values indicate greater significance. It is formed by multiplying and Identity matrix with the supplied weights vector:: 229 230 W = I. w 231 232 The weights should normally be supplied as a vector: 1 / errors^2. 233 234 The covariance matrix is given by:: 235 236 covar = (J^T.W.J)^{-1} , 237 238 and is computed by QR decomposition of J with column-pivoting. Any columns of R which satisfy:: 239 240 |R_{kk}| <= epsrel |R_{11}| , 241 242 are considered linearly-dependent and are excluded from the covariance matrix (the corresponding rows and columns of the covariance matrix are set to zero). If the minimisation uses the weighted least-squares function:: 243 244 f_i = (Y(x, t_i) - y_i) / sigma_i , 245 246 then the covariance matrix above gives the statistical error on the best-fit parameters resulting from the Gaussian errors 'sigma_i' on the underlying data 'y_i'. 247 248 This can be verified from the relation 'd_f = J d_c' and the fact that the fluctuations in 'f' from the data 'y_i' are normalised by 'sigma_i' and so satisfy:: 249 250 <d_f d_f^T> = I. , 251 252 For an unweighted least-squares function f_i = (Y(x, t_i) - y_i) the covariance matrix above should be multiplied by the variance of the residuals about the best-fit:: 253 254 sigma^2 = sum ( (y_i - Y(x, t_i))^2 / (n-p) ) , 255 256 to give the variance-covariance matrix sigma^2 C. This estimates the statistical error on the best-fit parameters from the scatter of the underlying data. 257 258 Links 259 ===== 260 261 More information ca be found here: 262 263 - U{GSL - GNU Scientific Library<http://www.gnu.org/software/gsl/>} 264 - U{Manual: Overview<http://www.gnu.org/software/gsl/manual/gsl-ref_37.html#SEC510>} 265 - U{Manual: Computing the covariance matrix of best fit parameters<http://www.gnu.org/software/gsl/manual/gsl-ref_38.html#SEC528>} 266 - U{Other reference<http://www.orbitals.com/self/least/least.htm>} 267 268 @param J: The Jacobian matrix. 269 @type J: numpy array 270 @param epsrel: Any columns of R which satisfy |R_{kk}| <= epsrel |R_{11}| are considered linearly-dependent and are excluded from the covariance matrix, where the corresponding rows and columns of the covariance matrix are set to zero. 271 @type epsrel: float 272 @keyword weigths: The weigths which to scale with. Normally submitted as the 1 over standard deviation of the measured intensity values per time point in power 2. weigths = 1 / sd_i^2. 273 @type weigths: numpy array 274 @return: The co-variance matrix 275 @rtype: square numpy array 276 """ 277 278 # Weighting matrix. This is a square symmetric matrix. 279 # For independent measurements, this is a diagonal matrix. Larger values indicate greater significance. 280 281 # Make a square diagonal matrix. 282 eye_mat = eye(weights.shape[0]) 283 284 # Form weight matrix. 285 W = multiply(eye_mat, weights) 286 287 # The covariance matrix (sometimes referred to as the variance-covariance matrix), Qxx, is defined as: 288 # Qxx = (J^t W J)^(-1) 289 290 # Calculate step by step, by matrix multiplication. 291 Jt = transpose(J) 292 Jt_W = dot(Jt, W) 293 Jt_W_J = dot(Jt_W, J) 294 295 # Invert matrix by QR decomposition, to check columns of R which satisfy: |R_{kk}| <= epsrel |R_{11}| 296 Q, R = qr(Jt_W_J) 297 298 # Make the state ment matrix. 299 abs_epsrel_R11 = absolute( multiply(epsrel, R[0, 0]) ) 300 301 # Make and array of True/False statements. 302 # These are considered linearly-dependent and are excluded from the covariance matrix. 303 # The corresponding rows and columns of the covariance matrix are set to zero 304 epsrel_check = absolute(R) <= abs_epsrel_R11 305 306 # Form the covariance matrix. 307 Qxx = dot(inv(R), transpose(Q) ) 308 #Qxx2 = dot(inv(R), inv(Q) ) 309 #print(Qxx - Qxx2) 310 311 # Test direct invert matrix of matrix. 312 #Qxx_test = inv(Jt_W_J) 313 314 # Replace values in Covariance matrix with inf. 315 Qxx[epsrel_check] = 0.0 316 317 # Throw a warning, that some colums are considered linearly-dependent and are excluded from the covariance matrix. 318 # Only check for the diagonal, since the that holds the variance. 319 diag_epsrel_check = diag(epsrel_check) 320 321 # If any of the diagonals does not meet the epsrel condition. 322 if any(diag_epsrel_check): 323 for i in range(diag_epsrel_check.shape[0]): 324 abs_Rkk = absolute(R[i, i]) 325 if abs_Rkk <= abs_epsrel_R11: 326 warn(RelaxWarning("Co-Variance element k,k=%i was found to meet |R_{kk}| <= epsrel |R_{11}|, meaning %1.1f <= %1.3f * %1.1f , and is therefore determined to be linearly-dependent and are excluded from the covariance matrix by setting the value to 0.0." % (i+1, abs_Rkk, epsrel, abs_epsrel_R11/epsrel) )) 327 #print(cond(Jt_W_J) < 1./spacing(1.) ) 328 329 return Qxx

330

Source Code for Module lib.statistics