lib.sequence

1 from __future__ import absolute_import 2 ############################################################################### 3 # # 4 # Copyright (C) 2003-2014 Edward d'Auvergne # 5 # # 6 # This file is part of the program relax (http://www.nmr-relax.com). # 7 # # 8 # This program is free software: you can redistribute it and/or modify # 9 # it under the terms of the GNU General Public License as published by # 10 # the Free Software Foundation, either version 3 of the License, or # 11 # (at your option) any later version. # 12 # # 13 # This program is distributed in the hope that it will be useful, # 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of # 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 16 # GNU General Public License for more details. # 17 # # 18 # You should have received a copy of the GNU General Public License # 19 # along with this program. If not, see <http://www.gnu.org/licenses/>. # 20 # # 21 ############################################################################### 22 23 # Module docstring. 24 """Module for handling the molecule, residue, and spin sequence data.""" 25 26 # Python module imports. 27 import sys 28 from warnings import warn 29 30 # relax module imports. 31 from lib.check_types import is_float 32 from lib.errors import RelaxError, RelaxInvalidSeqError 33 from lib.io import extract_data, open_write_file, strip, write_data 34 from lib.selection import spin_id_to_data_list 35 from lib.warnings import RelaxWarning, RelaxFileEmptyWarning 36 37

def _column_text(line, col):
    """Return the text held in the 1-based column 'col' of the line, or None if the column is unset or holds the string 'None'."""

    if col is None:
        return None
    text = line[col-1]
    if text == 'None':
        return None
    return text


def _column_number(line, col, convert):
    """Return convert() applied to the text of the 1-based column 'col', or None if unset or 'None' (a ValueError from convert() propagates)."""

    text = _column_text(line, col)
    if text is None:
        return None
    return convert(text)


def read_spin_data(file=None, dir=None, file_data=None, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, error_col=None, sep=None, spin_id=None):
    """Generator function for reading the spin specific data from file.

    Description
    ===========

    This function reads a columnar formatted file where each line corresponds to a spin system.  Spin identification is either through a spin ID string or through columns containing the molecule name, residue name and number, and/or spin name and number.

    Lines which fail the sequence validation, or which contain values that cannot be converted, are skipped with a RelaxWarning rather than aborting the read.


    @keyword file:          The name of the file to open.
    @type file:             str
    @keyword dir:           The directory containing the file (defaults to the current directory if None).
    @type dir:              str or None
    @keyword file_data:     An alternative to opening a file, if the data already exists in the correct format.  The format is a list of lists where the first index corresponds to the row and the second the column.
    @type file_data:        list of lists
    @keyword spin_id_col:   The column containing the spin ID strings.  If supplied, the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col arguments must be None.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information.  If supplied, spin_id_col must be None.
    @type mol_name_col:     int or None
    @keyword res_name_col:  The column containing the residue name information.  If supplied, spin_id_col must be None.
    @type res_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information.  If supplied, spin_id_col must be None.
    @type res_num_col:      int or None
    @keyword spin_name_col: The column containing the spin name information.  If supplied, spin_id_col must be None.
    @type spin_name_col:    int or None
    @keyword spin_num_col:  The column containing the spin number information.  If supplied, spin_id_col must be None.
    @type spin_num_col:     int or None
    @keyword data_col:      The column containing the data.
    @type data_col:         int or None
    @keyword error_col:     The column containing the errors.
    @type error_col:        int or None
    @keyword sep:           The column separator.  This argument is accepted but is not used by the body of this function.
    @type sep:              str or None
    @keyword spin_id:       The spin ID string intended to restrict data loading to a subset of all spins.  This argument is accepted but is not used by the body of this function.
    @type spin_id:          None or str
    @raise RelaxError:      If a column argument is zero, if spin_id_col is mixed with the other sequence columns, or if no line of the file yielded any data.
    @raise ValueError:      If none of the column arguments are supplied (raised by max() on an empty sequence).
    @return:                For each valid line, a tuple of the molecule name, residue number, residue name, spin number and spin name is yielded, followed by the data value (if data_col is given) and then the error value (if error_col is given).
    @rtype:                 tuple of (str or None, int or None, str or None, int or None, str or None[, float or None[, float or None]])
    """

    # Function scope import so that this block does not depend on the file level import list.
    from ast import literal_eval

    # Argument tests.
    col_args = [spin_id_col, mol_name_col, res_name_col, res_num_col, spin_name_col, spin_num_col, data_col, error_col]
    col_arg_names = ['spin_id_col', 'mol_name_col', 'res_name_col', 'res_num_col', 'spin_name_col', 'spin_num_col', 'data_col', 'error_col']
    for name, col in zip(col_arg_names, col_args):
        if col == 0:
            raise RelaxError("The '%s' argument cannot be zero, column numbering starts at one." % name)
    if spin_id_col and (mol_name_col or res_name_col or res_num_col or spin_name_col or spin_num_col):
        raise RelaxError("If the 'spin_id_col' argument has been supplied, then the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col must all be set to None.")

    # Minimum number of columns.
    # NOTE(review):  the value is never used below, but the statement is kept as max() raises a ValueError when no column argument has been supplied.
    min_col_num = max(filter(None, col_args))

    # Extract the data from the file.
    # NOTE(review):  'sep' is not forwarded to extract_data() - confirm whether it should be.
    if not file_data:
        file_data = extract_data(file, dir)

    # Strip the data of all comments and empty lines (comments are kept when spin IDs are used, as these contain the '#' character).
    if spin_id_col is not None:
        file_data = strip(file_data, comments=False)
    else:
        file_data = strip(file_data)

    # No data!
    if not file_data:
        warn(RelaxFileEmptyWarning(file))
        return

    # Yield the data, spin by spin.
    missing_data = True
    for line in file_data:
        # Validate the sequence first, so that lines which are too short are skipped with a warning rather than failing on an index below.
        try:
            validate_sequence(line, spin_id_col=spin_id_col, mol_name_col=mol_name_col, res_num_col=res_num_col, res_name_col=res_name_col, spin_num_col=spin_num_col, spin_name_col=spin_name_col, data_col=data_col, error_col=error_col)
        except RelaxInvalidSeqError:
            # Extract the message string, without the RelaxError bit.
            msg = sys.exc_info()[1]
            string = msg.__str__()[12:-1]

            # Give a warning.
            warn(RelaxWarning(string))

            # Skip the line.
            continue

        # Convert quoted spin IDs into plain strings (only literals are evaluated, never arbitrary expressions from the file).
        if spin_id_col is not None:
            raw_id = line[spin_id_col-1]
            if raw_id[:1] in ("\"", "\'"):
                try:
                    line[spin_id_col-1] = literal_eval(raw_id)
                except (ValueError, TypeError, SyntaxError, MemoryError, RuntimeError):
                    warn(RelaxWarning("Invalid spin ID, skipping the line %s" % line))
                    continue

        # Get the spin data from the ID.
        if spin_id_col:
            # Invalid spin ID.
            if line[spin_id_col-1] == '#':
                warn(RelaxWarning("Invalid spin ID, skipping the line %s" % line))
                continue

            mol_name, res_num, res_name, spin_num, spin_name = spin_id_to_data_list(line[spin_id_col-1])

        # Convert the spin data.
        else:
            # The molecule.
            mol_name = _column_text(line, mol_name_col)

            # The residue number, catching bad values.
            try:
                res_num = _column_number(line, res_num_col, int)
            except ValueError:
                warn(RelaxWarning("Invalid residue number, skipping the line %s" % line))
                continue

            # The residue name.
            res_name = _column_text(line, res_name_col)

            # The spin number, catching bad values.
            try:
                spin_num = _column_number(line, spin_num_col, int)
            except ValueError:
                warn(RelaxWarning("Invalid spin number, skipping the line %s" % line))
                continue

            # The spin name.
            spin_name = _column_text(line, spin_name_col)

        # Convert the data.
        try:
            value = _column_number(line, data_col, float)
        except ValueError:
            warn(RelaxWarning("Invalid data, skipping the line %s" % line))
            continue

        # Convert the errors.
        try:
            error = _column_number(line, error_col, float)
        except ValueError:
            warn(RelaxWarning("Invalid errors, skipping the line %s" % line))
            continue

        # Right, data is OK and exists.
        missing_data = False

        # Yield the sequence info, followed by the data and error when their columns were requested.
        record = (mol_name, res_num, res_name, spin_num, spin_name)
        if data_col:
            record = record + (value,)
        if error_col:
            record = record + (error,)
        yield record

    # Hmmm, no data!
    if missing_data:
        raise RelaxError("No corresponding data could be found within the file.")

227 228

def _is_int_or_none_literal(text):
    """Return True if the text is the Python literal form of an integer or of None, and False otherwise."""

    # Function scope import so that this block does not depend on the file level import list.
    from ast import literal_eval

    # Only literals are evaluated, so text read from a file can never execute code.
    try:
        value = literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RuntimeError):
        return False

    return value is None or isinstance(value, int)


def validate_sequence(data, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, error_col=None):
    """Test if the sequence data is valid.

    The only function this performs is to raise a RelaxInvalidSeqError if the data is invalid.  For each column argument which is supplied, the data must contain at least that many elements.  The residue and spin number columns must additionally hold an integer or the text 'None'.


    @param data:                    The sequence data for a single line, indexed by column.
    @type data:                     list of str
    @keyword spin_id_col:           The column containing the spin ID strings.
    @type spin_id_col:              int or None
    @keyword mol_name_col:          The column containing the molecule name information.
    @type mol_name_col:             int or None
    @keyword res_num_col:           The column containing the residue number information.
    @type res_num_col:              int or None
    @keyword res_name_col:          The column containing the residue name information.
    @type res_name_col:             int or None
    @keyword spin_num_col:          The column containing the spin number information.
    @type spin_num_col:             int or None
    @keyword spin_name_col:         The column containing the spin name information.
    @type spin_name_col:            int or None
    @keyword data_col:              The column containing the data.
    @type data_col:                 int or None
    @keyword error_col:             The column containing the errors.
    @type error_col:                int or None
    @raise RelaxInvalidSeqError:    If a requested column is missing from the data, or if the residue or spin number is neither an integer nor None.
    """

    # Spin ID.
    if spin_id_col and len(data) < spin_id_col:
        raise RelaxInvalidSeqError(data, "the Spin ID data is missing")

    # Molecule name data.
    if mol_name_col and len(data) < mol_name_col:
        raise RelaxInvalidSeqError(data, "the molecule name data is missing")

    # Residue number data.
    if res_num_col:
        # No data in column.
        if len(data) < res_num_col:
            raise RelaxInvalidSeqError(data, "the residue number data is missing")

        # Bad data in column.
        if not _is_int_or_none_literal(data[res_num_col-1]):
            raise RelaxInvalidSeqError(data, "the residue number data '%s' is invalid" % data[res_num_col-1])

    # Residue name data.
    if res_name_col and len(data) < res_name_col:
        raise RelaxInvalidSeqError(data, "the residue name data is missing")

    # Spin number data.
    if spin_num_col:
        # No data in column.
        if len(data) < spin_num_col:
            raise RelaxInvalidSeqError(data, "the spin number data is missing")

        # Bad data in column (this must look at the spin number column, not the residue number column).
        if not _is_int_or_none_literal(data[spin_num_col-1]):
            raise RelaxInvalidSeqError(data, "the spin number data '%s' is invalid" % data[spin_num_col-1])

    # Spin name data.
    if spin_name_col and len(data) < spin_name_col:
        raise RelaxInvalidSeqError(data, "the spin name data is missing")

    # Data.
    if data_col and len(data) < data_col:
        raise RelaxInvalidSeqError(data, "the data is missing")

    # Errors.
    if error_col and len(data) < error_col:
        raise RelaxInvalidSeqError(data, "the error data is missing")

308 309

def write_spin_data(file, dir=None, sep=None, spin_ids=None, mol_names=None, res_nums=None, res_names=None, spin_nums=None, spin_names=None, force=False, data=None, data_name=None, error=None, error_name=None, float_format="%20.15g"):
    """Write the spin specific data to file.

    Description
    ===========

    This function writes a columnar formatted file where each line corresponds to a spin system.  Spin identification is either through a spin ID string or through columns containing the molecule name, residue name and number, and/or spin name and number.  If there is nothing to write, the function returns without opening the file.


    @param file:            The name of the file to write the data to (or alternatively an already opened file object).
    @type file:             str or file object
    @keyword dir:           The directory to place the file into (defaults to the current directory if None and the file argument is not a file object).
    @type dir:              str or None
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_ids:      The list of spin ID strings.
    @type spin_ids:         None or list of str
    @keyword mol_names:     The list of molecule names.
    @type mol_names:        None or list of str
    @keyword res_nums:      The list of residue numbers.
    @type res_nums:         None or list of int
    @keyword res_names:     The list of residue names.
    @type res_names:        None or list of str
    @keyword spin_nums:     The list of spin numbers.
    @type spin_nums:        None or list of int
    @keyword spin_names:    The list of spin names.
    @type spin_names:       None or list of str
    @keyword force:         A flag which if True will cause an existing file to be overwritten.
    @type force:            bool
    @keyword data:          A list of the data to write out.  The first dimension corresponds to the spins.  A second dimension can also be given if multiple data sets across multiple columns are desired.
    @type data:             list or list of lists
    @keyword data_name:     A name corresponding to the data argument.  If the data argument is a list of lists, then this must also be a list with the same length as the second dimension of the data arg.
    @type data_name:        str or list of str
    @keyword error:         A list of the errors to write out.  The first dimension corresponds to the spins.  A second dimension can also be given if multiple data sets across multiple columns are desired.  These will be interleaved with the data columns, if the data is given.  If the data arg is not None, then this must have the same dimensions as that object.
    @type error:            list or list of lists
    @keyword error_name:    A name corresponding to the error argument.  If the error argument is a list of lists, then this must also be a list with the same length as the second dimension of the error arg.
    @type error_name:       str or list of str
    @keyword float_format:  A float formatting string to use for the data and error whenever a float is found.
    @type float_format:     str
    @raise RelaxError:      If the names do not match the shape of the data or errors, if the data and error dimensions differ, if the per-spin lists differ in length, or if no spin identification list is given.
    """

    def format_value(number):
        """Format floats with float_format and everything else with repr()."""

        if is_float(number):
            return float_format % number
        return repr(number)

    # Check the data against its name and against the error dimensions.
    data_is_2d = False
    if data:
        data_is_2d = isinstance(data[0], list)
        if data_is_2d:
            if not isinstance(data_name, list):
                raise RelaxError("The data_name arg '%s' must be a list as the data argument is a list of lists." % data_name)
            dims_differ = error and (len(data) != len(error) or len(data[0]) != len(error[0]))
        else:
            if not isinstance(data_name, str):
                raise RelaxError("The data_name arg '%s' must be a string as the data argument is a simple list." % data_name)
            dims_differ = error and len(data) != len(error)
        if dims_differ:
            raise RelaxError("The data arg:\n%s\n\ndoes not have the same dimensions as the error arg:\n%s." % (data, error))

    # Check the error against its name.
    error_is_2d = False
    if error:
        error_is_2d = isinstance(error[0], list)
        if error_is_2d:
            if not isinstance(error_name, list):
                raise RelaxError("The error_name arg '%s' must be a list as the error argument is a list of lists." % error_name)
        elif not isinstance(error_name, str):
            raise RelaxError("The error_name arg '%s' must be a string as the error argument is a simple list." % error_name)

    # Determine the number of spins from the first list found, and require all other lists to match it.
    id_lists = [spin_ids, mol_names, res_nums, res_names, spin_nums, spin_names]
    N = None
    first_arg = None
    first_arg_name = None
    for label, seq in zip(['spin_ids', 'mol_names', 'res_nums', 'res_names', 'spin_nums', 'spin_names'], id_lists):
        if not isinstance(seq, list):
            continue
        if N is None:
            N, first_arg, first_arg_name = len(seq), seq, label
        if len(seq) != N:
            raise RelaxError("The %s and %s arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, label, len(first_arg), len(seq)))

    # Nothing identifies the spins.
    if N is None:
        raise RelaxError("No spin ID data is present.")

    # The data and errors must also have one entry per spin.
    if data and len(data) != N:
        raise RelaxError("The %s and data arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, len(first_arg), len(data)))
    if error and len(error) != N:
        raise RelaxError("The %s and error arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, len(first_arg), len(error)))

    # The non-empty spin identification columns, paired with their column headings.
    used_columns = [(label, seq) for label, seq in zip(['spin_id', 'mol_name', 'res_num', 'res_name', 'spin_num', 'spin_name'], id_lists) if seq]

    # Headers - the spin ID info.
    headings = [label for label, seq in used_columns]

    # Headers - the data, interleaved with the errors when present.
    if data:
        if data_is_2d:
            for col in range(len(data[0])):
                headings.append(data_name[col])
                if error:
                    headings.append(error_name[col])
        else:
            headings.append(data_name)
            if error:
                headings.append(error_name)

    # Headers - only errors.
    elif error:
        if error_is_2d:
            for col in range(len(error[0])):
                headings.append(error_name[col])
        else:
            headings.append(error_name)

    # No headings.
    if not headings:
        headings = None

    # Build one row per spin.
    rows = []
    for spin_index in range(N):
        row = []

        # The spin ID info (non-string values are written via repr()).
        for label, seq in used_columns:
            entry = seq[spin_index]
            row.append(entry if isinstance(entry, str) else repr(entry))

        # The data, interleaved with the errors when present.
        if data:
            if data_is_2d:
                for col in range(len(data[0])):
                    row.append(format_value(data[spin_index][col]))
                    if error:
                        row.append(format_value(error[spin_index][col]))
            else:
                row.append(format_value(data[spin_index]))
                if error:
                    row.append(format_value(error[spin_index]))

        # Only errors.
        # NOTE(review):  unlike the data branch above, float_format is not applied here - confirm whether this is intentional.
        elif error:
            if error_is_2d:
                for col in range(len(error[0])):
                    row.append(repr(error[spin_index][col]))
            else:
                row.append(repr(error[spin_index]))

        rows.append(row)

    # No data to write, so do nothing!
    if rows in ([], [[]]):
        return

    # Open the file for writing.
    out_file = open_write_file(file_name=file, dir=dir, force=force)

    # Write out the file data.
    write_data(out=out_file, headings=headings, data=rows, sep=sep)

533

Source Code for Module lib.sequence