Package lib :: Module sequence
[hide private]
[frames | no frames]

Source Code for Module lib.sequence

  1  from __future__ import absolute_import 
  2  ############################################################################### 
  3  #                                                                             # 
  4  # Copyright (C) 2003-2015 Edward d'Auvergne                                   # 
  5  #                                                                             # 
  6  # This file is part of the program relax (http://www.nmr-relax.com).          # 
  7  #                                                                             # 
  8  # This program is free software: you can redistribute it and/or modify        # 
  9  # it under the terms of the GNU General Public License as published by        # 
 10  # the Free Software Foundation, either version 3 of the License, or           # 
 11  # (at your option) any later version.                                         # 
 12  #                                                                             # 
 13  # This program is distributed in the hope that it will be useful,             # 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of              # 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               # 
 16  # GNU General Public License for more details.                                # 
 17  #                                                                             # 
 18  # You should have received a copy of the GNU General Public License           # 
 19  # along with this program.  If not, see <http://www.gnu.org/licenses/>.       # 
 20  #                                                                             # 
 21  ############################################################################### 
 22   
 23  # Module docstring. 
 24  """Module for handling the molecule, residue, and spin sequence data.""" 
 25   
 26  # Python module imports. 
 27  from warnings import warn 
 28   
 29  # relax module imports. 
 30  from lib.checks import Check 
 31  from lib.check_types import is_float 
 32  from lib.errors import RelaxError, RelaxInvalidSeqError 
 33  from lib.float import isFinite 
 34  from lib.io import extract_data, open_write_file, strip, write_data 
 35  from lib.selection import spin_id_to_data_list 
 36  from lib.warnings import RelaxWarning, RelaxFileEmptyWarning 
 37   
 38   
 39  # The 3 letter to 1 letter amino acid code table. 
 40  AA_CODES = { 
 41      "ALA": "A", 
 42      "ARG": "R", 
 43      "ASN": "N", 
 44      "ASP": "D", 
 45      "CYS": "C", 
 46      "GLU": "E", 
 47      "GLN": "Q", 
 48      "GLY": "G", 
 49      "HIS": "H", 
 50      "ILE": "I", 
 51      "LEU": "L", 
 52      "LYS": "K", 
 53      "MET": "M", 
 54      "PHE": "F", 
 55      "PRO": "P", 
 56      "SER": "S", 
 57      "THR": "T", 
 58      "TRP": "W", 
 59      "TYR": "Y", 
 60      "VAL": "V", 
 61  } 
 62   
 63   
def aa_codes_three_to_one(code):
    """Translate a three letter amino acid code into its one letter equivalent.

    The code is upper-cased before the lookup in the AA_CODES table, so the conversion is case insensitive.  Any code not present in the table is translated to '*'.


    @param code:    The three letter amino acid code to convert.
    @type code:     str
    @return:        The corresponding one letter amino acid code, or '*'.
    @rtype:         str
    """

    # Single table lookup with '*' as the fallback for non-standard residues.
    return AA_CODES.get(code.upper(), '*')
85 86
def _is_int_or_none_text(text):
    """Determine if a column entry is the text of an integer or of None.

    @param text:    The column entry to test.
    @type text:     str
    @return:        True if the entry is the string 'None' or a string which int() can convert, False otherwise (including for all non-string entries).
    @rtype:         bool
    """

    # Only text is accepted.
    if not isinstance(text, str):
        return False

    # The None placeholder.
    text = text.strip()
    if text == 'None':
        return True

    # An integer.
    try:
        int(text)
    except ValueError:
        return False
    return True


def check_sequence_func(data, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, error_col=None):
    """Test if the sequence data is valid.

    The columns are checked in the order spin ID, molecule name, residue number, residue name, spin number, spin name, data, and error.  For the first problem found, a RelaxInvalidSeqError object is returned rather than raised.  Column numbering starts at one and columns set to None (or zero) are not checked.

    The residue and spin number columns must contain either the string 'None' or the text of an integer.  The text is parsed with int() and never evaluated as Python code, as it normally originates from a user supplied file.


    @param data:            The sequence data, i.e. the elements of a single line of the file.
    @type data:             list of str
    @keyword spin_id_col:   The column containing the spin ID strings.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information.
    @type mol_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information.
    @type res_num_col:      int or None
    @keyword res_name_col:  The column containing the residue name information.
    @type res_name_col:     int or None
    @keyword spin_num_col:  The column containing the spin number information.
    @type spin_num_col:     int or None
    @keyword spin_name_col: The column containing the spin name information.
    @type spin_name_col:    int or None
    @keyword data_col:      The column containing the data.
    @type data_col:         int or None
    @keyword error_col:     The column containing the errors.
    @type error_col:        int or None
    @return:                The error object describing the first problem found, or None if the data is valid.
    @rtype:                 RelaxInvalidSeqError instance or None
    """

    # The ordered checks:  the column, the message for a missing column, and the message template for a non-integer entry (None if any text is allowed).
    checks = (
        (spin_id_col, "the Spin ID data is missing", None),
        (mol_name_col, "the molecule name data is missing", None),
        (res_num_col, "the residue number data is missing", "the residue number data '%s' is invalid"),
        (res_name_col, "the residue name data is missing", None),
        (spin_num_col, "the spin number data is missing", "the spin number data '%s' is invalid"),
        (spin_name_col, "the spin name data is missing", None),
        (data_col, "the data is missing", None),
        (error_col, "the error data is missing", None),
    )

    for col, missing_msg, invalid_msg in checks:
        # The column is not in use.
        if not col:
            continue

        # No data in column.
        if len(data) < col:
            return RelaxInvalidSeqError(data, missing_msg)

        # Bad data in column.
        if invalid_msg is not None and not _is_int_or_none_text(data[col-1]):
            return RelaxInvalidSeqError(data, invalid_msg % data[col-1])

    # All checks passed.
    return None
# Create the checking object by wrapping check_sequence_func() in the Check class imported from lib.checks.
check_sequence = Check(check_sequence_func)
def _convert_or_none(text, converter):
    """Convert a column entry, treating the string 'None' as the None object.

    @param text:        The column entry.
    @type text:         str
    @param converter:   The conversion function, for example int or float.
    @type converter:    callable
    @return:            None if the entry is the string 'None', otherwise the converted value.
    @rtype:             None or the return type of the converter
    """

    if text == 'None':
        return None
    return converter(text)


def _text_or_none(line, col):
    """Fetch the text of a name column, treating the string 'None' as the None object.

    @param line:    The elements of a single line of the file.
    @type line:     list of str
    @param col:     The column number, starting from one, or None if the column is not in use.
    @type col:      int or None
    @return:        The column text, or None if the column is not in use or contains the string 'None'.
    @rtype:         str or None
    """

    if col is None:
        return None
    text = line[col-1]
    if text == 'None':
        return None
    return text


def read_spin_data(file=None, dir=None, file_data=None, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, error_col=None, sep=None, spin_id=None, raise_flag=True):
    """Generator function for reading the spin specific data from file.

    Description
    ===========

    This function reads a columnar formatted file where each line corresponds to a spin system.  Spin identification is either through a spin ID string or through columns containing the molecule name, residue name and number, and/or spin name and number.

    Lines which fail the check_sequence() validation, or for which a number, data value or error value cannot be converted, are skipped.  A RelaxWarning is given for each skipped conversion failure.


    @keyword file:          The name of the file to open.
    @type file:             str
    @keyword dir:           The directory containing the file (defaults to the current directory if None).
    @type dir:              str or None
    @keyword file_data:     An alternative to opening a file, if the data already exists in the correct format.  The format is a list of lists where the first index corresponds to the row and the second the column.
    @type file_data:        list of lists
    @keyword spin_id_col:   The column containing the spin ID strings.  If supplied, the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col arguments must be none.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information.  If supplied, spin_id_col must be None.
    @type mol_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information.  If supplied, spin_id_col must be None.
    @type res_num_col:      int or None
    @keyword res_name_col:  The column containing the residue name information.  If supplied, spin_id_col must be None.
    @type res_name_col:     int or None
    @keyword spin_num_col:  The column containing the spin number information.  If supplied, spin_id_col must be None.
    @type spin_num_col:     int or None
    @keyword spin_name_col: The column containing the spin name information.  If supplied, spin_id_col must be None.
    @type spin_name_col:    int or None
    @keyword data_col:      The column containing the data.
    @type data_col:         int or None
    @keyword error_col:     The column containing the errors.
    @type error_col:        int or None
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_id:       The spin ID string used to restrict data loading to a subset of all spins.  NOTE(review): this argument is not used anywhere in the function body.
    @type spin_id:          None or str
    @keyword raise_flag:    A flag which if True will cause a RelaxError to be raised if no data can be found.
    @type raise_flag:       bool
    @raises RelaxError:     If a column argument is zero, if spin_id_col is mixed with the other spin identification columns, or if raise_flag is set and no line yields data.
    @return:                For each valid line, a tuple is yielded consisting of the molecule name, residue number, residue name, spin number, and spin name, followed by the data value (if data_col is given) and then the error value (if error_col is given).
    @rtype:                 tuple of (str or None, int or None, str or None, int or None, str or None [, float or None [, float or None]])
    """

    # Function scope import, so that no new module level import is required.
    from ast import literal_eval

    # Argument tests.
    col_arg_names = ['spin_id_col', 'mol_name_col', 'res_name_col', 'res_num_col', 'spin_name_col', 'spin_num_col', 'data_col', 'error_col']
    col_args = [spin_id_col, mol_name_col, res_name_col, res_num_col, spin_name_col, spin_num_col, data_col, error_col]
    for name, col in zip(col_arg_names, col_args):
        if col == 0:
            raise RelaxError("The '%s' argument cannot be zero, column numbering starts at one." % name)
    if spin_id_col and (mol_name_col or res_name_col or res_num_col or spin_name_col or spin_num_col):
        raise RelaxError("If the 'spin_id_col' argument has been supplied, then the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col must all be set to None.")

    # Minimum number of columns.
    # NOTE(review): the value is never used, but max() of an empty list raises a ValueError when no column argument is supplied.  The statement is kept so that this existing behaviour is unchanged.
    min_col_num = max([col for col in col_args if col])

    # Extract the data from the file.
    if not file_data:
        file_data = extract_data(file, dir, sep=sep)

    # Strip the data of empty lines, and of comments unless spin ID strings are used (presumably because spin IDs can themselves start with '#' - TODO confirm against lib.io.strip).
    if spin_id_col is not None:
        file_data = strip(file_data, comments=False)
    else:
        file_data = strip(file_data)

    # No data!
    if not file_data:
        warn(RelaxFileEmptyWarning(file))
        return

    # Yield the data, spin by spin.
    missing_data = True
    for line in file_data:
        # Remove the quotation marks from quoted spin IDs.  The length is tested first so that short lines are handled by the sequence validation below rather than failing with an IndexError.
        if spin_id_col is not None and len(line) >= spin_id_col and line[spin_id_col-1][:1] in ('"', "'"):
            # The text comes from the file, hence it is only parsed as a literal and never evaluated as code.
            try:
                line[spin_id_col-1] = literal_eval(line[spin_id_col-1])
            except (SyntaxError, ValueError):
                warn(RelaxWarning("Invalid spin ID, skipping the line %s" % line))
                continue

        # Validate the sequence.
        if not check_sequence(line, spin_id_col=spin_id_col, mol_name_col=mol_name_col, res_num_col=res_num_col, res_name_col=res_name_col, spin_num_col=spin_num_col, spin_name_col=spin_name_col, data_col=data_col, error_col=error_col, escalate=1):
            continue

        # Get the spin data from the ID.
        if spin_id_col:
            # Invalid spin ID.
            if line[spin_id_col-1] == '#':
                warn(RelaxWarning("Invalid spin ID, skipping the line %s" % line))
                continue

            mol_name, res_num, res_name, spin_num, spin_name = spin_id_to_data_list(line[spin_id_col-1])

        # Convert the spin data.
        else:
            # The molecule.
            mol_name = _text_or_none(line, mol_name_col)

            # The residue number, catching bad values.
            res_num = None
            if res_num_col is not None:
                try:
                    res_num = _convert_or_none(line[res_num_col-1], int)
                except ValueError:
                    warn(RelaxWarning("Invalid residue number, skipping the line %s" % line))
                    continue

            # The residue name.
            res_name = _text_or_none(line, res_name_col)

            # The spin number, catching bad values.
            spin_num = None
            if spin_num_col is not None:
                try:
                    spin_num = _convert_or_none(line[spin_num_col-1], int)
                except ValueError:
                    warn(RelaxWarning("Invalid spin number, skipping the line %s" % line))
                    continue

            # The spin name.
            spin_name = _text_or_none(line, spin_name_col)

        # Convert the data.
        value = None
        if data_col is not None:
            try:
                value = _convert_or_none(line[data_col-1], float)
            except ValueError:
                warn(RelaxWarning("Invalid data, skipping the line %s" % line))
                continue

            # Only a converted float is tested for NaN and infinity.
            if value is not None and not isFinite(value):
                warn(RelaxWarning("The value is not finite, skipping the line %s" % line))
                continue

        # Convert the errors.
        error = None
        if error_col is not None:
            try:
                error = _convert_or_none(line[error_col-1], float)
            except ValueError:
                warn(RelaxWarning("Invalid errors, skipping the line %s" % line))
                continue

            # Only a converted float is tested for NaN and infinity.
            if error is not None and not isFinite(error):
                warn(RelaxWarning("The error is not finite, skipping the line %s" % line))
                continue

        # Right, data is OK and exists.
        missing_data = False

        # Yield the data.
        if data_col and error_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, value, error
        elif data_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, value
        elif error_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, error
        else:
            yield mol_name, res_num, res_name, spin_num, spin_name

    # Hmmm, no data!
    if raise_flag and missing_data:
        raise RelaxError("No corresponding data could be found within the file.")
362 363
def _format_entry(value, float_format):
    """Format a single data or error value for the output file.

    @param value:           The value to format.
    @type value:            anything
    @param float_format:    The formatting string applied when is_float() reports the value to be a float.
    @type float_format:     str
    @return:                The formatted float, or the repr() of any other value.
    @rtype:                 str
    """

    if is_float(value):
        return float_format % value
    return repr(value)


def write_spin_data(file, dir=None, sep=None, spin_ids=None, mol_names=None, res_nums=None, res_names=None, spin_nums=None, spin_names=None, force=False, data=None, data_name=None, error=None, error_name=None, float_format="%20.15g"):
    """Write the spin specific data to file.

    Description
    ===========

    This function writes a columnar formatted file where each line corresponds to a spin system.  Spin identification is either through a spin ID string or through columns containing the molecule name, residue name and number, and/or spin name and number.

    If there is nothing to write, the function returns without opening the file.


    @param file:            The name of the file to write the data to (or alternatively an already opened file object).
    @type file:             str or file object
    @keyword dir:           The directory to place the file into (defaults to the current directory if None and the file argument is not a file object).
    @type dir:              str or None
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_ids:      The list of spin ID strings.
    @type spin_ids:         None or list of str
    @keyword mol_names:     The list of molecule names.
    @type mol_names:        None or list of str
    @keyword res_nums:      The list of residue numbers.
    @type res_nums:         None or list of int
    @keyword res_names:     The list of residue names.
    @type res_names:        None or list of str
    @keyword spin_nums:     The list of spin numbers.
    @type spin_nums:        None or list of int
    @keyword spin_names:    The list of spin names.
    @type spin_names:       None or list of str
    @keyword force:         A flag which if True will cause an existing file to be overwritten.
    @type force:            bool
    @keyword data:          A list of the data to write out.  The first dimension corresponds to the spins.  A second dimension can also be given if multiple data sets across multiple columns are desired.
    @type data:             list or list of lists
    @keyword data_name:     A name corresponding to the data argument.  If the data argument is a list of lists, then this must also be a list with the same length as the second dimension of the data arg.
    @type data_name:        str or list of str
    @keyword error:         A list of the errors to write out.  The first dimension corresponds to the spins.  A second dimension can also be given if multiple data sets across multiple columns are desired.  These will be inter-dispersed between the data columns, if the data is given.  If the data arg is not None, then this must have the same dimensions as that object.
    @type error:            list or list of lists
    @keyword error_name:    A name corresponding to the error argument.  If the error argument is a list of lists, then this must also be a list with the same length at the second dimension of the error arg.
    @type error_name:       str or list of str
    @keyword float_format:  A float formatting string to use for the data and error whenever a float is found.
    @type float_format:     str
    @raises RelaxError:     If the data, error and name arguments are inconsistent, if the spin identification lists differ in length, or if no spin identification list is given.
    """

    # Data argument tests.
    if data:
        # Data is a list of lists.
        if isinstance(data[0], list):
            # Data and data_name don't match.
            if not isinstance(data_name, list):
                raise RelaxError("The data_name arg '%s' must be a list as the data argument is a list of lists." % data_name)

            # Error doesn't match.
            if error and (len(data) != len(error) or len(data[0]) != len(error[0])):
                raise RelaxError("The data arg:\n%s\n\ndoes not have the same dimensions as the error arg:\n%s." % (data, error))

        # Data is a simple list.
        else:
            # Data and data_name don't match.
            if not isinstance(data_name, str):
                raise RelaxError("The data_name arg '%s' must be a string as the data argument is a simple list." % data_name)

            # Error doesn't match.
            if error and len(data) != len(error):
                raise RelaxError("The data arg:\n%s\n\ndoes not have the same dimensions as the error arg:\n%s." % (data, error))

    # Error argument tests.
    if error:
        # Error is a list of lists.
        if isinstance(error[0], list):
            # Error and error_name don't match.
            if not isinstance(error_name, list):
                raise RelaxError("The error_name arg '%s' must be a list as the error argument is a list of lists." % error_name)

        # Error is a simple list.
        else:
            # Error and error_name don't match.
            if not isinstance(error_name, str):
                raise RelaxError("The error_name arg '%s' must be a string as the error argument is a simple list." % error_name)

    # Number of spins check.
    spin_args = [spin_ids, mol_names, res_nums, res_names, spin_nums, spin_names]
    spin_arg_names = ['spin_ids', 'mol_names', 'res_nums', 'res_names', 'spin_nums', 'spin_names']
    N = None
    first_arg = None
    first_arg_name = None
    for name, arg in zip(spin_arg_names, spin_args):
        if isinstance(arg, list):
            # First list match.
            if N is None:
                N = len(arg)
                first_arg = arg
                first_arg_name = name

            # Length check.
            if len(arg) != N:
                raise RelaxError("The %s and %s arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, name, len(first_arg), len(arg)))

    # Nothing?!?
    if N is None:
        raise RelaxError("No spin ID data is present.")

    # Data and error length check.
    if data and len(data) != N:
        raise RelaxError("The %s and data arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, len(first_arg), len(data)))
    if error and len(error) != N:
        raise RelaxError("The %s and error arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, len(first_arg), len(error)))

    # The column headings for the spin arguments.
    spin_headings = ['spin_id', 'mol_name', 'res_num', 'res_name', 'spin_num', 'spin_name']

    # Headers - the spin ID info.
    headings = [heading for heading, arg in zip(spin_headings, spin_args) if arg]

    # Headers - the data.
    if data:
        # List of lists, with the errors interleaved between the data columns.
        if isinstance(data[0], list):
            for i in range(len(data[0])):
                headings.append(data_name[i])
                if error:
                    headings.append(error_name[i])

        # Simple list.
        else:
            headings.append(data_name)
            if error:
                headings.append(error_name)

    # Headers - only errors.
    elif error:
        # List of lists.
        if isinstance(error[0], list):
            for i in range(len(error[0])):
                headings.append(error_name[i])

        # Simple list.
        else:
            headings.append(error_name)

    # No headings.
    if headings == []:
        headings = None

    # Spin specific data.
    file_data = []
    for spin_index in range(N):
        row = []

        # The spin ID info.
        for arg in spin_args:
            if arg:
                value = arg[spin_index]
                if not isinstance(value, str):
                    value = repr(value)
                row.append(value)

        # The data.
        if data:
            # List of lists, with the errors interleaved between the data columns.
            if isinstance(data[0], list):
                for i in range(len(data[0])):
                    row.append(_format_entry(data[spin_index][i], float_format))
                    if error:
                        row.append(_format_entry(error[spin_index][i], float_format))

            # Simple list.
            else:
                row.append(_format_entry(data[spin_index], float_format))
                if error:
                    row.append(_format_entry(error[spin_index], float_format))

        # Only errors, formatted in the same way as when data is present so that float_format is always respected.
        elif error:
            # List of lists.
            if isinstance(error[0], list):
                for i in range(len(error[0])):
                    row.append(_format_entry(error[spin_index][i], float_format))

            # Simple list.
            else:
                row.append(_format_entry(error[spin_index], float_format))

        file_data.append(row)

    # No data to write, so do nothing!
    if file_data == [] or file_data == [[]]:
        return

    # Open the file for writing.
    # NOTE(review): this assumes open_write_file() opens a new file object when given a file name and hands back the caller's own object otherwise - confirm against lib.io.  Only a file opened here from a name is closed, a file object supplied by the caller is left open.
    opened_here = isinstance(file, str)
    out = open_write_file(file_name=file, dir=dir, force=force)

    # Write out the file data.
    try:
        write_data(out=out, headings=headings, data=file_data, sep=sep)
    finally:
        if opened_here:
            out.close()
587