Package lib :: Module sequence
[hide private]
[frames | no frames]

Source Code for Module lib.sequence

  1  from __future__ import absolute_import 
  2  ############################################################################### 
  3  #                                                                             # 
  4  # Copyright (C) 2003-2004,2006-2009,2011-2015 Edward d'Auvergne               # 
  5  # Copyright (C) 2015 Troels E. Linnet                                         # 
  6  #                                                                             # 
  7  # This file is part of the program relax (http://www.nmr-relax.com).          # 
  8  #                                                                             # 
  9  # This program is free software: you can redistribute it and/or modify        # 
 10  # it under the terms of the GNU General Public License as published by        # 
 11  # the Free Software Foundation, either version 3 of the License, or           # 
 12  # (at your option) any later version.                                         # 
 13  #                                                                             # 
 14  # This program is distributed in the hope that it will be useful,             # 
 15  # but WITHOUT ANY WARRANTY; without even the implied warranty of              # 
 16  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               # 
 17  # GNU General Public License for more details.                                # 
 18  #                                                                             # 
 19  # You should have received a copy of the GNU General Public License           # 
 20  # along with this program.  If not, see <http://www.gnu.org/licenses/>.       # 
 21  #                                                                             # 
 22  ############################################################################### 
 23   
 24  # Module docstring. 
 25  """Module for handling the molecule, residue, and spin sequence data.""" 
 26   
 27  # Python module imports. 
 28  from warnings import warn 
 29   
 30  # relax module imports. 
 31  from lib.checks import Check 
 32  from lib.check_types import is_float 
 33  from lib.errors import RelaxError, RelaxInvalidSeqError 
 34  from lib.float import isFinite 
 35  from lib.io import extract_data, open_write_file, strip, write_data 
 36  from lib.selection import spin_id_to_data_list 
 37  from lib.warnings import RelaxWarning, RelaxFileEmptyWarning 
 38   
 39   
 40  # The 3 letter to 1 letter amino acid code table. 
 41  AA_CODES = { 
 42      "ALA": "A", 
 43      "ARG": "R", 
 44      "ASN": "N", 
 45      "ASP": "D", 
 46      "CYS": "C", 
 47      "GLU": "E", 
 48      "GLN": "Q", 
 49      "GLY": "G", 
 50      "HIS": "H", 
 51      "ILE": "I", 
 52      "LEU": "L", 
 53      "LYS": "K", 
 54      "MET": "M", 
 55      "PHE": "F", 
 56      "PRO": "P", 
 57      "SER": "S", 
 58      "THR": "T", 
 59      "TRP": "W", 
 60      "TYR": "Y", 
 61      "VAL": "V", 
 62  } 
 63   
 64   
def aa_codes_three_to_one(code):
    """Translate a three letter amino acid code into its one letter equivalent.

    The lookup is case insensitive as the code is converted to upper case first.  Any residue absent from the AA_CODES table is translated to '*'.


    @param code:    The three letter amino acid code to translate.
    @type code:     str
    @return:        The matching one letter amino acid code, or '*' for a non-standard residue.
    @rtype:         str
    """

    # Case insensitive table lookup, falling back to '*' when there is no entry.
    return AA_CODES.get(code.upper(), '*')
86 87
def _is_int_or_none_text(text):
    """Determine if the column text is the literal representation of an integer or of None.

    @param text:    The column text to test.
    @type text:     str
    @return:        True if the text parses as a Python integer or None literal, False otherwise.
    @rtype:         bool
    """

    # Function-scope import so that this block does not depend on the module level imports.
    from ast import literal_eval

    # SECURITY:  The text originates from user supplied files, hence it is parsed as a literal rather than being executed with eval().
    try:
        # Leading blanks are removed as older Python versions reject them in literal_eval(), whereas eval() accepted them.
        value = literal_eval(text.lstrip(" \t"))

    # Anything which cannot be parsed (including non-string input) is simply not a valid number.
    except Exception:
        return False

    return value is None or isinstance(value, int)


def check_sequence_func(data, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, error_col=None):
    """Test if the sequence data is valid.

    This function raises nothing itself.  Instead the RelaxInvalidSeqError object describing the first problem encountered is returned, and None is returned if the data is valid.  The column numbering starts at one, and a column argument of None (or zero) disables the corresponding check.


    @param data:            The sequence data of a single line, i.e. the list of the column text.
    @type data:             list of str
    @keyword spin_id_col:   The column containing the spin ID strings.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information.
    @type mol_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information.  The text must be an integer or 'None'.
    @type res_num_col:      int or None
    @keyword res_name_col:  The column containing the residue name information.
    @type res_name_col:     int or None
    @keyword spin_num_col:  The column containing the spin number information.  The text must be an integer or 'None'.
    @type spin_num_col:     int or None
    @keyword spin_name_col: The column containing the spin name information.
    @type spin_name_col:    int or None
    @keyword data_col:      The column containing the data.
    @type data_col:         int or None
    @keyword error_col:     The column containing the errors.
    @type error_col:        int or None
    @return:                The error object for the first invalid column, or None if all checks pass.
    @rtype:                 RelaxInvalidSeqError instance or None
    """

    # The checks in the order they are performed:  the column, the missing data message, and the invalid data message (None if the column content is not validated).
    checks = [
        (spin_id_col, "the Spin ID data is missing", None),
        (mol_name_col, "the molecule name data is missing", None),
        (res_num_col, "the residue number data is missing", "the residue number data '%s' is invalid"),
        (res_name_col, "the residue name data is missing", None),
        (spin_num_col, "the spin number data is missing", "the spin number data '%s' is invalid"),
        (spin_name_col, "the spin name data is missing", None),
        (data_col, "the data is missing", None),
        (error_col, "the error data is missing", None),
    ]

    for col, missing_msg, invalid_msg in checks:
        # The column is not in use.
        if not col:
            continue

        # No data in column.
        if len(data) < col:
            return RelaxInvalidSeqError(data, missing_msg)

        # Bad data in column (only the number columns are validated).
        if invalid_msg is not None and not _is_int_or_none_text(data[col-1]):
            return RelaxInvalidSeqError(data, invalid_msg % data[col-1])

    # All checks passed.
    return None
# Create the checking object.  This wraps check_sequence_func() in a lib.checks.Check instance so that the check can be called as check_sequence(line, ..., escalate=N), as done in read_spin_data() where the truth value of the result decides if a line is skipped.
# NOTE(review):  The exact meaning of the escalate levels is defined by lib.checks.Check - confirm there.
check_sequence = Check(check_sequence_func)
def _column_text(line, col):
    """Return the text of the given column, converting the absence of information into None.

    @param line:    The list of column text for a single line of the file.
    @type line:     list of str
    @param col:     The column number (starting at one), or None if the column is not used.
    @type col:      int or None
    @return:        The column text, or None if the column is unused or contains the text 'None'.
    @rtype:         str or None
    """

    # Unused column.
    if col is None:
        return None

    # The text 'None' encodes missing information.
    text = line[col-1]
    if text == 'None':
        return None
    return text


def _column_number(line, col, converter):
    """Convert the text of the given column to a number.

    @param line:        The list of column text for a single line of the file.
    @type line:         list of str
    @param col:         The column number (starting at one), or None if the column is not used.
    @type col:          int or None
    @param converter:   The conversion function, e.g. int or float.
    @type converter:    callable
    @return:            The converted number, or None if the column is unused or contains the text 'None'.
    @rtype:             number or None
    @raises ValueError: If the converter rejects the column text.
    """

    text = _column_text(line, col)
    if text is None:
        return None
    return converter(text)


def read_spin_data(file=None, dir=None, file_data=None, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, error_col=None, sep=None, spin_id=None, raise_flag=True):
    """Generator function for reading the spin specific data from file.

    Description
    ===========

    This function reads a columnar formatted file where each line corresponds to a spin system.  Spin identification is either through a spin ID string or through columns containing the molecule name, residue name and number, and/or spin name and number.

    Lines failing the sequence validation, or containing numbers which cannot be converted or which are not finite, are skipped with a RelaxWarning.  As this is a generator, all argument checks are only performed once the iteration starts.


    @keyword file:          The name of the file to open.
    @type file:             str
    @keyword dir:           The directory containing the file (defaults to the current directory if None).
    @type dir:              str or None
    @keyword file_data:     An alternative to opening a file, if the data already exists in the correct format.  The format is a list of lists where the first index corresponds to the row and the second the column.  Note that quoted spin IDs are unquoted in place within these rows.
    @type file_data:        list of lists
    @keyword spin_id_col:   The column containing the spin ID strings.  If supplied, the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col arguments must be none.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information.  If supplied, spin_id_col must be None.
    @type mol_name_col:     int or None
    @keyword res_name_col:  The column containing the residue name information.  If supplied, spin_id_col must be None.
    @type res_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information.  If supplied, spin_id_col must be None.
    @type res_num_col:      int or None
    @keyword spin_name_col: The column containing the spin name information.  If supplied, spin_id_col must be None.
    @type spin_name_col:    int or None
    @keyword spin_num_col:  The column containing the spin number information.  If supplied, spin_id_col must be None.
    @type spin_num_col:     int or None
    @keyword data_col:      The column containing the data.
    @type data_col:         int or None
    @keyword error_col:     The column containing the errors.
    @type error_col:        int or None
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_id:       The spin ID string intended to restrict data loading to a subset of all spins.  This argument is currently not used within this function.
    @type spin_id:          None or str
    @keyword raise_flag:    A flag which if True will cause a RelaxError to be raised if no data can be found.
    @type raise_flag:       bool
    @raises RelaxError:     If a column argument is zero, if spin_id_col is mixed with the other sequence columns, if no column argument is supplied, or if raise_flag is set and no valid line is found.
    @return:                The spin specific data is yielded line by line.  This consists of the molecule name, residue number, residue name, spin number, and spin name, followed by the data value (if data_col is given) and the error value (if error_col is given).
    @rtype:                 tuple of (str or None, int or None, str or None, int or None, str or None), optionally extended by one or two float or None values
    """

    # Function-scope import so that this block does not depend on the module level imports.
    from ast import literal_eval

    # Argument tests.
    col_args = [spin_id_col, mol_name_col, res_name_col, res_num_col, spin_name_col, spin_num_col, data_col, error_col]
    col_arg_names = ['spin_id_col', 'mol_name_col', 'res_name_col', 'res_num_col', 'spin_name_col', 'spin_num_col', 'data_col', 'error_col']
    for col_arg, col_arg_name in zip(col_args, col_arg_names):
        if col_arg == 0:
            raise RelaxError("The '%s' argument cannot be zero, column numbering starts at one." % col_arg_name)
    if spin_id_col and (mol_name_col or res_name_col or res_num_col or spin_name_col or spin_num_col):
        raise RelaxError("If the 'spin_id_col' argument has been supplied, then the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col must all be set to None.")

    # At least one column is required (this used to fail with an obscure ValueError from max() on an empty list).
    if not any(col_args):
        raise RelaxError("No column arguments have been supplied, at least one is required.")

    # Extract the data from the file.
    if not file_data:
        file_data = extract_data(file, dir, sep=sep)

    # Strip the data of all comments and empty lines (the comments are kept when spin IDs are read, presumably as the ID strings can start with the '#' comment character).
    if spin_id_col is not None:
        file_data = strip(file_data, comments=False)
    else:
        file_data = strip(file_data)

    # No data!
    if not file_data:
        warn(RelaxFileEmptyWarning(file))
        return

    # Yield the data, spin by spin.
    missing_data = True
    for line in file_data:
        # Unquote the spin IDs.  Lines which are too short are left for the sequence validation below to report.
        if spin_id_col is not None and len(line) >= spin_id_col:
            id_text = line[spin_id_col-1]
            if id_text[:1] in ("\"", "\'"):
                # SECURITY:  The text comes from the file, hence it is parsed as a literal rather than being executed with eval().
                try:
                    line[spin_id_col-1] = literal_eval(id_text)
                except (ValueError, SyntaxError):
                    warn(RelaxWarning("Invalid spin ID, skipping the line %s" % line))
                    continue

        # Validate the sequence.
        if not check_sequence(line, spin_id_col=spin_id_col, mol_name_col=mol_name_col, res_num_col=res_num_col, res_name_col=res_name_col, spin_num_col=spin_num_col, spin_name_col=spin_name_col, data_col=data_col, error_col=error_col, escalate=1):
            continue

        # Get the spin data from the ID.
        if spin_id_col:
            # Invalid spin ID.
            if line[spin_id_col-1] == '#':
                warn(RelaxWarning("Invalid spin ID, skipping the line %s" % line))
                continue

            mol_name, res_num, res_name, spin_num, spin_name = spin_id_to_data_list(line[spin_id_col-1])

        # Convert the spin data.
        else:
            # The molecule.
            mol_name = _column_text(line, mol_name_col)

            # The residue number, catching bad values.
            try:
                res_num = _column_number(line, res_num_col, int)
            except ValueError:
                warn(RelaxWarning("Invalid residue number, skipping the line %s" % line))
                continue

            # The residue name.
            res_name = _column_text(line, res_name_col)

            # The spin number, catching bad values.
            try:
                spin_num = _column_number(line, spin_num_col, int)
            except ValueError:
                warn(RelaxWarning("Invalid spin number, skipping the line %s" % line))
                continue

            # The spin name.
            spin_name = _column_text(line, spin_name_col)

        # Convert the data, catching bad values.
        try:
            value = _column_number(line, data_col, float)
        except ValueError:
            warn(RelaxWarning("Invalid data, skipping the line %s" % line))
            continue

        # Reject nan and infinite values.
        if value is not None and not isFinite(value):
            warn(RelaxWarning("The value is not finite, skipping the line %s" % line))
            continue

        # Convert the errors, catching bad values.
        try:
            error = _column_number(line, error_col, float)
        except ValueError:
            warn(RelaxWarning("Invalid errors, skipping the line %s" % line))
            continue

        # Reject nan and infinite errors.
        if error is not None and not isFinite(error):
            warn(RelaxWarning("The error is not finite, skipping the line %s" % line))
            continue

        # Right, data is OK and exists.
        missing_data = False

        # Yield the data.
        if data_col and error_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, value, error
        elif data_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, value
        elif error_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, error
        else:
            yield mol_name, res_num, res_name, spin_num, spin_name

    # Hmmm, no data!
    if raise_flag and missing_data:
        raise RelaxError("No corresponding data could be found within the file.")
363 364
def _format_entry(value, float_format):
    """Convert a single data or error value into the text to write to the file.

    @param value:           The value to convert.
    @type value:            anything
    @param float_format:    The formatting string applied to values which is_float() accepts.
    @type float_format:     str
    @return:                The formatted float, or the repr() of any other value.
    @rtype:                 str
    """

    if is_float(value):
        return float_format % value
    return repr(value)


def write_spin_data(file, dir=None, sep=None, spin_ids=None, mol_names=None, res_nums=None, res_names=None, spin_nums=None, spin_names=None, force=False, data=None, data_name=None, error=None, error_name=None, float_format="%20.15g"):
    """Write the spin specific data to file.

    Description
    ===========

    This function writes a columnar formatted file where each line corresponds to a spin system.  Spin identification is either through a spin ID string or through columns containing the molecule name, residue name and number, and/or spin name and number.

    Nothing is written, and no file is opened, if there are no rows or columns to write.  If the file argument is a file name, the file opened by this function is closed once the data has been written.  An already opened file object is left open for the caller.


    @param file:            The name of the file to write the data to (or alternatively an already opened file object).
    @type file:             str or file object
    @keyword dir:           The directory to place the file into (defaults to the current directory if None and the file argument is not a file object).
    @type dir:              str or None
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_ids:      The list of spin ID strings.
    @type spin_ids:         None or list of str
    @keyword mol_names:     The list of molecule names.
    @type mol_names:        None or list of str
    @keyword res_nums:      The list of residue numbers.
    @type res_nums:         None or list of int
    @keyword res_names:     The list of residue names.
    @type res_names:        None or list of str
    @keyword spin_nums:     The list of spin numbers.
    @type spin_nums:        None or list of int
    @keyword spin_names:    The list of spin names.
    @type spin_names:       None or list of str
    @keyword force:         A flag which if True will cause an existing file to be overwritten.
    @type force:            bool
    @keyword data:          A list of the data to write out.  The first dimension corresponds to the spins.  A second dimension can also be given if multiple data sets across multiple columns are desired.
    @type data:             list or list of lists
    @keyword data_name:     A name corresponding to the data argument.  If the data argument is a list of lists, then this must also be a list with the same length as the second dimension of the data arg.
    @type data_name:        str or list of str
    @keyword error:         A list of the errors to write out.  The first dimension corresponds to the spins.  A second dimension can also be given if multiple data sets across multiple columns are desired.  These will be interspersed between the data columns, if the data is given.  If the data arg is not None, then this must have the same dimensions as that object.
    @type error:            list or list of lists
    @keyword error_name:    A name corresponding to the error argument.  If the error argument is a list of lists, then this must also be a list with the same length as the second dimension of the error arg.
    @type error_name:       str or list of str
    @keyword float_format:  A float formatting string to use for the data and error whenever a float is found.
    @type float_format:     str
    @raises RelaxError:     If the names do not match the structure of the data or error arguments, if the arguments have inconsistent dimensions, or if no spin identification list is supplied.
    """

    # Data argument tests.
    if data:
        # Data is a list of lists.
        if isinstance(data[0], list):
            # Data and data_name don't match.
            if not isinstance(data_name, list):
                raise RelaxError("The data_name arg '%s' must be a list as the data argument is a list of lists." % data_name)

            # Error doesn't match.
            if error and (len(data) != len(error) or len(data[0]) != len(error[0])):
                raise RelaxError("The data arg:\n%s\n\ndoes not have the same dimensions as the error arg:\n%s." % (data, error))

        # Data is a simple list.
        else:
            # Data and data_name don't match.
            if not isinstance(data_name, str):
                raise RelaxError("The data_name arg '%s' must be a string as the data argument is a simple list." % data_name)

            # Error doesn't match.
            if error and len(data) != len(error):
                raise RelaxError("The data arg:\n%s\n\ndoes not have the same dimensions as the error arg:\n%s." % (data, error))

    # Error argument tests.
    if error:
        # Error is a list of lists.
        if isinstance(error[0], list):
            # Error and error_name don't match.
            if not isinstance(error_name, list):
                raise RelaxError("The error_name arg '%s' must be a list as the error argument is a list of lists." % error_name)

        # Error is a simple list.
        else:
            # Error and error_name don't match.
            if not isinstance(error_name, str):
                raise RelaxError("The error_name arg '%s' must be a string as the error argument is a simple list." % error_name)

    # Number of spins check (the first list found defines the number of spins N).
    args = [spin_ids, mol_names, res_nums, res_names, spin_nums, spin_names]
    arg_names = ['spin_ids', 'mol_names', 'res_nums', 'res_names', 'spin_nums', 'spin_names']
    N = None
    first_arg = None
    first_arg_name = None
    for arg, arg_name in zip(args, arg_names):
        if isinstance(arg, list):
            # First list match.
            if N is None:
                N = len(arg)
                first_arg = arg
                first_arg_name = arg_name

            # Length check.
            if len(arg) != N:
                raise RelaxError("The %s and %s arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, arg_name, len(first_arg), len(arg)))

    # Nothing?!?
    if N is None:
        raise RelaxError("No spin ID data is present.")

    # Data and error length check.
    if data and len(data) != N:
        raise RelaxError("The %s and data arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, len(first_arg), len(data)))
    if error and len(error) != N:
        raise RelaxError("The %s and error arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, len(first_arg), len(error)))

    # The column headings for the spin arguments (singular, in contrast to the argument names above).
    heading_names = ['spin_id', 'mol_name', 'res_num', 'res_name', 'spin_num', 'spin_name']

    # Headers - the spin ID info.
    headings = [name for arg, name in zip(args, heading_names) if arg]

    # Headers - the data.
    if data:
        # List of lists, with the errors placed after each data column.
        if isinstance(data[0], list):
            for i in range(len(data[0])):
                headings.append(data_name[i])
                if error:
                    headings.append(error_name[i])

        # Simple list.
        else:
            headings.append(data_name)
            if error:
                headings.append(error_name)

    # Headers - only errors.
    elif error:
        # List of lists.
        if isinstance(error[0], list):
            for i in range(len(error[0])):
                headings.append(error_name[i])

        # Simple list.
        else:
            headings.append(error_name)

    # No headings.
    if not headings:
        headings = None

    # Spin specific data.
    file_data = []
    for spin_index in range(N):
        row = []

        # The spin ID info.
        for arg in args:
            if arg:
                value = arg[spin_index]
                if not isinstance(value, str):
                    value = repr(value)
                row.append(value)

        # The data.
        if data:
            # List of lists.
            if isinstance(data[0], list):
                for i in range(len(data[0])):
                    row.append(_format_entry(data[spin_index][i], float_format))
                    if error:
                        row.append(_format_entry(error[spin_index][i], float_format))

            # Simple list.
            else:
                row.append(_format_entry(data[spin_index], float_format))
                if error:
                    row.append(_format_entry(error[spin_index], float_format))

        # Only errors (the float format is applied here as well, matching the documented float_format contract and the data branch above).
        elif error:
            # List of lists.
            if isinstance(error[0], list):
                for i in range(len(error[0])):
                    row.append(_format_entry(error[spin_index][i], float_format))

            # Simple list.
            else:
                row.append(_format_entry(error[spin_index], float_format))

        file_data.append(row)

    # No data to write, so do nothing!
    if file_data == [] or file_data == [[]]:
        return

    # Only a file opened here from a file name is closed afterwards, an already opened file object belongs to the caller.
    # NOTE(review):  This assumes that open_write_file() returns a new file-like object when given a file name - confirm in lib.io.
    opened_here = isinstance(file, str)

    # Open the file for writing.
    out = open_write_file(file_name=file, dir=dir, force=force)

    # Write out the file data.
    try:
        write_data(out=out, headings=headings, data=file_data, sep=sep)
    finally:
        if opened_here and hasattr(out, 'close'):
            out.close()
588