Package lib :: Module sequence
[hide private]
[frames | no frames]

Source Code for Module lib.sequence

  1  from __future__ import absolute_import 
  2  ############################################################################### 
  3  #                                                                             # 
  4  # Copyright (C) 2003-2015 Edward d'Auvergne                                   # 
  5  #                                                                             # 
  6  # This file is part of the program relax (http://www.nmr-relax.com).          # 
  7  #                                                                             # 
  8  # This program is free software: you can redistribute it and/or modify        # 
  9  # it under the terms of the GNU General Public License as published by        # 
 10  # the Free Software Foundation, either version 3 of the License, or           # 
 11  # (at your option) any later version.                                         # 
 12  #                                                                             # 
 13  # This program is distributed in the hope that it will be useful,             # 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of              # 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               # 
 16  # GNU General Public License for more details.                                # 
 17  #                                                                             # 
 18  # You should have received a copy of the GNU General Public License           # 
 19  # along with this program.  If not, see <http://www.gnu.org/licenses/>.       # 
 20  #                                                                             # 
 21  ############################################################################### 
 22   
 23  # Module docstring. 
 24  """Module for handling the molecule, residue, and spin sequence data.""" 
 25   
 26  # Python module imports. 
 27  import sys 
 28  from warnings import warn 
 29   
 30  # relax module imports. 
 31  from lib.checks import Check 
 32  from lib.check_types import is_float 
 33  from lib.errors import RelaxError, RelaxInvalidSeqError 
 34  from lib.io import extract_data, open_write_file, strip, write_data 
 35  from lib.selection import spin_id_to_data_list 
 36  from lib.warnings import RelaxWarning, RelaxFileEmptyWarning 
 37   
 38   
 39  # The 3 letter to 1 letter amino acid code table. 
 40  AA_CODES = { 
 41      "ALA": "A", 
 42      "ARG": "R", 
 43      "ASN": "N", 
 44      "ASP": "D", 
 45      "CYS": "C", 
 46      "GLU": "E", 
 47      "GLN": "Q", 
 48      "GLY": "G", 
 49      "HIS": "H", 
 50      "ILE": "I", 
 51      "LEU": "L", 
 52      "LYS": "K", 
 53      "MET": "M", 
 54      "PHE": "F", 
 55      "PRO": "P", 
 56      "SER": "S", 
 57      "THR": "T", 
 58      "TRP": "W", 
 59      "TYR": "Y", 
 60      "VAL": "V", 
 61  } 
 62   
 63   
def aa_codes_three_to_one(code):
    """Translate a three letter amino acid code into its one letter equivalent.

    The lookup is case insensitive.  Any residue absent from the AA_CODES table is translated to '*'.


    @param code:    The three letter amino acid code to convert.
    @type code:     str
    @return:        The corresponding one letter amino acid code, or '*'.
    @rtype:         str
    """

    # Single table lookup on the upper case code, with '*' as the fallback for non-standard residues.
    return AA_CODES.get(code.upper(), '*')
85 86
def _is_int_or_none(text):
    """Determine if the text of a column evaluates to an integer or to None.

    @param text:    The text of a single column.
    @type text:     str
    @return:        True if the text evaluates to an int or to None, False if it evaluates to anything else or cannot be evaluated at all.
    @rtype:         bool
    """

    # NOTE(review):  eval() executes arbitrary expressions taken from the input data.  It is kept for backwards compatibility, but a safe literal parser should be considered if the input can be untrusted.
    try:
        value = eval(text)
    except Exception:
        return False

    return value is None or isinstance(value, int)


def check_sequence_func(data, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, error_col=None):
    """Test if the sequence data is valid.

    This function does not raise anything itself.  The first problem found is returned as a RelaxInvalidSeqError object, and None is returned if no problem is found.  The columns are checked in the order of spin ID, molecule name, residue number, residue name, spin number, spin name, data, and error.  All column numbers start at one, and columns set to None are not checked.


    @param data:            The sequence data, as the list of column values of a single line.
    @type data:             list of str
    @keyword spin_id_col:   The column containing the spin ID strings.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information.
    @type mol_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information.  The text must evaluate to an integer or None.
    @type res_num_col:      int or None
    @keyword res_name_col:  The column containing the residue name information.
    @type res_name_col:     int or None
    @keyword spin_num_col:  The column containing the spin number information.  The text must evaluate to an integer or None.
    @type spin_num_col:     int or None
    @keyword spin_name_col: The column containing the spin name information.
    @type spin_name_col:    int or None
    @keyword data_col:      The column containing the data.
    @type data_col:         int or None
    @keyword error_col:     The column containing the errors.
    @type error_col:        int or None
    @return:                The error object describing the first problem found, or None if the data is valid.
    @rtype:                 RelaxInvalidSeqError instance or None
    """

    # The columns in checking order, as (column number, description used in the error text, integer validation flag).
    columns = (
        (spin_id_col, "Spin ID data", False),
        (mol_name_col, "molecule name data", False),
        (res_num_col, "residue number data", True),
        (res_name_col, "residue name data", False),
        (spin_num_col, "spin number data", True),
        (spin_name_col, "spin name data", False),
        (data_col, "data", False),
        (error_col, "error data", False),
    )

    for col, label, is_number in columns:
        # The column is not in use.
        if not col:
            continue

        # No data in column.
        if len(data) < col:
            return RelaxInvalidSeqError(data, "the %s is missing" % label)

        # Bad data in column.
        if is_number and not _is_int_or_none(data[col-1]):
            return RelaxInvalidSeqError(data, "the %s '%s' is invalid" % (label, data[col-1]))

    # The data is valid.
    return None
# Create the checking object by wrapping check_sequence_func() in the Check class imported from lib.checks.  Within this module, read_spin_data() calls this object with the arguments of check_sequence_func() plus an 'escalate' keyword argument.
check_sequence = Check(check_sequence_func)
def _convert_column(line, col, converter=None):
    """Extract and convert the value of a single column of a line.

    @param line:        The column values of a single line.
    @type line:         list of str
    @param col:         The column number, starting from one, or None if the column is not in use.
    @type col:          int or None
    @keyword converter: The function used to convert the column text, for example int or float.  If None, the text is returned unmodified.
    @type converter:    callable or None
    @return:            The converted value, or None if the column is not in use or if its text is 'None'.
    @rtype:             anything returned by the converter, str, or None
    """

    # Unused columns and the literal text 'None' both map to None.
    if col is None or line[col-1] == 'None':
        return None

    # Plain text.
    if converter is None:
        return line[col-1]

    # Converted text (any exception from the converter is left for the caller to handle).
    return converter(line[col-1])


def read_spin_data(file=None, dir=None, file_data=None, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, error_col=None, sep=None, spin_id=None, raise_flag=True):
    """Generator function for reading the spin specific data from file.

    Description
    ===========

    This function reads a columnar formatted file where each line corresponds to a spin system.  Spin identification is either through a spin ID string or through columns containing the molecule name, residue name and number, and/or spin name and number.

    Lines rejected by check_sequence() are skipped.  Lines with a spin ID of '#', or with residue number, spin number, data or error text which cannot be converted, are skipped with a RelaxWarning.  As this is a generator, the argument checks are only performed once iteration starts.


    @keyword file:          The name of the file to open.
    @type file:             str
    @keyword dir:           The directory containing the file (defaults to the current directory if None).
    @type dir:              str or None
    @keyword file_data:     An alternative to opening a file, if the data already exists in the correct format.  The format is a list of lists where the first index corresponds to the row and the second the column.  Note that quoted spin ID strings are unquoted in place within the rows.
    @type file_data:        list of lists
    @keyword spin_id_col:   The column containing the spin ID strings.  If supplied, the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col arguments must be None.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information.  If supplied, spin_id_col must be None.
    @type mol_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information.  If supplied, spin_id_col must be None.
    @type res_num_col:      int or None
    @keyword res_name_col:  The column containing the residue name information.  If supplied, spin_id_col must be None.
    @type res_name_col:     int or None
    @keyword spin_num_col:  The column containing the spin number information.  If supplied, spin_id_col must be None.
    @type spin_num_col:     int or None
    @keyword spin_name_col: The column containing the spin name information.  If supplied, spin_id_col must be None.
    @type spin_name_col:    int or None
    @keyword data_col:      The column containing the data.
    @type data_col:         int or None
    @keyword error_col:     The column containing the errors.
    @type error_col:        int or None
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_id:       The spin ID string used to restrict data loading to a subset of all spins.  This argument is accepted but not used within this function.
    @type spin_id:          None or str
    @keyword raise_flag:    A flag which if True will cause a RelaxError to be raised if no data can be found.
    @type raise_flag:       bool
    @raises RelaxError:     If a column argument is zero, if spin_id_col is combined with the other spin identification columns, if no column argument is supplied, or if raise_flag is set and no line yielded any data.
    @return:                The spin specific data is yielded line by line.  This consists of the molecule name, residue number, residue name, spin number, and spin name, followed by the data value (if data_col is given) and then the error value (if error_col is given).  Data and error values are floats or None.
    @rtype:                 tuple of 5, 6 or 7 elements
    """

    # Argument tests.
    col_args = [spin_id_col, mol_name_col, res_name_col, res_num_col, spin_name_col, spin_num_col, data_col, error_col]
    col_arg_names = ['spin_id_col', 'mol_name_col', 'res_name_col', 'res_num_col', 'spin_name_col', 'spin_num_col', 'data_col', 'error_col']
    for name, col in zip(col_arg_names, col_args):
        if col == 0:
            raise RelaxError("The '%s' argument cannot be zero, column numbering starts at one." % name)
    if spin_id_col and (mol_name_col or res_name_col or res_num_col or spin_name_col or spin_num_col):
        raise RelaxError("If the 'spin_id_col' argument has been supplied, then the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col must all be set to None.")

    # At least one column is required (this replaces an unused max() over the column numbers, which failed with an opaque ValueError in this case).
    if not any(col_args):
        raise RelaxError("No column arguments have been supplied, at least one column must be specified.")

    # Extract the data from the file.
    if not file_data:
        file_data = extract_data(file, dir, sep=sep)

    # Strip the data of all comments and empty lines (comments are kept when spin IDs are read, presumably as spin IDs can contain the '#' character).
    if spin_id_col is not None:
        file_data = strip(file_data, comments=False)
    else:
        file_data = strip(file_data)

    # No data!
    if not file_data:
        warn(RelaxFileEmptyWarning(file))
        return

    # Yield the data, spin by spin.
    missing_data = True
    for line in file_data:
        # Unquote the spin IDs.  The length and empty text guards are needed as the line has not yet been validated.
        if spin_id_col is not None and len(line) >= spin_id_col and line[spin_id_col-1][:1] in ('"', "'"):
            # NOTE(review):  eval() executes arbitrary expressions taken from the input data.  It is kept for backwards compatibility, but a safe literal parser should be considered if the input can be untrusted.
            line[spin_id_col-1] = eval(line[spin_id_col-1])

        # Validate the sequence.
        if not check_sequence(line, spin_id_col=spin_id_col, mol_name_col=mol_name_col, res_num_col=res_num_col, res_name_col=res_name_col, spin_num_col=spin_num_col, spin_name_col=spin_name_col, data_col=data_col, error_col=error_col, escalate=1):
            continue

        # Get the spin data from the ID.
        if spin_id_col:
            # Invalid spin ID.
            if line[spin_id_col-1] == '#':
                warn(RelaxWarning("Invalid spin ID, skipping the line %s" % line))
                continue

            mol_name, res_num, res_name, spin_num, spin_name = spin_id_to_data_list(line[spin_id_col-1])

        # Convert the spin data.
        else:
            # The molecule.
            mol_name = _convert_column(line, mol_name_col)

            # The residue number, catching bad values.
            try:
                res_num = _convert_column(line, res_num_col, int)
            except ValueError:
                warn(RelaxWarning("Invalid residue number, skipping the line %s" % line))
                continue

            # The residue name.
            res_name = _convert_column(line, res_name_col)

            # The spin number, catching bad values.
            try:
                spin_num = _convert_column(line, spin_num_col, int)
            except ValueError:
                warn(RelaxWarning("Invalid spin number, skipping the line %s" % line))
                continue

            # The spin name.
            spin_name = _convert_column(line, spin_name_col)

        # Convert the data, catching bad values.
        try:
            value = _convert_column(line, data_col, float)
        except ValueError:
            warn(RelaxWarning("Invalid data, skipping the line %s" % line))
            continue

        # Convert the errors, catching bad values.
        try:
            error = _convert_column(line, error_col, float)
        except ValueError:
            warn(RelaxWarning("Invalid errors, skipping the line %s" % line))
            continue

        # Right, data is OK and exists.
        missing_data = False

        # Yield the data.
        if data_col and error_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, value, error
        elif data_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, value
        elif error_col:
            yield mol_name, res_num, res_name, spin_num, spin_name, error
        else:
            yield mol_name, res_num, res_name, spin_num, spin_name

    # Hmmm, no data!
    if raise_flag and missing_data:
        raise RelaxError("No corresponding data could be found within the file.")
352 353
def _format_value(value, float_format):
    """Convert a single data or error value into the text to write out.

    @param value:           The value to format.
    @type value:            anything
    @param float_format:    The formatting string applied to values for which is_float() is true.
    @type float_format:     str
    @return:                The formatted float, or the repr() of any other value.
    @rtype:                 str
    """

    if is_float(value):
        return float_format % value
    return repr(value)


def write_spin_data(file, dir=None, sep=None, spin_ids=None, mol_names=None, res_nums=None, res_names=None, spin_nums=None, spin_names=None, force=False, data=None, data_name=None, error=None, error_name=None, float_format="%20.15g"):
    """Write the spin specific data to file.

    Description
    ===========

    This function writes a columnar formatted file where each line corresponds to a spin system.  Spin identification is either through a spin ID string or through columns containing the molecule name, residue name and number, and/or spin name and number.

    If there are no spins to write, the function returns without opening the file.


    @param file:            The name of the file to write the data to (or alternatively an already opened file object).
    @type file:             str or file object
    @keyword dir:           The directory to place the file into (defaults to the current directory if None and the file argument is not a file object).
    @type dir:              str or None
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_ids:      The list of spin ID strings.
    @type spin_ids:         None or list of str
    @keyword mol_names:     The list of molecule names.
    @type mol_names:        None or list of str
    @keyword res_nums:      The list of residue numbers.
    @type res_nums:         None or list of int
    @keyword res_names:     The list of residue names.
    @type res_names:        None or list of str
    @keyword spin_nums:     The list of spin numbers.
    @type spin_nums:        None or list of int
    @keyword spin_names:    The list of spin names.
    @type spin_names:       None or list of str
    @keyword force:         A flag which if True will cause an existing file to be overwritten.
    @type force:            bool
    @keyword data:          A list of the data to write out.  The first dimension corresponds to the spins.  A second dimension can also be given if multiple data sets across multiple columns are desired.
    @type data:             list or list of lists
    @keyword data_name:     A name corresponding to the data argument.  If the data argument is a list of lists, then this must also be a list with the same length as the second dimension of the data arg.
    @type data_name:        str or list of str
    @keyword error:         A list of the errors to write out.  The first dimension corresponds to the spins.  A second dimension can also be given if multiple data sets across multiple columns are desired.  These will be inter-dispersed between the data columns, if the data is given.  If the data arg is not None, then this must have the same dimensions as that object.
    @type error:            list or list of lists
    @keyword error_name:    A name corresponding to the error argument.  If the error argument is a list of lists, then this must also be a list with the same length as the second dimension of the error arg.
    @type error_name:       str or list of str
    @keyword float_format:  A float formatting string to use for the data and error whenever a float is found.
    @type float_format:     str
    @raises RelaxError:     If the data_name or error_name types do not match the data or error structure, if the argument dimensions are inconsistent, or if no spin identification list is supplied.
    """

    # Data argument tests.
    if data:
        # Data is a list of lists.
        if isinstance(data[0], list):
            # Data and data_name don't match.
            if not isinstance(data_name, list):
                raise RelaxError("The data_name arg '%s' must be a list as the data argument is a list of lists." % data_name)

            # Error doesn't match.
            if error and (len(data) != len(error) or len(data[0]) != len(error[0])):
                raise RelaxError("The data arg:\n%s\n\ndoes not have the same dimensions as the error arg:\n%s." % (data, error))

        # Data is a simple list.
        else:
            # Data and data_name don't match.
            if not isinstance(data_name, str):
                raise RelaxError("The data_name arg '%s' must be a string as the data argument is a simple list." % data_name)

            # Error doesn't match.
            if error and len(data) != len(error):
                raise RelaxError("The data arg:\n%s\n\ndoes not have the same dimensions as the error arg:\n%s." % (data, error))

    # Error argument tests.
    if error:
        # Error is a list of lists.
        if isinstance(error[0], list):
            # Error and error_name don't match.
            if not isinstance(error_name, list):
                raise RelaxError("The error_name arg '%s' must be a list as the error argument is a list of lists." % error_name)

        # Error is a simple list.
        else:
            # Error and error_name don't match.
            if not isinstance(error_name, str):
                raise RelaxError("The error_name arg '%s' must be a string as the error argument is a simple list." % error_name)

    # The spin arguments, together with the plural names used in the error messages and the singular names used as column headings.
    args = [spin_ids, mol_names, res_nums, res_names, spin_nums, spin_names]
    arg_names = ['spin_ids', 'mol_names', 'res_nums', 'res_names', 'spin_nums', 'spin_names']
    heading_names = ['spin_id', 'mol_name', 'res_num', 'res_name', 'spin_num', 'spin_name']

    # Number of spins check.
    N = None
    first_arg = None
    first_arg_name = None
    for name, arg in zip(arg_names, args):
        if isinstance(arg, list):
            # First list match.
            if N is None:
                N = len(arg)
                first_arg = arg
                first_arg_name = name

            # Length check.
            if len(arg) != N:
                raise RelaxError("The %s and %s arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, name, len(first_arg), len(arg)))

    # Nothing?!?
    if N is None:
        raise RelaxError("No spin ID data is present.")

    # Data and error length check.
    if data and len(data) != N:
        raise RelaxError("The %s and data arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, len(first_arg), len(data)))
    if error and len(error) != N:
        raise RelaxError("The %s and error arguments do not have the same number of spins ('%s' vs. '%s' respectively)." % (first_arg_name, len(first_arg), len(error)))

    # Headers - the spin ID info.
    headings = [name for name, arg in zip(heading_names, args) if arg]

    # Headers - the data.
    if data:
        # List of lists, with the error columns interleaved with the data columns.
        if isinstance(data[0], list):
            for i in range(len(data[0])):
                headings.append(data_name[i])
                if error:
                    headings.append(error_name[i])

        # Simple list.
        else:
            headings.append(data_name)
            if error:
                headings.append(error_name)

    # Headers - only errors.
    elif error:
        # List of lists.
        if isinstance(error[0], list):
            for i in range(len(error[0])):
                headings.append(error_name[i])

        # Simple list.
        else:
            headings.append(error_name)

    # No headings.
    if headings == []:
        headings = None

    # Spin specific data.
    file_data = []
    for spin_index in range(N):
        # A new data row.
        row = []
        file_data.append(row)

        # The spin ID info (non-string values are written as their repr()).
        for arg in args:
            if arg:
                value = arg[spin_index]
                if not isinstance(value, str):
                    value = repr(value)
                row.append(value)

        # The data.
        if data:
            # List of lists, with the error columns interleaved with the data columns.
            if isinstance(data[0], list):
                for i in range(len(data[0])):
                    row.append(_format_value(data[spin_index][i], float_format))
                    if error:
                        row.append(_format_value(error[spin_index][i], float_format))

            # Simple list.
            else:
                row.append(_format_value(data[spin_index], float_format))
                if error:
                    row.append(_format_value(error[spin_index], float_format))

        # Only errors.  The float format is applied here as well, so that errors are written identically with or without the data.
        elif error:
            # List of lists.
            if isinstance(error[0], list):
                for i in range(len(error[0])):
                    row.append(_format_value(error[spin_index][i], float_format))

            # Simple list.
            else:
                row.append(_format_value(error[spin_index], float_format))

    # No data to write, so do nothing!
    if not any(file_data):
        return

    # Open the file for writing.
    # NOTE(review):  the file object is not closed here, presumably because a caller supplied file object must remain open - confirm against open_write_file().
    file = open_write_file(file_name=file, dir=dir, force=force)

    # Write out the file data.
    write_data(out=file, headings=headings, data=file_data, sep=sep)
577