Package generic_fns :: Module spectrum
[hide private]
[frames] | [no frames]

Source Code for Module generic_fns.spectrum

   1  ############################################################################### 
   2  #                                                                             # 
   3  # Copyright (C) 2004-2012 Edward d'Auvergne                                   # 
   4  # Copyright (C) 2008 Sebastien Morin                                          # 
   5  #                                                                             # 
   6  # This file is part of the program relax.                                     # 
   7  #                                                                             # 
   8  # relax is free software; you can redistribute it and/or modify               # 
   9  # it under the terms of the GNU General Public License as published by        # 
  10  # the Free Software Foundation; either version 2 of the License, or           # 
  11  # (at your option) any later version.                                         # 
  12  #                                                                             # 
  13  # relax is distributed in the hope that it will be useful,                    # 
  14  # but WITHOUT ANY WARRANTY; without even the implied warranty of              # 
  15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               # 
  16  # GNU General Public License for more details.                                # 
  17  #                                                                             # 
  18  # You should have received a copy of the GNU General Public License           # 
  19  # along with relax; if not, write to the Free Software                        # 
  20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA   # 
  21  #                                                                             # 
  22  ############################################################################### 
  23   
  24  # Module docstring. 
  25  """Module containing functions for the handling of peak intensities.""" 
  26   
  27   
# Python module imports.
from math import sqrt
from os import sep
from re import split
import string
from warnings import warn

# relax module imports.
from generic_fns import pipes
from generic_fns.mol_res_spin import exists_mol_res_spin_data, generate_spin_id, generate_spin_id_data_array, return_spin, spin_loop
from relax_errors import RelaxArgNotNoneError, RelaxError, RelaxImplementError, RelaxNoSequenceError, RelaxNoSpectraError
from relax_io import DummyFileObject, extract_data, read_spin_data, strip
from relax_warnings import RelaxWarning, RelaxNoSpinWarning
  40   
  41   
  42   
class Bruker_import:
    """Importer for Bruker Protein Dynamic Center (PDC) NOE/T1/T2 files.

    The PDC file is read, its results table is converted into relax 'generic'
    peak list entries, and a dummy (or real) output file is written.
    """

    def __init__(self, dir=None, exp_type=None, file=None, UI='prompt', output_file=None):
        """Import a Bruker Protein Dynamic Center (PDC) file.

        @keyword dir:         The directory to save the new file in.
        @type dir:            str
        @keyword exp_type:    The type of experiment, e.g. NOE, T1 or T2.
        @type exp_type:       str
        @keyword file:        The Bruker PDC output file.
        @type file:           str
        @keyword UI:          The relax user interface (either 'prompt' or 'GUI').
        @type UI:             str
        @keyword output_file: The file to save the imported list.
        @type output_file:    str
        """

        # Store the experiment type.
        self.exp_type = exp_type

        # The output file and directory.
        self.output_file = output_file
        self.dir = dir

        # The user interface.
        self.UI = UI

        # Read the file (a file name is mandatory).
        if file:
            self.read_file(file)
        else:
            raise RelaxError('No file selected.')

        # Collect the entries for the file.
        self.collect_entries()

        # Create the dummy file.
        self.create_file()


    def __str__(self):
        """Return the created file name (or dummy file object) as a string."""
        return str(self.value)


    def collect_entries(self):
        """Collect the entries for the NOE/R1/R2 relax dummy file.

        The entries are stored in self.file_entries as
        [[Mol_name, Res_num, Res_name, Spin_num, Spin_name, value, error], ...].
        """

        # Storage for the file entries.
        self.file_entries = []

        # Flag set while inside the results table of the PDC file.
        is_data = False

        # Default spin name.
        spinname = 'N'

        # Loop over the lines of the PDC file.
        for line in range(len(self.entries)):
            # Detect the experiment type.
            if 'Project:' in self.entries[line][0]:
                exp_type = ''

                # NOE.
                if 'Dynamic method/Hetero NOE' in self.entries[line][1]:
                    exp_type = 'NOE'

                # T1.
                elif 'Dynamic method/T1' in self.entries[line][1]:
                    exp_type = 'T1'

                # T2.
                elif 'Dynamic method/T2' in self.entries[line][1]:
                    exp_type = 'T2'

                # Check agreement of file and user input (the original
                # unreachable 'return' after this raise has been removed).
                if self.exp_type and self.exp_type != exp_type:
                    raise RelaxError('Selected file is not a '+self.exp_type+'-file.')

                # Feedback.
                print("Reading BRUKER PDC "+exp_type+" file.\n")

            # Start or end of a file section.
            if 'SECTION:' in self.entries[line][0]:
                # The NOE/T1/T2 results section.
                if 'results' in self.entries[line][1]:
                    is_data = True
                    continue

                # Any other section.
                else:
                    is_data = False

            # The spin isotope labelling.
            if 'Labelling:' in self.entries[line][0]:
                # 15N.
                if 'N' in self.entries[line][0]:
                    spinname = 'N'

                # 13C.
                if 'C' in self.entries[line][0]:
                    spinname = 'C'

            # Collect the NOE/T1/T2 values.
            if is_data:
                # Exclude the header and blank lines.
                if self.entries[line][0] in ['Peak name', '']:
                    continue

                # The peak label, stripped of spaces and brackets.
                label_tmp = self.entries[line][0]
                label_tmp = label_tmp.replace(' ', '')
                label_tmp = label_tmp.replace('[', '')
                label_tmp = label_tmp.replace(']', '')

                # Find the start of the residue number.
                resnum = label_tmp
                start = 0
                while not resnum[start].isdigit():
                    start = start + 1

                # Find the end of the residue number (the label may simply end
                # with the number, hence the IndexError guard).
                end = start
                try:
                    while resnum[end].isdigit():
                        end = end + 1
                except IndexError:
                    pass

                # Cut out the residue number integer.
                resnum = resnum[start:end]

                # The residue name precedes the number.
                resname = label_tmp[0:start]

                # The spin number (the file line number serves as a unique substitute).
                spin_no = line

                # The value.
                value_tmp = float(self.entries[line][3])

                # Convert T1/T2 to R1/R2.
                if exp_type in ['T1', 'T2']:
                    value_tmp = 1.0/value_tmp

                # The error.
                error = float(self.entries[line][4])

                # Store the file entry.
                self.file_entries.append(['Bruker_PDC_'+exp_type, resnum, resname, spin_no, spinname, value_tmp, error])


    def create_file(self):
        """Write the collected entries to the output file (or a dummy file object)."""

        # The Sparky-like header.
        text = 'Mol_name\tRes_num\tRes_name\tSpin_num\tSpin_name\tValue\tError \n'

        # Build the file text from the collected entries.
        for line in range(len(self.file_entries)):
            tmp_text = ''
            for i in range(len(self.file_entries[line])):
                tmp_text = tmp_text + str(self.file_entries[line][i])+'\t'
            text = text+tmp_text + '\n'

        # Print the entries (prompt UI only).
        if self.UI != 'GUI':
            print(text)

        # Create the output file.
        if self.output_file:
            if self.dir:
                file = open(self.dir+sep+self.output_file, 'w')
            else:
                file = open(self.output_file, 'w')

        # Or create a dummy file.
        else:
            file = DummyFileObject()

        # Write the file.
        file.write(text)
        file.close()

        # Feedback, and storage of the return value for __str__().
        if self.output_file:
            if self.dir:
                print('Created BRUKER PDC file '+self.dir+sep+self.output_file)
                self.value = self.dir+sep+self.output_file
            else:
                print('Created BRUKER PDC file '+self.output_file)
                self.value = self.output_file
        else:
            print('Created BRUKER PDC file.')
            # Return the dummy file object itself.
            self.value = file


    def read_file(self, filename):
        """Read the PDC file, storing the tab-split lines in self.entries.

        @param filename:    The name of the PDC file to read.
        @type filename:     str
        """

        # Open the file.
        file = open(filename, 'r')

        # Storage of the split lines.
        self.entries = []

        # Loop over the lines of the PDC file.
        for line in file:
            # Strip the line and split it into its tab-separated entries.
            entry = line.strip()
            entry = entry.split('\t')

            # Add the entries to the storage.
            self.entries.append(entry)

        # Close the file.
        file.close()
265 266
def __errors_height_no_repl():
    """Calculate the errors for peak heights when no spectra are replicated.

    Each selected spin with intensity data gets its error set to the RMSD of
    the base plane noise, which must have been set beforehand.
    """

    # Loop over all spins together with their ID strings.
    for spin, spin_id in spin_loop(return_id=True):
        # Skip deselected spins and spins without intensity data.
        if not spin.select or not hasattr(spin, 'intensities'):
            continue

        # The RMSD must have been supplied first.
        if not hasattr(spin, 'baseplane_rmsd'):
            raise RelaxError("The RMSD of the base plane noise for spin '%s' has not been set." % spin_id)

        # The error is simply the base plane RMSD.
        spin.intensity_err = spin.baseplane_rmsd
286 287
def __errors_repl(verbosity=0):
    """Calculate the errors for peak intensities from replicated spectra.

    The variance for each replicate set is the sample variance of the peak
    intensities, averaged over all spins.  The results are stored in the
    cdp.var_I and cdp.sigma_I dictionaries, keyed by spectrum ID, and each
    spin's intensity_err is set to the cdp.sigma_I dictionary.

    @keyword verbosity: The amount of information to print.  The higher the value, the greater the verbosity.
    @type verbosity:    int
    """

    # Replicated spectra flags (dict of spectrum ID -> bool, built by the
    # module's replicated_flags() function defined elsewhere in this file).
    repl = replicated_flags()

    # Are all spectra replicated?
    if False in repl.values():
        all_repl = False
        print("All spectra replicated: No.")
    else:
        all_repl = True
        print("All spectra replicated: Yes.")

    # Initialise the error data structures (any previous values are discarded).
    cdp.sigma_I = {}
    cdp.var_I = {}

    # Loop over the spectra.
    for id in cdp.spectrum_ids:
        # Skip non-replicated spectra.
        if not repl[id]:
            continue

        # Skip replicated spectra which already have been used.
        if cdp.var_I.has_key(id) and cdp.var_I[id] != 0.0:
            continue

        # The replicated spectra set containing this ID.
        # NOTE(review): assumes every ID flagged in repl appears in some
        # cdp.replicates list, otherwise 'spectra' is stale/unbound — confirm
        # against replicated_flags().
        for j in xrange(len(cdp.replicates)):
            if id in cdp.replicates[j]:
                spectra = cdp.replicates[j]

        # Number of spectra.
        num_spectra = len(spectra)

        # Print out.
        print(("\nReplicated spectra: " + repr(spectra)))
        if verbosity:
            print(("%-5s%-6s%-20s%-20s" % ("Num", "Name", "Average", "SD")))

        # Calculate the mean value.
        count = 0
        for spin in spin_loop():
            # Skip deselected spins.
            if not spin.select:
                continue

            # Skip and deselect spins which have no data.
            if not hasattr(spin, 'intensities'):
                spin.select = False
                continue

            # Skip spins missing data for any spectrum of the set.
            missing = False
            for j in xrange(num_spectra):
                if not spin.intensities.has_key(spectra[j]):
                    missing = True
            if missing:
                continue

            # Average intensity over the replicate set.
            ave_intensity = 0.0
            for j in xrange(num_spectra):
                ave_intensity = ave_intensity + spin.intensities[spectra[j]]
            ave_intensity = ave_intensity / num_spectra

            # Sum of squared errors.
            SSE = 0.0
            for j in xrange(num_spectra):
                SSE = SSE + (spin.intensities[spectra[j]] - ave_intensity) ** 2

            # Variance.
            #
            #                   1
            #       sigma^2 = ----- * sum({Xi - Xav}^2)]
            #                 n - 1
            #
            var_I = 1.0 / (num_spectra - 1.0) * SSE

            # Print out.
            if verbosity:
                print(("%-5i%-6s%-20s%-20s" % (spin.num, spin.name, repr(ave_intensity), repr(var_I))))

            # Sum of variances (for averaging over spins below).
            if not cdp.var_I.has_key(id):
                cdp.var_I[id] = 0.0
            cdp.var_I[id] = cdp.var_I[id] + var_I
            count = count + 1

        # No data catch.
        if not count:
            raise RelaxError("No data is present, unable to calculate errors from replicated spectra.")

        # Average variance over all counted spins.
        cdp.var_I[id] = cdp.var_I[id] / float(count)

        # Assign the same variance to every spectrum of the replicate set.
        for j in range(num_spectra):
            cdp.var_I[spectra[j]] = cdp.var_I[id]

        # Print out.
        print(("Standard deviation: %s" % sqrt(cdp.var_I[id])))


    # Average across all spectra if there are time points with a single spectrum.
    if not all_repl:
        # Print out.
        print("\nVariance averaging over all spectra.")

        # Initialise.
        var_I = 0.0
        num_dups = 0

        # Loop over all time points.
        for id in cdp.var_I.keys():
            # Single spectrum (or extraordinarily accurate NMR spectra!).
            if cdp.var_I[id] == 0.0:
                continue

            # Sum and count.
            var_I = var_I + cdp.var_I[id]
            num_dups = num_dups + 1

        # Average value.
        var_I = var_I / float(num_dups)

        # Assign the average value to all time points.
        for id in cdp.spectrum_ids:
            cdp.var_I[id] = var_I

        # Print out.
        print(("Standard deviation for all spins: " + repr(sqrt(var_I))))

    # Convert the variances into standard deviations.
    for id in cdp.var_I.keys():
        # Create the standard deviation data structure.
        cdp.sigma_I[id] = sqrt(cdp.var_I[id])

    # Set the spin specific errors.
    for spin in spin_loop():
        # Skip deselected spins.
        if not spin.select:
            continue

        # Set the error.
        # NOTE(review): this assigns the whole sigma_I dict (keyed by spectrum
        # ID), not a single value — presumably intentional, mirroring the dict
        # assignment in __errors_height_no_repl(); confirm downstream usage.
        spin.intensity_err = cdp.sigma_I
439 440
def __errors_volume_no_repl():
    """Calculate the errors for peak volumes when no spectra are replicated.

    The error for each spectrum is set to the base plane RMSD scaled by the
    square root of the number of points used in the volume integration.
    """

    # Loop over the spins and set the error to the RMSD of the base plane noise.
    for spin, spin_id in spin_loop(return_id=True):
        # Skip deselected spins.
        if not spin.select:
            continue

        # Skip spins missing intensity data.
        if not hasattr(spin, 'intensities'):
            continue

        # Test if the RMSD has been set.
        if not hasattr(spin, 'baseplane_rmsd'):
            raise RelaxError("The RMSD of the base plane noise for spin '%s' has not been set." % spin_id)

        # Test that the total number of points have been set.
        if not hasattr(spin, 'N'):
            raise RelaxError("The total number of points used in the volume integration has not been specified for spin '%s'." % spin_id)

        # Initialise the error structure as necessary (bug fix: the structure
        # was previously never created, so the assignment below would fail).
        if not hasattr(spin, 'intensity_err'):
            spin.intensity_err = {}

        # Set the error to the RMSD multiplied by the square root of the total number of points.
        # Bug fix: the intensity data structure is 'intensities' (as tested by
        # the hasattr() check above), not 'intensity'.
        for key in spin.intensities.keys():
            spin.intensity_err[key] = spin.baseplane_rmsd[key] * sqrt(spin.N)
465 466
def autodetect_format(file_data):
    """Automatically detect the format of the peak list.

    @param file_data:   The processed results file data.
    @type file_data:    list of lists of str
    @return:            The detected format, one of 'sparky', 'nmrview', 'xeasy' or 'generic'.
    @rtype:             str
    """

    # Find the first non-empty line, i.e. the header (bug fix: previously
    # empty file data caused a NameError/IndexError instead of falling
    # through to the generic format).
    line = None
    for line in file_data:
        if line != []:
            break
    if not line:
        return 'generic'

    # Sparky format.
    if line[0] == 'Assignment':
        return 'sparky'

    # NMRView format.
    if line == ['label', 'dataset', 'sw', 'sf']:
        return 'nmrview'

    # XEasy format.
    if line == ['No.', 'Color', 'w1', 'w2', 'ass.', 'in', 'w1', 'ass.', 'in', 'w2', 'Volume', 'Vol.', 'Err.', 'Method', 'Comment']:
        return 'xeasy'

    # Assume a generic format.
    return 'generic'
493 494
def baseplane_rmsd(error=0.0, spectrum_id=None, spin_id=None):
    """Set the peak intensity errors, defined as the baseplane RMSD.

    @param error:         The peak intensity error value defined as the RMSD of the base plane noise.
    @type error:          float
    @keyword spectrum_id: The spectrum id.
    @type spectrum_id:    str
    @param spin_id:       The spin identification string.
    @type spin_id:        str
    """

    # A data pipe must exist.
    pipes.test()

    # Sequence data must be loaded.
    if not exists_mol_res_spin_data():
        raise RelaxNoSequenceError

    # The spectrum ID must be known.
    if spectrum_id not in cdp.spectrum_ids:
        raise RelaxError("The peak intensities corresponding to the spectrum id '%s' do not exist." % spectrum_id)

    # The Bruker NC_proc intensity scaling factor, if set for this spectrum.
    scale = 1.0
    if hasattr(cdp, 'ncproc') and spectrum_id in cdp.ncproc:
        scale = 1.0 / 2**cdp.ncproc[spectrum_id]

    # Store the scaled error for every selected spin.
    for spin in spin_loop(spin_id):
        # Deselected spins are skipped.
        if not spin.select:
            continue

        # Create the per-spectrum error structure on first use.
        if not hasattr(spin, 'baseplane_rmsd'):
            spin.baseplane_rmsd = {}

        # The scaled error.
        spin.baseplane_rmsd[spectrum_id] = float(error) * scale
536 537
def delete(spectrum_id=None):
    """Delete spectral data corresponding to the spectrum ID.

    @keyword spectrum_id: The spectrum ID string.
    @type spectrum_id:    str
    """

    # A data pipe must exist.
    pipes.test()

    # Sequence data must be loaded.
    if not exists_mol_res_spin_data():
        raise RelaxNoSequenceError

    # The spectrum ID must be known.
    if not hasattr(cdp, 'spectrum_ids') or spectrum_id not in cdp.spectrum_ids:
        raise RelaxNoSpectraError(spectrum_id)

    # Remove the ID from the list.
    cdp.spectrum_ids.remove(spectrum_id)

    # The ncproc parameter.
    if hasattr(cdp, 'ncproc') and spectrum_id in cdp.ncproc:
        del cdp.ncproc[spectrum_id]

    # Remove the ID from its replicate set, if any.
    if hasattr(cdp, 'replicates'):
        for i in range(len(cdp.replicates)):
            # Not this set.
            if spectrum_id not in cdp.replicates[i]:
                continue

            # A duplicate - discard the whole set.
            if len(cdp.replicates[i]) == 2:
                cdp.replicates.pop(i)

            # More than two spectra - just drop the ID.
            else:
                cdp.replicates[i].remove(spectrum_id)

            # The ID occurs in at most one set.
            break

    # Error structures.
    if hasattr(cdp, 'sigma_I') and spectrum_id in cdp.sigma_I:
        del cdp.sigma_I[spectrum_id]
    if hasattr(cdp, 'var_I') and spectrum_id in cdp.var_I:
        del cdp.var_I[spectrum_id]

    # Per-spin intensity data.
    for spin in spin_loop():
        if hasattr(spin, 'intensities') and spectrum_id in spin.intensities:
            del spin.intensities[spectrum_id]
591 592
def error_analysis():
    """Determine the peak intensity standard deviation.

    Dispatches to the replicate-based or RMSD-based error calculation
    depending on the intensity measure (cdp.int_method) and on whether
    replicated spectra exist.
    """

    # Test if the current pipe exists
    pipes.test()

    # Test if the sequence data is loaded.
    if not exists_mol_res_spin_data():
        raise RelaxNoSequenceError

    # Test if spectra have been loaded.
    if not hasattr(cdp, 'spectrum_ids'):
        raise RelaxError("Error analysis is not possible, no spectra have been loaded.")

    # Peak height category.
    if cdp.int_method == 'height':
        # Print out.
        print("Intensity measure: Peak heights.")

        # Do we have replicated spectra?
        if hasattr(cdp, 'replicates'):
            # Print out.
            print("Replicated spectra: Yes.")

            # Set the errors.
            __errors_repl()

        # No replicated spectra.
        else:
            # Print out.
            print("Replicated spectra: No.")

            # Set the errors.
            __errors_height_no_repl()

    # Peak volume category.
    if cdp.int_method == 'point sum':
        # Print out.
        print("Intensity measure: Peak volumes.")

        # Do we have replicated spectra?
        if hasattr(cdp, 'replicates'):
            # Print out.
            print("Replicated spectra: Yes.")

            # Set the errors.
            __errors_repl()

        # No replicated spectra.
        else:
            # Print out.
            print("Replicated spectra: No.")

            # Set the errors (bug fix: this previously called __errors_repl(),
            # which requires replicated spectra, instead of the volume-based
            # RMSD error calculation).
            __errors_volume_no_repl()
648 649
def intensity_generic(file_data=None, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, data_col=None, sep=None, spin_id=None):
    """Return the process data from the generic peak intensity file.

    The residue number, heteronucleus and proton names, and peak intensity will be returned.

    @keyword file_data:     The data extracted from the file converted into a list of lists.
    @type file_data:        list of lists of str
    @keyword spin_id_col:   The column containing the spin ID strings.  If supplied, the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col arguments must be none.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information.  If supplied, spin_id_col must be None.
    @type mol_name_col:     int or None
    @keyword res_name_col:  The column containing the residue name information.  If supplied, spin_id_col must be None.
    @type res_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information.  If supplied, spin_id_col must be None.
    @type res_num_col:      int or None
    @keyword spin_name_col: The column containing the spin name information.  If supplied, spin_id_col must be None.
    @type spin_name_col:    int or None
    @keyword spin_num_col:  The column containing the spin number information.  If supplied, spin_id_col must be None.
    @type spin_num_col:     int or None
    @keyword data_col:      The column containing the peak intensities.
    @type data_col:         int
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_id:       The spin ID string used to restrict data loading to a subset of all spins.
    @type spin_id:          None or str
    @raises RelaxError:     When the expected peak intensity is not a float.
    @return:                The extracted data as a list of lists.  The first dimension corresponds to the spin.  The second dimension consists of the proton name, heteronucleus name, spin ID string, and the intensity value.
    @rtype:                 list of lists of str, str, str, float
    """

    # Remove comment and blank lines.
    stripped_data = strip(file_data)

    # Convert each spin returned by the generic reader into a peak list entry.
    # The proton and heteronucleus names are unknown for this format, and the
    # spin ID string doubles as the 'original line' element.
    data = []
    for spin_id_str, value in read_spin_data(file_data=stripped_data, spin_id_col=spin_id_col, mol_name_col=mol_name_col, res_num_col=res_num_col, res_name_col=res_name_col, spin_num_col=spin_num_col, spin_name_col=spin_name_col, data_col=data_col, sep=sep, spin_id=spin_id):
        data.append([None, None, spin_id_str, value, spin_id_str])

    # Return the data.
    return data
701 702
def intensity_nmrview(file_data=None, int_col=None):
    """Return the process data from the NMRView peak intensity file.

    The residue number, heteronucleus and proton names, and peak intensity will be returned.


    @keyword file_data: The data extracted from the file converted into a list of lists.
    @type file_data:    list of lists of str
    @keyword int_col:   The column containing the peak intensity data.  The default is 16 for intensities.  Setting the int_col argument to 15 will use the volumes (or evolumes).  For a non-standard formatted file, use a different value.
    @type int_col:      int
    @raises RelaxError: When the expected peak intensity is not a float.
    @return:            The extracted data as a list of lists.  The first dimension corresponds to the spin.  The second dimension consists of the proton name, heteronucleus name, spin ID string, the intensity value, and the original line of text.
    @rtype:             list of lists of str, str, str, float, str
    """

    # Assume the NMRView file has six header lines!
    num = 6
    print(("Number of header lines: " + repr(num)))

    # Remove the header.
    file_data = file_data[num:]

    # Strip the data.
    file_data = strip(file_data)

    # The peak intensity column.
    if int_col == None:
        int_col = 16
    if int_col == 16:
        print('Using peak heights.')
    if int_col == 15:
        print('Using peak volumes (or evolumes).')

    # Loop over the file data.
    data = []
    for line in file_data:
        # Unknown assignment.
        if line[1] == '{}':
            warn(RelaxWarning("The assignment '%s' is unknown, skipping this peak." % line[1]))
            continue

        # The residue number (the deprecated Py2-only string module function
        # calls have been replaced by the equivalent str methods).
        res_num = ''
        try:
            res_num = line[1].strip('{')
            res_num = res_num.strip('}')
            res_num = res_num.split('.')
            res_num = res_num[0]
        except ValueError:
            raise RelaxError("The peak list is invalid.")

        # Nuclei names.
        x_name = ''
        if line[8] != '{}':
            x_name = line[8].strip('{')
            x_name = x_name.strip('}')
            x_name = x_name.split('.')
            x_name = x_name[1]
        h_name = ''
        if line[1] != '{}':
            h_name = line[1].strip('{')
            h_name = h_name.strip('}')
            h_name = h_name.split('.')
            h_name = h_name[1]

        # Intensity.
        try:
            intensity = float(line[int_col])
        except ValueError:
            # Bug fix: the message previously formatted the unbound name
            # 'intensity', so the intended RelaxError was masked by an
            # UnboundLocalError - report the offending raw value instead.
            raise RelaxError("The peak intensity value " + repr(line[int_col]) + " from the line " + repr(line) + " is invalid.")

        # Generate the spin_id.
        spin_id = generate_spin_id(res_num=res_num, spin_name=x_name)

        # Append the data.
        data.append([h_name, x_name, spin_id, intensity, line])

    # Return the data.
    return data
782 783
def intensity_sparky(file_data=None, int_col=None):
    """Return the process data from the Sparky peak intensity file.

    The residue number, heteronucleus and proton names, and peak intensity will be returned.


    @keyword file_data: The data extracted from the file converted into a list of lists.
    @type file_data:    list of lists of str
    @keyword int_col:   The column containing the peak intensity data (for a non-standard formatted file).
    @type int_col:      int
    @raises RelaxError: When the expected peak intensity is not a float.
    @return:            The extracted data as a list of lists.  The first dimension corresponds to the spin.  The second dimension consists of the proton name, heteronucleus name, spin ID string, the intensity value, and the original line of text.
    @rtype:             list of lists of str, str, str, float, str
    """

    # Count the header lines.
    num = 0
    if file_data[0][0] == 'Assignment':
        num += 1
    if file_data[1] == '':
        num += 1
    print("Number of header lines found: %s" % num)

    # Remove the header, then strip the remaining data.
    file_data = strip(file_data[num:])

    # The default peak intensity column.
    if int_col == None:
        int_col = 3

    # Loop over the file data.
    data = []
    for line in file_data:
        # Initialised here so the error message below can always format it.
        intensity = ''

        # Skip non-assigned peaks.
        if line[0] == '?-?':
            continue

        # Split the assignment at the 2D separator into its two halves.
        x_assign, h_assign = split('-', line[0])

        # The proton name is the trailing letter group.
        h_name = split('([A-Z]+)', h_assign)[-2]

        # The heteronucleus name.
        x_row = split('([A-Z]+)', x_assign)
        x_name = x_row[-2]

        # The residue number precedes the heteronucleus name.
        try:
            res_num = int(x_row[-3])
        except:
            raise RelaxError("Improperly formatted Sparky file.")

        # The peak intensity.
        try:
            intensity = float(line[int_col])
        except ValueError:
            raise RelaxError("The peak intensity value " + repr(intensity) + " from the line " + repr(line) + " is invalid.")

        # The spin ID string.
        spin_id = generate_spin_id(res_num=res_num, spin_name=x_name)

        # Store this peak.
        data.append([h_name, x_name, spin_id, intensity, line])

    # Return the data.
    return data
861 862
def intensity_xeasy(file_data=None, heteronuc=None, proton=None, int_col=None):
    """Return the process data from the XEasy peak intensity file.

    The residue number, heteronucleus and proton names, and peak intensity will be returned.


    @keyword file_data: The data extracted from the file converted into a list of lists.
    @type file_data:    list of lists of str
    @keyword heteronuc: The name of the heteronucleus as specified in the peak intensity file.
    @type heteronuc:    str
    @keyword proton:    The name of the proton as specified in the peak intensity file.
    @type proton:       str
    @keyword int_col:   The column containing the peak intensity data (for a non-standard formatted file).
    @type int_col:      int
    @raises RelaxError: When the expected peak intensity is not a float.
    @return:            The extracted data as a list of lists.  The first dimension corresponds to the spin.  The second dimension consists of the proton name, heteronucleus name, spin ID string, the intensity value, and the original line of text.
    @rtype:             list of lists of str, str, str, float, str
    """

    # The assignment columns.
    w1_col = 4
    w2_col = 7

    # The default intensity column.
    if int_col == None:
        int_col = 10

    # By default the proton is assumed to be in the w1 dimension.
    H_dim = 'w1'

    # Count the header lines - those from which no intensity can be extracted.
    num = 0
    for line in file_data:
        try:
            intensity = float(line[int_col])
        except (ValueError, IndexError):
            num += 1
        else:
            break
    print(("Number of header lines found: " + repr(num)))

    # Remove the header, then strip the remaining data.
    file_data = strip(file_data[num:])

    # Determine which dimension holds the proton, stopping at the first match.
    for line in file_data:
        # Proton in w1, heteronucleus in w2.
        if line[w1_col] == proton and line[w2_col] == heteronuc:
            H_dim = 'w1'
            print("The proton dimension is w1")
            break

        # Heteronucleus in w1, proton in w2.
        if line[w1_col] == heteronuc and line[w2_col] == proton:
            H_dim = 'w2'
            print("The proton dimension is w2")
            break

    # Loop over the file data.
    data = []
    for line in file_data:
        # Invalid assignment lines have shifted columns, so return no data for them.
        if line[w1_col] == 'inv.':
            continue

        # The residue number.
        try:
            res_num = int(line[5])
        except:
            raise RelaxError("Improperly formatted XEasy file.")

        # The nuclei names, depending on the proton dimension.
        if H_dim == 'w1':
            h_name, x_name = line[w1_col], line[w2_col]
        else:
            x_name, h_name = line[w1_col], line[w2_col]

        # The peak intensity.
        try:
            intensity = float(line[int_col])
        except ValueError:
            raise RelaxError("The peak intensity value " + repr(intensity) + " from the line " + repr(line) + " is invalid.")

        # The spin ID string.
        spin_id = generate_spin_id(res_num=res_num, spin_name=x_name)

        # Store this peak.
        data.append([h_name, x_name, spin_id, intensity, line])

    # Return the data.
    return data
970 971
def read(file=None, dir=None, spectrum_id=None, heteronuc=None, proton=None, int_col=None, int_method=None, spin_id_col=None, mol_name_col=None, res_num_col=None, res_name_col=None, spin_num_col=None, spin_name_col=None, sep=None, spin_id=None, ncproc=None):
    """Read the peak intensity data.

    The peak list format is auto-detected, the intensities are extracted by
    the format-specific function, and the values are stored per-spin in the
    current data pipe under the given spectrum ID.

    @keyword file:          The name of the file containing the peak intensities.
    @type file:             str
    @keyword dir:           The directory where the file is located.
    @type dir:              str
    @keyword spectrum_id:   The spectrum identification string.
    @type spectrum_id:      str
    @keyword heteronuc:     The name of the heteronucleus as specified in the peak intensity file.
    @type heteronuc:        str
    @keyword proton:        The name of the proton as specified in the peak intensity file.
    @type proton:           str
    @keyword int_col:       The column containing the peak intensity data (used by the generic intensity file format).
    @type int_col:          int
    @keyword int_method:    The integration method, one of 'height', 'point sum' or 'other'.
    @type int_method:       str
    @keyword spin_id_col:   The column containing the spin ID strings (used by the generic intensity file format).  If supplied, the mol_name_col, res_name_col, res_num_col, spin_name_col, and spin_num_col arguments must be none.
    @type spin_id_col:      int or None
    @keyword mol_name_col:  The column containing the molecule name information (used by the generic intensity file format).  If supplied, spin_id_col must be None.
    @type mol_name_col:     int or None
    @keyword res_name_col:  The column containing the residue name information (used by the generic intensity file format).  If supplied, spin_id_col must be None.
    @type res_name_col:     int or None
    @keyword res_num_col:   The column containing the residue number information (used by the generic intensity file format).  If supplied, spin_id_col must be None.
    @type res_num_col:      int or None
    @keyword spin_name_col: The column containing the spin name information (used by the generic intensity file format).  If supplied, spin_id_col must be None.
    @type spin_name_col:    int or None
    @keyword spin_num_col:  The column containing the spin number information (used by the generic intensity file format).  If supplied, spin_id_col must be None.
    @type spin_num_col:     int or None
    @keyword sep:           The column separator which, if None, defaults to whitespace.
    @type sep:              str or None
    @keyword spin_id:       The spin ID string used to restrict data loading to a subset of all spins.
    @type spin_id:          None or str
    @keyword ncproc:        The Bruker ncproc binary intensity scaling factor.
    @type ncproc:           int or None
    """

    # Test if the current data pipe exists.
    pipes.test()

    # Test if sequence data is loaded.
    if not exists_mol_res_spin_data():
        raise RelaxNoSequenceError

    # Test that the intensity measures are identical.
    if hasattr(cdp, 'int_method') and cdp.int_method != int_method:
        raise RelaxError("The '%s' measure of peak intensities does not match '%s' of the previously loaded spectra." % (int_method, cdp.int_method))

    # Check the intensity measure.
    if not int_method in ['height', 'point sum', 'other']:
        raise RelaxError("The intensity measure '%s' is not one of 'height', 'point sum', 'other'." % int_method)

    # Set the peak intensity measure.
    cdp.int_method = int_method

    # Extract the data from the file.
    file_data = extract_data(file, dir, sep=sep)

    # Automatic format detection.
    format = autodetect_format(file_data)

    # Generic.
    if format == 'generic':
        # Print out.
        print("Generic formatted data file.\n")

        # Extract the data.
        intensity_data = intensity_generic(file_data=file_data, spin_id_col=spin_id_col, mol_name_col=mol_name_col, res_num_col=res_num_col, res_name_col=res_name_col, spin_num_col=spin_num_col, spin_name_col=spin_name_col, data_col=int_col, sep=sep, spin_id=spin_id)

    # NMRView.
    elif format == 'nmrview':
        # Print out.
        print("NMRView formatted data file.\n")

        # Extract the data.
        # NOTE(review): unlike the other formats, int_col is not passed here,
        # so the NMRView default column of 16 is always used - confirm whether
        # this is intentional.
        intensity_data = intensity_nmrview(file_data=file_data)

    # Sparky.
    elif format == 'sparky':
        # Print out.
        print("Sparky formatted data file.\n")

        # Extract the data.
        intensity_data = intensity_sparky(file_data=file_data, int_col=int_col)

    # XEasy.
    elif format == 'xeasy':
        # Print out.
        print("XEasy formatted data file.\n")

        # Extract the data.
        intensity_data = intensity_xeasy(file_data=file_data, proton=proton, heteronuc=heteronuc, int_col=int_col)

    # Add the spectrum id (and ncproc) to the relax data store.
    # NOTE(review): cdp.ncproc is only initialised when no spectrum IDs exist
    # yet - presumably loading a first spectrum without ncproc and a later one
    # with ncproc would fail with an AttributeError; confirm.
    if not hasattr(cdp, 'spectrum_ids'):
        cdp.spectrum_ids = []
        if ncproc != None:
            cdp.ncproc = {}
    if spectrum_id in cdp.spectrum_ids:
        raise RelaxError("The spectrum identification string '%s' already exists." % spectrum_id)
    else:
        cdp.spectrum_ids.append(spectrum_id)
        if ncproc != None:
            cdp.ncproc[spectrum_id] = ncproc

    # Loop over the peak intensity data.
    data_flag = False
    for i in xrange(len(intensity_data)):
        # Extract the data.
        H_name, X_name, spin_id, intensity, line = intensity_data[i]

        # Skip data where the nuclei names do not match the user-supplied ones.
        if (X_name and X_name != heteronuc) or (H_name and H_name != proton):
            warn(RelaxWarning("Proton and heteronucleus names do not match, skipping the data %s." % line))
            continue

        # Get the spin container.
        spin = return_spin(spin_id)
        if not spin:
            warn(RelaxNoSpinWarning(spin_id))
            continue

        # Skip deselected spins.
        if not spin.select:
            continue

        # Initialise the per-spectrum intensity structure as necessary.
        if not hasattr(spin, 'intensities'):
            spin.intensities = {}

        # Intensity scaling by the Bruker NC_proc factor.
        if ncproc != None:
            intensity = intensity / float(2**ncproc)

        # Add the data.
        spin.intensities[spectrum_id] = intensity

        # Switch the flag.
        data_flag = True

    # No data.
    if not data_flag:
        # Delete all the data stored under this spectrum ID above.
        delete(spectrum_id)

        # Raise the error.
        raise RelaxError("No data could be loaded from the peak list")
1119 1120
def replicated(spectrum_ids=None):
    """Set which spectra are replicates.

    @keyword spectrum_ids:  A list of spectrum ids corresponding to replicated spectra.
    @type spectrum_ids:     list of str
    """

    # The current data pipe must exist.
    pipes.test()

    # Spectra must already be loaded before replicates can be specified.
    if not hasattr(cdp, 'spectrum_ids'):
        raise RelaxError("No spectra have been loaded therefore replicates cannot be specified.")

    # Every supplied ID must correspond to loaded peak intensities.
    for spectrum_id in spectrum_ids:
        if spectrum_id not in cdp.spectrum_ids:
            raise RelaxError("The peak intensities corresponding to the spectrum id '%s' do not exist." % spectrum_id)

    # Create the replicate storage structure on first use.
    if not hasattr(cdp, 'replicates'):
        cdp.replicates = []

    # If any of the supplied IDs already belongs to an existing replicate set,
    # merge the remaining IDs into that set and finish.
    for rep_set in cdp.replicates:
        overlap = False
        for spectrum_id in spectrum_ids:
            if spectrum_id in rep_set:
                overlap = True
        if overlap:
            for spectrum_id in spectrum_ids:
                if spectrum_id not in rep_set:
                    rep_set.append(spectrum_id)
            return

    # Otherwise record a brand new set of replicates.
    cdp.replicates.append(spectrum_ids)
1164 1165
def replicated_flags():
    """Create and return a dictionary of flags of whether the spectrum is replicated or not.

    @return:    The dictionary of flags of whether the spectrum is replicated or not.
    @rtype:     dict of bool
    """

    # Start with every spectrum ID flagged as non-replicated.
    flags = {}
    for spec_id in cdp.spectrum_ids:
        flags[spec_id] = False

    # Switch on the flag for each ID appearing in a replicate set.
    for rep_set in cdp.replicates:
        for spec_id in rep_set:
            flags[spec_id] = True

    # Return the dictionary.
    return flags
1185 1186
def replicated_ids(spectrum_id):
    """Create and return a list of spectra ID which are replicates of the given ID.

    @param spectrum_id: The spectrum ID to find all the replicates of.
    @type spectrum_id:  str
    @return:            The list of spectrum IDs which are replicates of spectrum_id.
    @rtype:             list of str
    """

    # Initialise the ID list.
    repl = []

    # Loop over the replicate lists, collecting all other members of any set
    # containing the given spectrum_id.
    for i in range(len(cdp.replicates)):
        # The spectrum ID is in the list.
        if spectrum_id in cdp.replicates[i]:
            # Loop over the inner list.
            for j in range(len(cdp.replicates[i])):
                # Skip the given ID itself.
                if spectrum_id == cdp.replicates[i][j]:
                    continue

                # Append the replicated ID.
                repl.append(cdp.replicates[i][j])

    # Sort the list.
    repl.sort()

    # Bug fix: the given ID may have no replicates at all, in which case the
    # duplicate removal below would raise an IndexError on repl[-1].
    if not repl:
        return repl

    # Remove duplicates (walking backward so deletions do not shift the
    # unvisited indices).
    id = repl[-1]
    for i in range(len(repl)-2, -1, -1):
        # Duplicate.
        if id == repl[i]:
            del repl[i]

        # Unique.
        else:
            id = repl[i]

    # Return the list.
    return repl
1228