Author: bugman Date: Fri Feb 20 16:02:29 2015 New Revision: 27704 URL: http://svn.gna.org/viewcvs/relax?rev=27704&view=rev Log: Fix for bug #23295 (https://gna.org/bugs/?23295). This is the PDB secondary structure HELIX and SHEET records not updating when merging molecules. The problem was that the algorithm for changing the molecule numbers for the helix and sheet metadata when calling the structure.read_pdb user function was far too simplistic. Therefore the logic has been completely rewritten. Now the helix and sheet metadata are stored in temporary data structures in the _parse_pdb_ss() method. As the molecules are being read from the PDB records, new data structures containing the original molecule numbers and new molecule numbers are created. The helix and sheet metadata is then stored in the internal structural object via the pack_structs() method, and the molecule indices of the metadata changed based on the two molecule number remapping data structures. Modified: trunk/lib/structure/internal/object.py Modified: trunk/lib/structure/internal/object.py URL: http://svn.gna.org/viewcvs/relax/trunk/lib/structure/internal/object.py?rev=27704&r1=27703&r2=27704&view=diff ============================================================================== --- trunk/lib/structure/internal/object.py (original) +++ trunk/lib/structure/internal/object.py Fri Feb 20 16:02:29 2015 @@ -36,7 +36,7 @@ # relax module imports. from lib import regex from lib.check_types import is_float -from lib.errors import RelaxError, RelaxNoneIntError, RelaxNoPdbError +from lib.errors import RelaxError, RelaxFault, RelaxNoneIntError, RelaxNoPdbError from lib.io import file_root, open_read_file from lib.selection import Selection from lib.sequence import aa_codes_three_to_one @@ -466,7 +466,7 @@ return lines[i:] - def _parse_pdb_ss(self, lines, read_mol=None): + def _parse_pdb_ss(self, lines, read_mol=None, helices=None, sheets=None): """Loop over and parse the PDB secondary structure records. These are the records identified in the PDB version 3.30 documentation at U{http://www.wwpdb.org/documentation/file-format/format33/sect5.html}. @@ -476,6 +476,10 @@ @type lines: list of str @keyword read_mol: The molecule(s) to read from the file, independent of model. The molecules are determined differently by the different parsers, but are numbered consecutively from 1. If set to None, then all molecules will be loaded. @type read_mol: None, int, or list of int + @keyword helices: The list of helix data to append to. + @type helices: list + @keyword sheets: The list of sheet data to append to. + @type sheets: list @return: The remaining PDB lines with the secondary structure records stripped. @rtype: list of str """ @@ -515,14 +519,8 @@ if mol_end_index + 1 not in read_mol: continue - # New molecule indices based on currently loaded data. - mol_init_index += mol_num - mol_end_index += mol_num - # Store the data. - if not hasattr(self, 'helices'): - self.helices = [] - self.helices.append([helix_id, mol_init_index, init_res_name, init_seq_num, mol_end_index, end_res_name, end_seq_num, helix_class, length]) + helices.append([helix_id, mol_init_index, init_res_name, init_seq_num, mol_end_index, end_res_name, end_seq_num, helix_class, length]) # A sheet. if lines[i][:5] == 'SHEET': @@ -541,19 +539,15 @@ continue # New molecule indices based on currently loaded data. - mol_init_index += mol_num - mol_end_index += mol_num mol_cur_index = None if cur_chain_id: - mol_cur_index = self._pdb_chain_id_to_mol_index(cur_chain_id) + mol_num + mol_cur_index = self._pdb_chain_id_to_mol_index(cur_chain_id) mol_prev_index = None if prev_chain_id: - mol_prev_index = self._pdb_chain_id_to_mol_index(prev_chain_id) + mol_num + mol_prev_index = self._pdb_chain_id_to_mol_index(prev_chain_id) # Store the data. - if not hasattr(self, 'sheets'): - self.sheets = [] - self.sheets.append([strand, sheet_id, num_strands, init_res_name, mol_init_index, init_seq_num, init_icode, end_res_name, mol_end_index, end_seq_num, end_icode, sense, cur_atom, cur_res_name, mol_cur_index, cur_res_seq, cur_icode, prev_atom, prev_res_name, mol_prev_index, prev_res_seq, prev_icode]) + sheets.append([strand, sheet_id, num_strands, init_res_name, mol_init_index, init_seq_num, init_icode, end_res_name, mol_end_index, end_seq_num, end_icode, sense, cur_atom, cur_res_name, mol_cur_index, cur_res_seq, cur_icode, prev_atom, prev_res_name, mol_prev_index, prev_res_seq, prev_icode]) # Return the remaining lines. return lines[i:] @@ -1202,7 +1196,7 @@ mol.atom_add(atom_name=atom_names[i], res_name=res_names[i], res_num=res_nums[i], pos=coord[i], element=elements[i]) # Create the structural data data structures. - self.pack_structs([[mol]], orig_model_num=[None], set_model_num=[set_model_num], orig_mol_num=[None], set_mol_name=[set_mol_name]) + self.pack_structs([[mol]], orig_model_num=[None], set_model_num=[set_model_num], orig_mol_num=[[None]], set_mol_name=[set_mol_name]) def add_model(self, model=None, coords_from=None): @@ -1979,7 +1973,7 @@ mol.fill_object_from_gaussian(model_records) # Create the structural data data structures. - self.pack_structs([[mol]], orig_model_num=[1], set_model_num=set_model_num, orig_mol_num=[0], set_mol_name=set_mol_name, file_name=file_name, file_path=path, file_path_abs=path_abs, verbosity=verbosity) + self.pack_structs([[mol]], orig_model_num=[1], set_model_num=set_model_num, orig_mol_num=[[0]], set_mol_name=set_mol_name, file_name=file_name, file_path=path, file_path_abs=path_abs, verbosity=verbosity) # Loading worked. return True @@ -2049,11 +2043,15 @@ # Pre-process the lines, fixing PDB violations. pdb_lines = self._validate_records(pdb_lines) + # Secondary structure data. + helices = [] + sheets = [] + # Process the different sections. pdb_lines = self._parse_pdb_title(pdb_lines) pdb_lines = self._parse_pdb_prim_struct(pdb_lines) pdb_lines = self._parse_pdb_hetrogen(pdb_lines) - pdb_lines = self._parse_pdb_ss(pdb_lines, read_mol=read_mol) + pdb_lines = self._parse_pdb_ss(pdb_lines, read_mol=read_mol, helices=helices, sheets=sheets) pdb_lines = self._parse_pdb_connectivity_annotation(pdb_lines) pdb_lines = self._parse_pdb_misc(pdb_lines) pdb_lines = self._parse_pdb_transform(pdb_lines) @@ -2062,6 +2060,7 @@ model_index = 0 orig_model_num = [] mol_conts = [] + orig_mol_num = [] for model_num, model_records in self._parse_pdb_coord(pdb_lines): # Only load the desired model. if read_model and model_num not in read_model: @@ -2072,8 +2071,8 @@ # Loop over the molecules of the model. mol_conts.append([]) + orig_mol_num.append([]) mol_index = 0 - orig_mol_num = [] new_mol_name = [] for mol_num, mol_records in self._parse_mols_pdb(model_records): # Only load the desired model. @@ -2100,9 +2099,6 @@ # Set the name to the file name plus the structure number. new_mol_name.append(file_root(file) + '_mol' + repr(mol_num+num_struct)) - # Store the original mol number. - orig_mol_num.append(mol_num) - # Generate the molecule container. mol = MolContainer() @@ -2111,6 +2107,9 @@ # Store the molecule container. mol_conts[model_index].append(mol) + + # Store the original molecule number. + orig_mol_num[model_index].append(mol_num) # Increment the molecule index. mol_index = mol_index + 1 @@ -2124,7 +2123,7 @@ return False # Create the structural data data structures. - self.pack_structs(mol_conts, orig_model_num=orig_model_num, set_model_num=set_model_num, orig_mol_num=orig_mol_num, set_mol_name=new_mol_name, file_name=file, file_path=path, file_path_abs=path_abs, merge=merge, verbosity=verbosity) + self.pack_structs(mol_conts, orig_model_num=orig_model_num, set_model_num=set_model_num, orig_mol_num=orig_mol_num, set_mol_name=new_mol_name, file_name=file, file_path=path, file_path_abs=path_abs, merge=merge, helices=helices, sheets=sheets, verbosity=verbosity) # Loading worked. return True @@ -2225,7 +2224,7 @@ model_index = model_index + 1 # Create the structural data data structures. - orig_mol_num = [0] + orig_mol_num = [[0]] self.pack_structs(mol_conts, orig_model_num=orig_model_num, set_model_num=set_model_num, orig_mol_num=orig_mol_num, set_mol_name=new_mol_name, file_name=file, file_path=path, file_path_abs=path_abs, verbosity=verbosity) # Loading worked. @@ -2392,17 +2391,17 @@ return codes - def pack_structs(self, data_matrix, orig_model_num=None, set_model_num=None, orig_mol_num=None, set_mol_name=None, file_name=None, file_path=None, file_path_abs=None, verbosity=1, merge=False): + def pack_structs(self, data_matrix, orig_model_num=None, set_model_num=None, orig_mol_num=None, set_mol_name=None, file_name=None, file_path=None, file_path_abs=None, helices=None, sheets=None, verbosity=1, merge=False): """From the given structural data, expand the structural data data structure. - @param data_matrix: A matrix of structural objects. + @param data_matrix: A matrix of structural objects. The first dimension is the models, the second is the molecules. @type data_matrix: list of lists of structural objects @keyword orig_model_num: The original model numbers (for storage). @type orig_model_num: list of int @keyword set_model_num: The new model numbers (for model renumbering). @type set_model_num: list of int - @keyword orig_mol_num: The original molecule numbers (for storage). - @type orig_mol_num: list of int + @keyword orig_mol_num: The original molecule numbers (for storage). The dimensions match the data matrix. + @type orig_mol_num: list of list of int @keyword set_mol_name: The new molecule names. @type set_mol_name: list of str @keyword file_name: The name of the file from which the molecular data has been extracted. @@ -2411,8 +2410,12 @@ @type file_path: None or str @keyword file_path_abs: The absolute path to the file specified by 'file_name'. This is a fallback mechanism in case results or save files are located somewhere other than the working directory. @type file_path_abs: None or str - @keyword verbosity: The amount of information to print to screen. Zero corresponds to minimal output while higher values increase the amount of output. The default value is 1. - @type verbosity: int + @keyword helices: The helix secondary structure information to store. The first dimension corresponds to each helix element. The second dimension consists of the helix_id, mol_init_index, init_res_name, init_seq_num, mol_end_index, end_res_name, end_seq_num, helix_class, and length. + @type helices: list of lists + @keyword sheets: The sheet secondary structure information to store. + @type sheets: list of lists + @keyword verbosity: The amount of information to print to screen. Zero corresponds to minimal output while higher values increase the amount of output. The default value is 1. + @type verbosity: int @keyword merge: A flag which if set to True will try to merge the structure into the currently loaded structures. @type merge: bool """ @@ -2422,8 +2425,9 @@ raise RelaxError("Structural data mismatch, %s original models verses %s in the structural data." % (len(orig_model_num), len(data_matrix))) # Test the number of molecules. - if len(orig_mol_num) != len(data_matrix[0]): - raise RelaxError("Structural data mismatch, %s original molecules verses %s in the structural data." % (len(orig_mol_num), len(data_matrix[0]))) + for i in range(len(data_matrix)): + if len(orig_mol_num[i]) != len(data_matrix[i]): + raise RelaxError("Structural data mismatch, %s original molecules verses %s in the structural data." % (len(orig_mol_num[i]), len(data_matrix[i]))) # Model numbers do not change. if not set_model_num: @@ -2454,7 +2458,12 @@ raise RelaxError("The molecule '%s' of model %s already exists." % (self.structural_data[i].mol[j].mol_name, self.structural_data[i].num)) # Loop over the models. + store_mol_num_orig = [] + store_mol_num_new = [] for i in range(len(set_model_num)): + store_mol_num_orig.append([]) + store_mol_num_new.append([]) + # The model doesn't currently exist. if set_model_num[i] not in self.structural_data.current_models: # Create the model. @@ -2494,14 +2503,18 @@ # The full text. if merge_new: - print("Merging with molecule '%s'%s (from the original molecule number %s%s)." % (set_mol_name[j], new_model_text, orig_mol_num[j], orig_model_text)) + print("Merging with molecule '%s'%s (from the original molecule number %s%s)." % (set_mol_name[j], new_model_text, orig_mol_num[i][j], orig_model_text)) else: - print("Adding molecule '%s'%s (from the original molecule number %s%s)." % (set_mol_name[j], new_model_text, orig_mol_num[j], orig_model_text)) + print("Adding molecule '%s'%s (from the original molecule number %s%s)." % (set_mol_name[j], new_model_text, orig_mol_num[i][j], orig_model_text)) # The index of the new molecule to add or merge. index = len(model.mol) if merge_new: index -= 1 + + # Store the index+1 as the new molecule number, and store the original molecule number. + store_mol_num_new[i].append(index+1) + store_mol_num_orig[i].append(orig_mol_num[i][j]) # Consistency check. if model.num != self.structural_data[0].num and self.structural_data[0].mol[index].mol_name != set_mol_name[j]: @@ -2518,11 +2531,65 @@ mol.file_name = file_name mol.file_path = file_path mol.file_path_abs = file_path_abs - mol.file_mol_num = orig_mol_num[j] + mol.file_mol_num = orig_mol_num[i][j] mol.file_model = orig_model_num[i] # Sort the structural data if a merge occurred. mol._sort() + + # Consistency checks. + for model_index in range(len(store_mol_num_new)): + if len(store_mol_num_new[model_index]) != len(store_mol_num_orig[model_index]): + raise RelaxFault + + # The helix secondary structure information. + if helices != None: + # Initialise the data structure if required. + if not hasattr(self, 'helices'): + self.helices = [] + + # Loop over each helix. + for i in range(len(helices)): + # Append the data. + self.helices.append(helices[i]) + + # The initial molecule index remapping. + mol_init_index = store_mol_num_new[0][store_mol_num_orig[0].index(helices[i][1]+1)] - 1 + self.helices[-1][1] = mol_init_index + + # The end molecule index remapping. + mol_end_index = store_mol_num_new[0][store_mol_num_orig[0].index(helices[i][4]+1)] - 1 + self.helices[-1][4] = mol_end_index + + # The sheet secondary structure information. + if sheets != None: + # Initialise the data structure if required. + if not hasattr(self, 'sheets'): + self.sheets = [] + + # Loop over each sheet. + for i in range(len(sheets)): + # Append the data. + self.sheets.append(sheets[i]) + + # The initial molecule index remappings. + mol_init_index = store_mol_num_new[0][store_mol_num_orig[0].index(sheets[i][4]+1)] - 1 + self.sheets[-1][4] = mol_init_index + + + # The end molecule index remapping. + mol_end_index = store_mol_num_new[0][store_mol_num_orig[0].index(sheets[i][8]+1)] - 1 + self.sheets[-1][8] = mol_end_index + + # The current molecule index remapping. + if sheets[i][14] != None: + mol_cur_index = store_mol_num_new[0][store_mol_num_orig[0].index(sheets[i][14]+1)] - 1 + self.sheets[-1][14] = mol_cur_index + + # The previous molecule index remapping. + if sheets[i][19] != None: + mol_prev_index = store_mol_num_new[0][store_mol_num_orig[0].index(sheets[i][19]+1)] - 1 + self.sheets[-1][19] = mol_prev_index def rotate(self, R=None, origin=None, model=None, selection=None):