bmrblib.pystarlib.TagTable

1 """ 2 Classes for dealing with STAR syntax 3 """ 4 from bmrblib.pystarlib.Text import pattern_quotes_needed 5 from bmrblib.pystarlib.Text import quotes_add 6 from bmrblib.pystarlib.Text import pattern_quotes_needed_2 7 from bmrblib.pystarlib.Text import pattern_quoted 8 from bmrblib.pystarlib.Text import tag_value_quoted_parse 9 from bmrblib.pystarlib.Text import pattern_tags_loop 10 from bmrblib.pystarlib.Text import pattern_tags_loop_2 11 from bmrblib.pystarlib.Text import pattern_tagname_2 12 from bmrblib.pystarlib.Text import pattern_tagtable_stop_2 13 from bmrblib.pystarlib.Text import pattern_tagtable_loop_2 14 from bmrblib.pystarlib.Text import pattern_unquoted_find 15 from bmrblib.pystarlib.Text import pattern_tag_name 16 from bmrblib.pystarlib.Text import tag_value_parse 17 from bmrblib.pystarlib.Utils import Lister 18 from bmrblib.pystarlib.Utils import transpose 19 20 import types 21 import re 22 23 24 """ 25 Looped and free tags can not be mixed in same object. 26 """

27 -class TagTable (Lister):

"""
A content has to be given when initializing the class.
If it is not, the class will make something up; it won't
be pretty, but it will follow legal syntax. The usual
case is to call the parse method with some text and a position.
"""

34 - def __init__( self, 35 free = None, 36 title = '', 37 tagnames = None, 38 tagvalues = None, 39 verbosity = 2 40 ):

41 self.free = free 42 self.title = title 43 44 # Modified tagnames, tagvalues initialization so list references 45 # are not carried through (Wim 14/07/2002) 46 47 self.tagnames = tagnames 48 if self.tagnames == None: 49 self.tagnames = [ '_Dummy_tag' ] 50 51 self.tagvalues = tagvalues 52 if self.tagvalues == None: 53 self.tagvalues = [ [None] ] 54 55 self.verbosity = verbosity

56 57 "Returns the STAR text representation"

def star_text(self, flavor='NMR-STAR'):
    """
    Return the STAR text representation of this table.

    flavor -- 'NMR-STAR' (or None) indents loop_/stop_ by 3 spaces,
              'mmCIF' uses no indentation.

    Returns the text as a string. For an unknown flavor an error is
    printed and the integer 1 is returned instead (historical behaviour
    that callers may rely on).
    """
    ## Info herein can be transferred to a STAR reference file too
    if flavor is None or flavor == 'NMR-STAR':
        # Number of spaces before the loop_ tag. 0 in CIF
        loop_ident_size = 3
    elif flavor == 'mmCIF':
        loop_ident_size = 0
    else:
        print('ERROR: Unknown flavor of STAR given', flavor)
        return 1

    # Free tags use the same indentation as the loop_ keyword.
    loop_prefix = loop_ident_size * ' '
    tagname_prefix = (loop_ident_size + 3) * ' '

    ## Free tags here: one "name value" pair per tag
    if self.free:
        pieces = []
        for i, tagname in enumerate(self.tagnames):
            ## Just format it such that it will take the least space
            tagvalue = self.tagvalues[i][0]
            if pattern_quotes_needed.search(tagvalue):
                tagvalue = quotes_add(tagvalue)
            pieces.append(loop_prefix + "%s %s" % (tagname, tagvalue))
            # endswith() instead of tagvalue[-1] so that an empty value
            # does not raise IndexError.
            if not tagvalue.endswith('\n'):
                pieces.append('\n')
        return ''.join(pieces)

    ## Loop tags here: header, tag names, blank line, rows, stop_
    pieces = [loop_prefix + 'loop_\n']
    for tagname in self.tagnames:
        pieces.append(tagname_prefix + '%s\n' % tagname)
    pieces.append('\n')

    col_count = len(self.tagnames)
    row_count = len(self.tagvalues[0])

    ## The values are stored per column; transposing once gives cheap
    ## access to whole rows.
    tagvalues_tr = transpose(self.tagvalues)

    # Show progress hashes while composing text for each count_hash
    # number of values approximately (only printed at verbosity >= 9).
    count_hash = 100000
    count = -1
    values_done = 0

    str_row = []
    for row_id in range(row_count):
        row = tagvalues_tr[row_id]

        ## Are quotes needed? Do it per row first to get some speed:
        ## one search over the comma-joined row decides whether the
        ## values have to be inspected one by one.
        if pattern_quotes_needed_2.search(','.join(row)):
            row_text = ''
            for col_id in range(col_count):
                value = self.tagvalues[col_id][row_id]
                if pattern_quotes_needed.search(value):
                    value = quotes_add(value)
                # The trailing space after every value is kept so the
                # output stays identical to earlier versions.
                row_text += '%s ' % value
        else:
            row_text = ' '.join(row)

        str_row.append(row_text)

        if self.verbosity > 1:
            values_done += col_count
            if values_done - count > count_hash:
                count = values_done
                if self.verbosity >= 9:
                    print('##### %s looped tag values collected ######' % count_hash)

    # The stop tag is always written.
    str_row.append('\n' + loop_prefix + 'stop_\n')
    pieces.append('\n'.join(str_row))

    return ''.join(pieces)

"""
A title identifying a tagtable by its tag names:
simply the space-separated concatenation of the tag names.
"""

def set_title(self):
    """
    Set self.title to a title identifying the tagtable by its tag names:
    the space-separated concatenation of the tag names.
    """
    if self.verbosity >= 9:
        print('Setting title of tagtable')
    # Join with a single space, as documented; joining with the empty
    # string would run the tag names together.
    self.title = ' '.join(self.tagnames)

"""
Size and type checks (to be extended), selected by check_type:
0 Only fast checks
9 Type checks of each element
"""

def check_integrity(self, check_type=0):
    """
    Check that the table is consistent.

    check_type -- 0 runs only the fast size checks; 9 or higher also
                  checks the type of every single value.

    Checks performed:
    - there are as many value columns as tag names;
    - all value columns have the same length;
    - (check_type >= 9) every value is a str.

    Returns 0 when the table is OK and 1 (after printing an error)
    when it is not.
    """
    names_length = len(self.tagnames)
    values_length = len(self.tagvalues)

    if names_length != values_length:
        print("ERROR: names_length[%s] != values_length[%s]:" % (
            names_length, values_length))
        print("ERROR: names:", self.tagnames)
        return 1

    # A table without any column has no rows; guard the [0] lookup so an
    # empty table is reported as consistent instead of raising IndexError.
    column_length_first = len(self.tagvalues[0]) if self.tagvalues else 0

    # Names and columns have equal length here, so they can be paired up.
    for tagname, column in zip(self.tagnames, self.tagvalues):
        if len(column) != column_length_first:
            print("ERROR: length column[%s](%s) is not the same as" % (
                tagname,
                len(column)))
            print("ERROR: length column[%s](%s)" % (
                self.tagnames[0],
                column_length_first))
            return 1

    if check_type >= 9:
        for row_id in range(column_length_first):
            for col_id in range(names_length):
                value = self.tagvalues[col_id][row_id]
                # Values are text. Comparing the type against bytes
                # rejected every str value under Python 3.
                if not isinstance(value, str):
                    print("ERROR: type %s is not allowed as a value in a tagtable" % type(value))
                    print("ERROR: found for tagtable[%s][%s]" % (self.tagnames[col_id], row_id))
                    return 1

    if self.verbosity >= 9:
        print('Checked integrity of TagTable (%2s names %4s values each): OK [%s]' % (
            names_length, column_length_first, self.title))
    return 0

207 208 209 """ 210 - Parses text into a tagtable. 211 - Returns the position in the string with the first non-white space 212 character after the tagtable or the length of the text in case all 213 was parsed. Just to be verbose, if the tagtable is ended by a save_ 214 then the starting position of the save_ will be returned. 215 - Assumption here is that ;; blocks are collapsed, see Text functions 216 - For speed purposes I scan ahead to see how far I can go before 217 hitting a quoted tag value. I estimate in the large tables only 1 in 218 1000 has a ;; block and only 1 in 5-10 has '' or "" block. For the part 219 that is not quoted the parsing can be really fast. 220 """

def parse(self, text='', pos=0):
    """
    Parse text into this tagtable, starting at position pos.

    - Returns the position in the text reached after the tagtable, or
      None (after printing an error) on failure.
    - For a free table (self.free is truthy) all tag name/value pairs
      are read by _tagtable_free_parse and its position is returned.
    - For a looped table the tag names are read first; the values then
      run up to the first unquoted loop_, stop_ or tag name found after
      them, or to the end of the text when none is found. If a stop_
      was found the returned position is past it.
    - Assumption here is that ;; blocks are collapsed, see Text functions.
    """
    ## Parse free tagtable reading all tag name/value pairs
    if self.free:
        pos = self._tagtable_free_parse(text, pos)
        if pos is None:
            print("ERROR: tagtable_free_parse returned with ERROR")
            return None
        # NOTE(review): free tables have never been integrity checked
        # here; the check that followed this return was unreachable and
        # has been dropped to keep the behaviour unchanged.
        return pos

    ## Parse looped tagtable
    # Tag names
    match_tags_loop = pattern_tags_loop.search(text, pos)
    if not match_tags_loop:
        print("ERROR: No tag names found for looped tagtable")
        return None

    ## Do a limited search with findall for tag names; they are appended
    ## to whatever self.tagnames already holds.
    names_end = match_tags_loop.end()
    self.tagnames.extend(pattern_tags_loop_2.findall(text, pos, names_end))
    pos = names_end

    # End of loop
    ## There is no escaping these expensive searches if we can't depend
    ## on a stop sign. The terminator differs per flavor: NMR-STAR uses
    ## stop_, mmCIF ends a loop with the next loop_ or tag name.
    text_length = len(text)
    if pos == text_length:
        print("ERROR: No tag values found for looped tagtable")
        return None

    pos_tagtable_loop = pattern_unquoted_find(text, pattern_tagtable_loop_2, pos)
    pos_tagtable_stop = pattern_unquoted_find(text, pattern_tagtable_stop_2, pos)
    pos_tagname = pattern_unquoted_find(text, pattern_tagname_2, pos)

    ## Find the first one and set the end position to the beginning of
    ## the match excluding the beginning white space character (+ 1).
    ## -1 means not found. The candidates are tried in this fixed order.
    pos_end = text_length
    for candidate in (pos_tagtable_loop, pos_tagtable_stop, pos_tagname):
        if candidate != -1 and candidate < pos_end:
            pos_end = candidate + 1

    if self.verbosity >= 9:
        print('pos_tagtable_loop:', pos_tagtable_loop)
        print('pos_tagtable_stop:', pos_tagtable_stop)
        print('pos_tagname      :', pos_tagname)
        print('Will parse tagtable text to end at position: [%s]' % pos_end)

    # When none of the three terminators is found the values simply run
    # to the end of the text; no warning is issued for that.

    # Tag values
    if self._tagtable_loop_values_parse(text, pos, pos_end):  ## will set title too
        print("ERROR: not parsed table")
        return None
    ## Set the position to the end of this tagtable at the beginning
    ## of a stop_ or a new tagtable
    pos = pos_end

    ## Skip the stop sign and empty space if it was stop_
    # NOTE(review): this skips to the next stop_ whenever one exists
    # anywhere after the values, even if a loop_ or tag name came first;
    # confirm that is intended before changing it.
    if pos_tagtable_stop != -1:
        ## Try a match from the previously found position including
        ## the white space char before it.
        match_tagtable_stop = pattern_tagtable_stop_2.search(text, pos - 1)
        if not match_tagtable_stop:
            print("ERROR: no stop_ on second try")
            return None
        pos = match_tagtable_stop.end()

    if self.check_integrity():
        print("ERROR: integrity of parsed table is not ok")
        return None
    return pos

"""
Parse names and values of a free (non-looped) tagtable from pos.
Returns the new position, which doubles as the status (None for failure).
"""

325 - def _tagtable_free_parse( self, text, pos ):

326 327 text_length = len(text) 328 329 while pos < text_length - 1: 330 if text[pos] != '_': 331 break 332 # Tag name 333 match_tag_name = pattern_tag_name.search(text, pos) 334 if match_tag_name: 335 if ( match_tag_name.start() - pos ) != 0: 336 print("ERROR: looking for a free tag name (0)") 337 return None 338 else: 339 print("ERROR: looking for a free tag name(1)") 340 return None 341 self.tagnames.append( match_tag_name.group(1) ) 342 pos = match_tag_name.end() 343 # Tag value 344 value, pos = tag_value_parse(text, pos) 345 if pos == 0: 346 print("ERROR: looking for a free tag name(1)") 347 return None 348 ## Structures of free and looped tagtable are the same 349 self.tagvalues.append( [ value ] ) 350 if self.verbosity >= 9: 351 print('**Parsed tag name : [%s] and value [%s]: ' % ( 352 match_tag_name.group(1), value)) 353 self.set_title() 354 return pos

"""
Parse values of a looped tagtable from pos to pos_end.
Returns the status (None for success, 1 for failure).
"""

361 - def _tagtable_loop_values_parse( self, text, pos, pos_end):

362 363 if self.free: 364 print("ERROR: This is a 'free' tagtable, only looped tagtable can be parsed") 365 return 1 366 names_length = len(self.tagnames) 367 ## Empty the table 368 self.tagvalues = [] 369 for dummy in range( names_length ): 370 self.tagvalues.append( [] ) 371 372 ## Get rid of initial white space if any, shouldn't be needed 373 match_white_space = re.compile('\s+').search( text, pos, pos_end ) 374 if match_white_space: 375 if match_white_space.start() == 0: # Match has to start at the beginning 376 pos = match_white_space.end() 377 378 tag_id = 0 379 count = 0 # Last number of characters at which a print occured. 380 count_hash = 100000 381 text_length = len(text) 382 383 ## Only process characters to predetermined end (exclusive) 384 while pos < pos_end: 385 if self.verbosity > 2: 386 if pos - count > count_hash: 387 print('DEBUG: ##### %s chars processed ######' % count_hash) 388 count = pos 389 ## 1 char search; ', ", or ; at beginning of line 390 match_quoted = pattern_quoted.search( text, pos, pos_end ) 391 if match_quoted: 392 if match_quoted.start() == pos: # quoted at the beginning 393 ## Quoted at pos 394 value, pos = tag_value_quoted_parse( text, pos ) 395 if pos == None: 396 print('ERROR: got error in parse (1)') 397 return 1 398 if pos > pos_end: 399 print('ERROR: found a quoted value that was not wholly within boundaries (1)') 400 return 1 401 self.tagvalues[ tag_id ].append( value ) 402 tag_id += 1 403 if tag_id == names_length: 404 tag_id = 0 405 else: # quoted but not at the beginning 406 # Wim 25/09/03: Changed following to allow correct parsing of H5'' type names 407 # and "asdfasdf'" type stuff 408 # New positions depend on whether correct quote or not 409 # If not correct quote, reset pos and do 'normal' parse 410 idxstart = match_quoted.start() 411 c = text[idxstart] 412 bc = text[idxstart-1] 413 if (c == "'" or c == '"') and bc != " ": 414 # JFD the next line takes an expensive slice of the pie? 
415 # tempendpos = idxstart + text[idxstart:].find(' ') 416 tempendpos = text.find(' ', idxstart) 417 else: 418 tempendpos = idxstart 419 420 ## Parse all unquoted tag values beginning from position 421 ## UP TO specified end position 422 ## NOT QUOTED 423 for t in text[pos:tempendpos].split(): 424 self.tagvalues[tag_id].append( t ) 425 tag_id += 1 426 if tag_id == names_length: 427 tag_id = 0 428 if tempendpos == match_quoted.start(): 429 ## QUOTED: 430 pos = tempendpos 431 value, pos = tag_value_quoted_parse( text, pos ) 432 if pos == None: 433 print('ERROR: got error in parse (2)') 434 return 1 435 if pos > pos_end: 436 print('ERROR: found a quoted value that was not wholly within boundaries (2)') 437 return 1 438 self.tagvalues[ tag_id ].append( value ) 439 tag_id += 1 440 if tag_id == names_length: 441 tag_id = 0 442 else: 443 pos = tempendpos 444 else: # NOT quoted until end (only executed once) 445 for t in text[pos:pos_end].split(): 446 self.tagvalues[tag_id].append( t ) 447 tag_id += 1 448 if tag_id == names_length: 449 tag_id = 0 450 pos = text_length # Needed to break while loop 451 452 col_length = len( self.tagvalues[-1] ) 453 if tag_id != 0: 454 print("ERROR: not correct number of tag values read") 455 print("Read [%s] tag(s) that is:" \ 456 % ( col_length * names_length + tag_id )) 457 print("[%s] row(s) complete and [%s] tag value(s) in last row that is incomplete." \ 458 % ( col_length, tag_id )) 459 print("Tag names of this table are:") 460 print(self.tagnames) 461 for xxx in range(0, len(self.tagvalues[0])): 462 for yyy in range(0, len(self.tagvalues)): 463 print(self.tagvalues[yyy][xxx]) 464 print('-----------------------------------------------') 465 pos = 0 466 while pos < tag_id: 467 print(self.tagvalues[pos][-1]) 468 pos = pos + 1 469 return 1 470 471 if col_length == 0: 472 print("ERROR: no tag values parsed") 473 return 1 474 475 # Set the title 476 self.set_title() 477 return None

478

Source Code for Module bmrblib.pystarlib.TagTable