bmrblib.pystarlib.Text

Source Code for Module bmrblib.pystarlib.Text

"""
Classes for dealing with STAR syntax
"""

__author__    = "$Author: jurgenfd $"
___revision__ = "$Revision: 13 $"
___date__     = "$Date: 2007-08-22 20:59:28 +0200 (Wed, 22 Aug 2007) $"

## Standard modules
import re

"""
Some handy patterns and functions for dealing with text in the STAR syntax.
Some are complicated because in Python the none-greedy pattern matching
gets too recursive and will actually bomb on larger strings. Like the
following code causes a bomb:
re.search( 'a.*?c', 'a' + 99999*'b' + 'c' )
Produces: 'RuntimeError: maximum recursion limit exceeded'
"""

## NOTE: every regular expression below is written as a raw string so that
## sequences such as \s and \S reach the re module untouched instead of being
## treated as (invalid) string escape sequences by the Python compiler.

## Since there are only functions and no classes in this module
## the verbosity may be changed by changing the variable directly.
## I know that's vague but I don't know how to do it yet... todo.
verbosity = 2

## When not sure if text can have a ; at start of line use
## this string prepended to each line.
prepending_string = '[raw] '

## Quote states used by the comment stripper.
FREE    = 0
SINGLE  = 1
DOUBLE  = 2
singleq = "'"
doubleq = '"'
sharp   = '#'
space   = ' '
## Following string will be replacing the eol in a semicolon block where needed
## It may not contain any funny characters and shouldn't have underscores
## because it will make parsing slower. Parentheses, if used, should be of the
## square type.
eol_string = '<eol-string>'
eol_string_length = len(eol_string)
# Redefined below curiously found this bug with code analysis from pydev extensions
# changing the wild import to specific import; that sounds like bad python if it matters.
#pattern_tagtable_loop = re.compile(r"""
#^\s* loop_ \s*                        # Begin of loop
#( ^\s* (?P<tagname>_\S+) \s*\n )+     # Tag names with some spaces
#    (?P<rawtext>.+?)                  # Tag table raw text
#^\s* stop_ \s*\n                      # End of loop
# """, re.DOTALL | re.MULTILINE | re.VERBOSE )

pattern_semicolon_block = re.compile(r"""
    ^;                  # semicolon at begin, any text and then eol
    .+?                 # Raw text for match object but not greedy
    ^;                  # semicolon at begin, that's it
    """, re.DOTALL | re.MULTILINE | re.VERBOSE )

pattern_eol_string = re.compile( eol_string, re.MULTILINE )

## Next pattern tells when search for on ONE tagvalue if it needs quotes
pattern_quotes_needed = re.compile( r'[\s\'\"]|^_|^\#' )

## Next pattern tells when search for on MANY tagvalues if it needs quotes
## The values should be joined by a comma. A value: 'bla,_bla' will be
## mentioned as needing quotes unnecessarily but that's dealt with in the code by further checking
pattern_quotes_needed_2 = re.compile( r'[\s\'\"]|^_|,_|,\#' )

pattern_eoline_etcet = re.compile( r'[\n\r\v\f]' )
# If the quote character is at the end of the word then it is falsely considered to need a
# different quote style; this happens frequently for e.g. H1' and all nucleic acid sugar atoms.
pattern_single_qoute = re.compile( r"'" )
pattern_double_qoute = re.compile( r'"' )

pattern_save_begin    = re.compile( r'save_(\S+)\s+' )
pattern_save_end      = re.compile( r'save_\s*' )
pattern_tagtable_loop = re.compile( r"loop_\s*" )
pattern_tagtable_stop = re.compile( r"stop_\s*" )
# Same thing but not eating all white space chars, just a minimal match
pattern_save_begin_nws = re.compile( r'save_\S' )
# Pattern extended to include matches to "save_" as the last characters in a file.
# in other words; without a end of line.
pattern_save_end_nws = re.compile( r'(?:save_\s)|(?:save_$)' )
#pattern_save_end_nws = re.compile('save_\s')
pattern_tagtable_loop_nws = re.compile( r'loop_\s' )
pattern_tag_name_nws      = re.compile( r'_\S' )
# Same thing but requiring a prefixed white space char:
##pattern_sf_begin_or_end = re.compile('\ssave_')
pattern_tagtable_loop_2 = re.compile( r'\sloop_\s+' )
pattern_tagtable_stop_2 = re.compile( r'\sstop_\s+' )
pattern_tagname_2       = re.compile( r'\s_\S+\s+' )

pattern_tag_name = re.compile(r"""(_\S+) \s+
    """, re.DOTALL | re.MULTILINE | re.VERBOSE )
pattern_tags_loop = re.compile(r"""(?: (_\S+) \s* )+
    """, re.MULTILINE | re.VERBOSE )
pattern_tags_loop_2 = re.compile(r""" (_\S+) \s*
    """, re.MULTILINE | re.VERBOSE )

## Get any number of non-white space characters followed by any white space
pattern_word = re.compile(r"""(\S+)\s*""", re.MULTILINE )

pattern_quoted = re.compile(r"""
    ['"] |              # single or double quote
    (?: ^ ; )           # semicolon at the beginning of a line
    """, re.MULTILINE | re.VERBOSE )

pattern_quoted_2 = re.compile(r"""(?: \b [\'\"] ) | (?: ^ \; )""", re.MULTILINE | re.VERBOSE )

pattern_s_quote = re.compile(r"""\'\s+""", re.MULTILINE )
pattern_d_quote = re.compile(r"""\"\s+""", re.MULTILINE )
pattern_e_semicolon = re.compile( eol_string + r"""\;\s*""", re.MULTILINE ) # Added \n for better parsing Wim 01/11/05

# Set beginning of line BEFORE whitespace - Wim 06/03/2003
#pattern_comment_begin = re.compile (r"""^\s*\#.*\n    # A string starting a line with a sharp
#    """, re.MULTILINE | re.VERBOSE)

pattern_nmrView_compress_empty = re.compile(r""" \{(\s+)\}
    """, re.MULTILINE | re.VERBOSE)
pattern_nmrView_compress_questionmark = re.compile(r""" \{(\s+\?)\}
    """, re.MULTILINE | re.VERBOSE)
# JFD old's
#pattern_comment_middle = re.compile (r"""(^[^;^\n] .*? )   # Any string beginning a line other than with a semicolon
#                                         (\s \# .* $ )     # Any string ending a line and starting with a sharp
#    """, re.MULTILINE | re.VERBOSE)

# Wim's:
#pattern_comment_middle = re.compile (
#    r""" (                                   # start group 1 that will be captured for replay.
#            ^[^;^\n]                         # not a what?
#            (?:                              # start a non-capturing group
#                (                            # start group 2 (capturing?)
#                    [\'][^\']*\#[^\']*[\'] | # get '<text>#<text>'
#                    [\"][^\"]*\#[^\"]*[\"]   # get "<text>#<text>"
#                ) |
#                [^\#.]
#            )*?
#         )
#                        # Any string beginning a line other than with a semicolon and with no quotes in it
#         (\s+\#.*)? $   # the comment to be deleted.
#                        # Any string ending a line and starting with a sharp
#    """, re.MULTILINE | re.VERBOSE)
#    # Hashes in quotes don't count!
#    # (?:[\'\"][^\'^\".]*\#[^\'^\".]*[\'\"]|[^\#.])*? ) expression gets '<text>#<text>' blocks,
#    #    is now built into multiline search, seems to be working... (Wim 11/02)
#    # Changed \s* to \s+ - comments can only start with a ' ' before the '#' (Wim 05/03)
#    # Removed . from [^\'^\".] in regular expression described above: more generic (Wim 05/03)
# doesn't catch"""H# # comment""" see testcomments_strip3a
# doesn't catch"""
#;
#foo # comment
#;"""


"""
Searches for a regular expression in text.
The text may not be STAR quoted and must have semicolon blocks collapsed
such that the semicolon starts at the beginning of the line.
Returns the start position of the match or -1 if it was not found or
None if there was an error.

The function will search the text from given position onwards
and checks the chars preceding (up to the line it's in) for quote style.

WARNINGS:
- Don't call it for a text that has no \n and at least 1 other
    character in it before pos (not fully tested; perhaps possible).
- I have not put in extra checks because of needed speed.
- No requirements set on what follows the pattern.
"""

def pattern_unquoted_find(text, pattern, pos=0):
    """Return the index of the first match of `pattern` that is not quoted.

    text    -- string to search; semicolon blocks are expected to be
               collapsed so that each one sits on a single line starting
               with a semicolon.
    pattern -- compiled regular expression object.
    pos     -- index at which the search starts.

    A candidate match is rejected (and the search resumes one character
    further) when the part of its line that precedes it starts with a
    semicolon, or leaves a single or double quote open.

    Returns the start index of the accepted match, or -1 when no match
    is left.
    """
    while True:
        hit = pattern.search(text, pos)
        if hit is None:
            ## No match at all
            return -1

        pos = hit.start()

        ## A match at the very beginning of the text has nothing before it.
        if pos == 0:
            return 0

        ## The matched text itself starts with an eol.
        if text[pos] == '\n':
            if verbosity >= 9:
                print('Found pattern: [%s] at the beginning of a line' % pattern.pattern)
            return pos

        ## rfind yields -1 when there is no earlier eol, so adding one
        ## gives the start of the text in that case.
        line_start = text.rfind('\n', 0, pos) + 1

        ## An empty prefix is replaced by a dummy blank so the checks below work.
        preceding = text[line_start:pos] or ' '

        # Not the one: the line is a (collapsed) semicolon block.
        if preceding[0] == ';':
            if verbosity > 9:
                print('WARNING: (1) found pattern: [%s] preceded by: [%s]' % (
                    pattern.pattern, preceding ))
            pos += 1
            continue

        ## Track which quote style, if any, is still open just before the match.
        in_single = False
        in_double = False
        for ch in preceding:
            if ch == "'" and not in_double:
                in_single = not in_single
            elif ch == '"' and not in_single:
                in_double = not in_double

        if in_single or in_double:
            if verbosity > 1:
                print('WARNING: (2) found pattern: [%s] preceded by: [%s]' % (
                    pattern.pattern, preceding ))
            # Not the one
            pos += 1
            continue

        return pos


"""
Parse one quoted tag value beginning from position: pos
Return the value and the position of the 'cursor' behind the
value for the first non white space char.
In case of error the position value of None will signal failure.
"""

def tag_value_quoted_parse( text, pos ):
    """Parse one quoted tag value that starts at text[pos].

    text[pos] has to be a double quote, a single quote or a semicolon.
    The closing delimiter is located with pattern_d_quote, pattern_s_quote
    or pattern_e_semicolon respectively; the quote patterns require
    whitespace after the closing quote.

    Returns a tuple (value, next_pos) where next_pos is the end of the
    closing-delimiter match. For semicolon values the eol markers are
    expanded back to eol characters with semicolon_block_expand.
    On failure an error is printed and (None, None) is returned.
    """
    opener = text[ pos ]

    if opener == '"' or opener == "'":
        if opener == '"':
            pattern_close, label = pattern_d_quote, "double quote"
        else:
            pattern_close, label = pattern_s_quote, "single quote"
        match_close = pattern_close.search( text, pos+1 )
        if not match_close:
            # Report the real offset of the unmatched quote.
            print("ERROR: No matching %s char found for %s char at offset:" % (label, label), pos)
            print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ])
            return None, None
        return text[ pos+1:match_close.start() ], match_close.end()

    ## Remove check for speed if you want
    ## This should always be true
    if opener == ";":
        match_e_semicolon = pattern_e_semicolon.search( text, pos+1 )
        if not match_e_semicolon:
            print("ERROR: No matching semicolon found for semicolon char at offset:", pos)
            print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ])
            return None, None
        ## Include the first eol and the eol before the semicolon
        value = text[ pos+1:match_e_semicolon.start()+eol_string_length ]
        ## Expansion relatively cheap here and harmless if unique string as defined in
        ## eol_string is indeed unique
        value = semicolon_block_expand( value )
        return value, match_e_semicolon.end()

    print("ERROR: Position in text:", pos)
    print("""ERROR: should contain a ', ", or a ; but was not found:""")
    print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ])
    return None, None

def tag_value_parse( text, pos ):
    """Read one tag value starting at text[pos].

    This is the slow parsing method that should only be used for free tags.

    When text[pos] opens a quoted value (a quote character, or a semicolon
    at the start of a line) the work is handed to tag_value_quoted_parse.
    Otherwise the run of non-whitespace characters at pos is the value.

    Returns a tuple (value, next_pos) where next_pos is the end of the
    match, i.e. behind any whitespace that follows the value.
    On failure an error is printed and (None, None) is returned.
    """
    # An anchored match only looks at position pos. A search would scan the
    # rest of the text for a later quote that is of no interest here.
    if pattern_quoted.match( text, pos ):
        return tag_value_quoted_parse( text, pos ) # Better speed with this code

    match_word = pattern_word.search( text, pos )
    if not match_word:
        print("ERROR: No match for a 'word' at offset:", pos)
        print("ERROR: Next 70 chars are:", text[ pos:pos+70 ])
        return None, None
    if match_word.start() != pos:
        print("ERROR: Match for a 'word' at wrong offset:", match_word.start() - pos)
        print("ERROR: Next 70 chars are:", text[ pos:pos+70 ])
        return None, None

    return match_word.group(1), match_word.end()

def semicolon_block_replace( matchobj ):
    """Return the text matched by `matchobj` with each eol replaced by eol_string.

    See function semicolon_block_collapse that calls this one.
    """
    matched_block = matchobj.group()
    return re.sub( '\n', eol_string, matched_block )

# A semicolon that is the first character of a line.
_pattern_semicolon_at_line_start = re.compile(r"^;", re.MULTILINE)


def semicolon_block_collapse( text ):
    """Put every semicolon delimited value on a single line.

    This function should be called (not semicolon_block_replace).
    Each block runs from a semicolon at the start of a line up to and
    including the next semicolon at the start of a line. The eol characters
    inside it are replaced by the unique key eol_string, which is removed
    again later on by the sibling function semicolon_block_expand.

    The text is searched in place with a start offset rather than through
    slices of it, so only a semicolon at a real line start counts as a
    delimiter. The result is assembled from pieces and joined once.

    Returns the collapsed text.
    Raises AttributeError (after printing the offending text) when an
    opening semicolon has no closing semicolon.
    """
    find_semicolon = _pattern_semicolon_at_line_start.search
    pieces = []
    count = 0
    cursor = 0  # everything before this index has been copied to pieces

    while True:
        semicolon_start = find_semicolon( text, cursor )
        if semicolon_start is None:
            break
        startpos = semicolon_start.start()

        semicolon_end = find_semicolon( text, startpos + 1 )
        try:
            # One past the closing semicolon.
            endpos = semicolon_end.start() + 1
        except AttributeError:
            # semicolon_end is None: the block is never closed.
            print("ERROR in semicolon_block_collapse for text starting at: ["+ text[startpos:startpos+100]+ "]")
            raise

        pieces.append( text[cursor:startpos] )
        pieces.append( text[startpos:endpos].replace( "\n", eol_string ) )
        cursor = endpos
        count += 1

    pieces.append( text[cursor:] )

    if verbosity >= 9:
        print('Done [%s] subs with semicolon blocks' % count)
    return ''.join( pieces )

376

def semicolon_block_expand( text ):
    """Return `text` with every occurrence of eol_string replaced by an eol."""
    expanded = pattern_eol_string.sub( '\n', text )
    return expanded


"""
Adds semicolons, single quotes or double quotes depending on
need according to star syntax.
Does not assume that no quotes exist initially and will strip them if
present in pairs only.

If the possible_bad_char parameter is set (to 1 or higher) then
strings that would normally end up in a semicolon delimited blob will
have a string inserted at the beginning to it. The string can be the 'p'
argument to this function. [TODO]
"""

def quotes_add( text ):
    """Return `text` wrapped in semicolons, single quotes or double quotes.

    The delimiter is chosen from the content of the value:
    - a newline, carriage return, vertical tab or form feed in the text,
      or both quote styles in the text: semicolon block (semicolons_add);
    - only single quotes in the text: wrapped in double quotes;
    - only double quotes in the text: wrapped in single quotes;
    - anything else: wrapped in the preferred (double) quote.

    Existing surrounding quotes are not stripped and the value is always
    delimited, whether or not it strictly needs to be.
    """
    preferred_quote = '"' # This info should be in a more central spot

    # Plain membership tests are enough for single characters.
    if any( ch in text for ch in '\n\r\v\f' ):
        return semicolons_add( text )

    has_single_quote = "'" in text
    has_double_quote = '"' in text

    if has_single_quote and has_double_quote:
        return semicolons_add( text )

    if has_single_quote:
        return '"' + text + '"'
    if has_double_quote:
        return "'" + text + "'"

    ## Space other than end of line, or # sign etc.
    return preferred_quote + text + preferred_quote

def quotes_strip( text ):
    """Strips quotes in pairs and returns new/old string.

    When the first and the last character of `text` are the same quote
    character (single or double) the text between them is returned.
    Otherwise `text` is returned unchanged; that includes texts of at most
    one character, which can not hold a pair.
    """
    if len(text) > 1 and text[0] == text[-1] and text[0] in ( "'", '"' ):
        return text[1:-1]
    return text

def semicolons_add( text, possible_bad_char=None ):
    """Return `text` as a semicolon delimited block.

    When possible_bad_char is true, prepending_string is inserted at the
    beginning of every line of the text first.
    The value is always ended by an eol, otherwise the second semicolon
    can not be the first char on a line.
    """
    if possible_bad_char:
        text = ''.join( prepending_string + piece + '\n'
                        for piece in text.split('\n') )
    #JFD updates 5/23/2006; apparently the text does not always end with an eol.
    body = text if text.endswith('\n') else text + '\n'
    return "\n;\n" + body + ";\n"

def comments_strip( text ):
    """Strip the STAR comments (new style) from `text` and return the result.

    The text is handled line by line. Lines belonging to a semicolon block
    (from a line starting with a semicolon up to and including the next
    line starting with a semicolon) are left untouched. Every other
    non-empty line is passed through _comments_strip_line.

    A semicolon block that is never closed extends to the end of the text.
    """
    lines = text.split( "\n" )
    total = len(lines)
    count = 0
    i = 0
    while i < total:
        line = lines[i]
        if not line:
            i += 1
            continue
        if line[0] == ';': # start a semicolon block
            # Scan past the block; stop on its closing line or at the end
            # of the text, whichever comes first.
            i += 1
            while i < total and not lines[i].startswith(';'):
                i += 1
            # end a semicolon block
        else:
            stripped = _comments_strip_line( line )
            if len(stripped) != len(line):
                lines[i] = stripped
                count += 1
        i += 1

    if verbosity >= 9:
        print('Done [%s] comment subs' % count)
    return "\n".join( lines )

def _comments_strip_line( line ):
    """Strip the STAR comment from a single line and return the rest.

    A comment starts at a sharp that is seen outside quotes and is either
    the first character of the line or preceded by whitespace. A quoted
    stretch opens on a quote at the start of the line or behind whitespace
    and closes on the same quote character followed by whitespace.
    The line is returned unchanged when no comment is found.
    """
    last = len(line) - 1
    state = FREE # like to start out free which is possible after donning semicolon blocks.
    for idx, ch in enumerate(line):
        at_token_start = idx == 0 or line[idx-1].isspace()

        # A sharp in FREE state behind a space or at beginning of a line.
        if ch == sharp and state == FREE and at_token_start:
            return line[:idx]
        # The last character can not change anything any more.
        if idx == last:
            return line

        if ch == doubleq:
            quoted_state = DOUBLE
        elif ch == singleq:
            quoted_state = SINGLE
        else:
            continue

        if state == FREE:
            if at_token_start:
                state = quoted_state
        elif state == quoted_state and line[idx+1].isspace(): # guaranteed to exist now.
            state = FREE
    return line

530 531 #def comments_stripOld( text ): 532 # # split for profiling 533 # text = _comments_strip1(text) 534 # text = _comments_strip2(text) 535 # return text 536 # 537 #def _comments_strip1( text ): 538 # text, count = pattern_comment_begin.subn( '', text ) 539 # if verbosity >= 9: 540 # print 'Done [%s] subs with comment at beginning of line' % count 541 # return text 542 # 543 #def _comments_strip2( text ): 544 # text, count = pattern_comment_middle.subn( '\g<1>', text ) 545 # if verbosity >= 9: 546 # print 'Done [%s] subs with comment not at beginning of line' % count 547 # return text 548

def nmrView_compress( text ):
    """Compress nmrView tags that hold only whitespace, or whitespace and a
    question mark, between curly brackets.

    Matches of pattern_nmrView_compress_empty become '{}' and matches of
    pattern_nmrView_compress_questionmark become '{?}'. The number of
    replacements of each kind is printed. Returns the compressed text.
    """
    passes = (
        ( pattern_nmrView_compress_empty, '{}',
          'Compressed [%s] nmrView empty { } tags' ),
        ( pattern_nmrView_compress_questionmark, '{?}',
          'Compressed [%s] nmrView question mark { ?} tags' ),
    )
    for pattern, replacement, report in passes:
        text, count = pattern.subn( replacement, text )
        print(report % count)

    return text

558