| Trees | Indices | Help | 
 | 
|---|
|  | 
  1  """ 
  2  Classes for dealing with STAR syntax 
  3  """ 
  4   
  5  __author__    = "$Author: jurgenfd $" 
  6  ___revision__ = "$Revision: 13 $" 
  7  ___date__     = "$Date: 2007-08-22 20:59:28 +0200 (Wed, 22 Aug 2007) $" 
  8   
  9  ## Standard modules 
 10  import re 
 11   
 12  """ 
 13  Some handy patterns and functions for dealing with text in the STAR syntax. 
 14  Some are complicated because in Python the non-greedy pattern matching 
 15  gets too recursive and will actually bomb on larger strings. Like the 
 16  following code causes a bomb: 
 17  re.search( 'a.*?c', 'a' + 99999*'b' + 'c' ) 
 18  Produces: 'RuntimeError: maximum recursion limit exceeded' 
 19  """ 
 20   
 21  ## Since there are only functions and no classes in this module 
 22  ## the verbosity may be changed by changing the variable directly. 
 23  ## I know that's vague but I don't know how to do it yet... todo. 
 24  verbosity           = 2 
 25   
 26  ## When not sure if text can have a ; at start of line use 
 27  ## this string prepended to each line. 
 28  prepending_string   = '[raw] ' 
 29   
 30  FREE = 0 
 31  SINGLE = 1 
 32  DOUBLE = 2 
 33  singleq = "'" 
 34  doubleq = '"' 
 35  sharp   = '#' 
 36  space   = ' ' 
 37  ## Following string will be replacing the eol in a semicolon block where needed 
 38  ## It may not contain any funny characters and shouldn't have underscores 
 39  ## because it will make parsing slower. Parentheses, if used, should be of the 
 40  ## square type. 
 41  eol_string = '<eol-string>' 
 42  eol_string_length = len(eol_string) 
 43  # Redefined below curiously found this bug with code analysis from pydev extensions 
 44  # changing the wild import to specific import; that sounds like bad python if it matters. 
 45  #pattern_tagtable_loop = re.compile(r""" 
 46  #^\s*  loop_  \s*                                  # Begin of loop 
 47  #(   ^\s*     (?P<tagname>_\S+) \s*\n  )+          # Tag names with some spaces 
 48  #       (?P<rawtext>.+?)                           # Tag table raw text 
 49  #^\s*  stop_  \s*\n                                # End of loop 
 50  #     """, re.DOTALL | re.MULTILINE | re.VERBOSE ) 
 51   
 52  pattern_semicolon_block = re.compile(r""" 
 53      ^;                                          # semicolon at begin, any text and then eol 
 54      .+?                                         # Raw text for match object but not greedy 
 55      ^;                                          # semicolon at begin, that's it 
 56       """, re.DOTALL | re.MULTILINE | re.VERBOSE ) 
 57   
 58  pattern_eol_string     = re.compile( eol_string, re.MULTILINE ) 
 59   
 60  ## Next pattern tells, when searched on ONE tag value, whether that value needs quotes 
 61  pattern_quotes_needed  = re.compile( r'[\s\'\"]|^_|^\#' )  
 62   
 63  ## Next pattern tells, when searched on MANY tag values at once, whether any of them needs quotes 
 64  ## The values should be joined by a comma. A value: 'bla,_bla' will be 
 65  ## mentioned as needing quotes unnecessarily but that's dealt with in the code by further checking 
 66  pattern_quotes_needed_2= re.compile( r'[\s\'\"]|^_|,_|,\#' )  
 67   
 68  pattern_eoline_etcet   = re.compile( r'[\n\r\v\f]' ) 
 69  # If the quote character is at the end of the word then it is falsely considered to need a  
 70  # different quote style; this happens frequently for e.g. H1' and all nucleic acid sugar atoms. 
 71  pattern_single_qoute   = re.compile( r"'" ) 
 72  pattern_double_qoute   = re.compile( r'"' ) 
 73   
 74  pattern_save_begin      = re.compile('save_(\S+)\s+') 
 75  pattern_save_end        = re.compile('save_\s*') 
 76  pattern_tagtable_loop   = re.compile("loop_\s*" ) 
 77  pattern_tagtable_stop   = re.compile("stop_\s*" ) 
 78  # Same thing but not eating all white space chars, just a minimal match 
 79  pattern_save_begin_nws      = re.compile('save_\S') 
 80  # Pattern extended to include matches to "save_" as the last characters in a file. 
 81  # in other words; without a end of line. 
 82  pattern_save_end_nws        = re.compile('(?:save_\s)|(?:save_$)') 
 83  #pattern_save_end_nws        = re.compile('save_\s') 
 84  pattern_tagtable_loop_nws   = re.compile('loop_\s') 
 85  pattern_tag_name_nws        = re.compile('_\S') 
 86  # Same thing but requiring a prefixed white space char: 
 87  ##pattern_sf_begin_or_end = re.compile('\ssave_') 
 88  pattern_tagtable_loop_2 = re.compile('\sloop_\s+' ) 
 89  pattern_tagtable_stop_2 = re.compile('\sstop_\s+' ) 
 90  pattern_tagname_2       = re.compile('\s_\S+\s+' ) 
 91   
 92  pattern_tag_name = re.compile(r"""(_\S+) \s+ 
 93       """, re.DOTALL | re.MULTILINE | re.VERBOSE ) 
 94  pattern_tags_loop       = re.compile(r"""(?: (_\S+) \s* )+ 
 95       """, re.MULTILINE | re.VERBOSE ) 
 96  pattern_tags_loop_2     = re.compile(r"""    (_\S+) \s* 
 97       """, re.MULTILINE | re.VERBOSE ) 
 98   
 99  ## Get any number of non-white space characters followed by any white space 
100  pattern_word            = re.compile(r"""(\S+)\s*""", re.MULTILINE ) 
101   
102  pattern_quoted = re.compile(r""" 
103          ['"] |                          # single or double quote 
104      (?: ^ ; )                           # semicolon at the beginning of a line 
105       """, re.MULTILINE | re.VERBOSE ) 
106   
107  pattern_quoted_2 = re.compile(r"""(?: \b [\'\"] ) | (?: ^  \;     )""", re.MULTILINE | re.VERBOSE ) 
108   
109  pattern_s_quote        = re.compile(r"""\'\s+""", re.MULTILINE ) 
110  pattern_d_quote        = re.compile(r"""\"\s+""", re.MULTILINE ) 
111  pattern_e_semicolon    = re.compile( eol_string + r"""\;\s*""", re.MULTILINE ) # Added \n for better parsing Wim 01/11/05 
112   
113  # Set beginning of line BEFORE whitespace - Wim 06/03/2003 
114  #pattern_comment_begin  = re.compile (r"""^\s*\#.*\n           # A string starting a line with a sharp 
115  #                                   """, re.MULTILINE | re.VERBOSE) 
116                      
117  pattern_nmrView_compress_empty = re.compile(r""" \{(\s+)\} 
118                                               """, re.MULTILINE | re.VERBOSE) 
119  pattern_nmrView_compress_questionmark = re.compile(r""" \{(\s+\?)\} 
120                                                      """, re.MULTILINE | re.VERBOSE) 
121  # JFD old's 
122  #pattern_comment_middle = re.compile (r"""(^[^;^\n] .*? )   # Any string beginning a line other than with a semicolon 
123  #                                         (\s \#  .* $  )   # Any string ending a line and starting with a sharp 
124  #                                   """, re.MULTILINE | re.VERBOSE) 
125   
126  # Wim's:                    
127  #pattern_comment_middle = re.compile ( 
128  #     r""" (                                             # start group 1 that will be captured for replay. 
129  #             ^[^;^\n]                                   # not a what? 
130  #             (?:                                        # start a non-capturing group 
131  #                 (                                      # start group 2 (capturing?) 
132  #                  [\'][^\']*\#[^\']*[\'] |              # get '<text>#<text>'  
133  #                  [\"][^\"]*\#[^\"]*[\"]                # get "<text>#<text>" 
134  #                 ) |                 
135  #                 [^\#.]                                  
136  #             )*?  
137  #           )   
138  #          # Any string beginning a line other than with a semicolon and with no quotes in it 
139  #          (\s+\#.*)?    $                                                         # the comment to be deleted. 
140  #          # Any string ending a line and starting with a sharp 
141  #   """, re.MULTILINE | re.VERBOSE) 
142  #    # Hashes in quotes don't count! 
143  #    # (?:[\'\"][^\'^\".]*\#[^\'^\".]*[\'\"]|[^\#.])*? ) expression gets '<text>#<text>' blocks, 
144  #    # is now built into multiline search, seems to be working... (Wim 11/02) 
145  #    # Changed \s* to \s+ - comments can only start with a ' ' before the '#' (Wim 05/03) 
146  #    # Removed . from [^\'^\".] in regular expression described above: more generic (Wim 05/03) 
147  # doesn't catch"""H# # comment""" see testcomments_strip3a 
148  # doesn't catch""" 
149  #; 
150  #foo # comment 
151  #;""" 
152   
153   
154  """ 
155  Searches for a regular expression in text. 
156  The text may not be STAR quoted and must have semicolon blocks collapsed 
157  such that the semicolon starts at the beginning of the line. 
158  Returns the start position of the match or -1 if it was not found or 
159  None if there was an error. 
160   
161  The function will search the text from given position onwards 
162  and checks the chars preceding (up to the line it's in) for quote style. 
163   
164  WARNINGS: 
165  - Don't call it for a text that has no \n and at least 1 other 
166  character in it before pos (not fully tested; perhaps possible). 
167  - I have not put in extra checks because of needed speed. 
168  - No requirements set on what follows the pattern. 
169  """ 
170   
172      while True: 
173          match = pattern.search( text, pos) 
174          if not match: 
175              ## No match at all 
176              return -1 
177           
178          pos = match.start() 
179   
180          ## Is it the beginning of the string 
181          if pos == 0: 
182              return 0 
183   
184          ## Is the first character matched an eol it self 
185          if text[pos]=='\n': 
186              if verbosity >= 9: 
187                  print('Found pattern: [%s] at the beginning of a line' % pattern.pattern) 
188              return pos 
189               
190          ## I hope the rfind is optimized to stroll backwards from pos 
191          pos_end_of_previous_line = text.rfind('\n', 0, pos) 
192          if pos_end_of_previous_line == -1: 
193              pos_end_of_previous_line = -1 ## Dangerous rewind? 
194           
195          line = text[pos_end_of_previous_line+1:pos] 
196          # Some dummy value but continue with the test below. 
197          if line == '': 
198              line = ' ' 
199               
200          # Not the one 
201          if line[0] == ';':  
202              if verbosity > 9: 
203                  print('WARNING: (1) found pattern: [%s] preceded by: [%s]' % ( 
204                      pattern.pattern, line )) 
205              pos = pos + 1 
206              continue 
207   
208          squoted = None 
209          dquoted = None 
210          for i in line: 
211              if      i == "'": 
212                  if not dquoted: 
213                      squoted = not squoted 
214              elif    i == '"': 
215                  if not squoted: 
216                      dquoted = not dquoted 
217          if squoted or dquoted: 
218  ##            if squoted and dquoted: 
219  ##                ## Should not be possible to occur, delete when confident 
220  ##                print "ERROR: code error, mixing of quote styles in line:" 
221  ##                print "ERROR: [%s]" % line 
222  ##                return None 
223              if verbosity > 1: 
224                  print('WARNING: (2) found pattern: [%s] preceded by: [%s]' % ( 
225                      pattern.pattern, line )) 
226   
227              # Not the one 
228              pos = pos + 1  
229              continue 
230   
231          return pos 
232   
233       
234  """ 
235  Parse one quoted tag value beginning from position: pos 
236  Return the value and the position of the 'cursor' behind the 
237  value for the first non white space char. 
238  In case of error the position value of None will signal failure. 
239  """ 
241  #    print 'text: [%s]' % text[pos:pos+80] 
242  #    print 'pos:  [%s]' % pos 
243      if text[ pos ] == '"': 
244          match_d_quote = pattern_d_quote.search( text, pos+1) 
245          if not match_d_quote: 
246              print("ERROR: No matching double quote char found for double quote char at offset:", 0) 
247              print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]) 
248              return None, None 
249      ##            if verbosity >= 9: 
250      ##                print "pos, span():", pos, match_d_quote.span() 
251      ##                print 'Found Q tag value: [%s]' % text[ pos+1:match_d_quote.start() ] 
252          return text[ pos+1:match_d_quote.start() ], match_d_quote.end() 
253   
254      if text[ pos ] == "'": 
255          match_s_quote = pattern_s_quote.search( text, pos+1) 
256          if not match_s_quote: 
257              print("ERROR: No matching single quote char found for single quote char at offset:", 0) 
258              print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]) 
259              return None, None 
260          value = text[ pos+1:match_s_quote.start() ] 
261      ##            if verbosity >= 9: 
262      ##                print "pos, span():", pos, match_s_quote.span() 
263      ##                print 'Found Q tag value: [%s]' % value 
264          return value, match_s_quote.end() 
265   
266      ## Remove check for speed if you want 
267      ## This should always be true 
268      if text[ pos ] == ";": 
269          match_e_semicolon = pattern_e_semicolon.search( text, pos+1) 
270          if not match_e_semicolon: 
271              print("ERROR: No matching semicolon found for semicolon char at offset:", 0) 
272              print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]) 
273              return None, None 
274      ##            print "pos, span():", pos, match_e_semicolon.span() 
275          ## Include the first eol and the eol before the semicolon 
276          value = text[ pos+1:match_e_semicolon.start()+eol_string_length ] 
277          ## Expansion relatively cheap here and harmless if unique string as defined in 
278          ## eol_string is indeed unique 
279          ## print 'Found Q (semicolon) tag value: unexpanded [%s]' % value 
280          ## print '-----------' 
281          ## print text[ match_e_semicolon.start()+eol_string_length : match_e_semicolon.start()+eol_string_length + 20] 
282          ## print '-----------' 
283          value = semicolon_block_expand( value ) 
284          ## print 'Found Q (semicolon) tag value: expanded   [%s]' % value 
285           
286          return value, match_e_semicolon.end()  
287   
288      print("ERROR: Position in text:", pos) 
289      print("""ERROR: should contain a ', ", or a ; but was not found:""") 
290      print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]) 
291      return None, None 
292   
293   
294  """ 
295  From text on position pos, read a tag value and return the value and 
296  position of the next non-space char. This is the slow parsing method 
297  that should only be used for free tags. 
298  """ 
300   
301      match_quoted = pattern_quoted.search( text, pos ) 
302      if match_quoted:       
303          if match_quoted.start() == pos: 
304               
305              return tag_value_quoted_parse( text, pos ) # Better speed with this code 
306                   
307      match_word = pattern_word.search( text, pos ) 
308      if not match_word: 
309          print("ERROR: No match for a 'word' at offset:", pos) 
310          print("ERROR: Next 70 chars are:", text[ pos:pos+70 ]) 
311          return None, None 
312      if match_word.start() != pos: 
313          print("ERROR: Match for a 'word' at wrong offset:", match_word.start() - pos) 
314          print("ERROR: Next 70 chars are:", text[ pos:pos+70 ]) 
315          return None, None 
316   
317      ## Include the first eol and the eol before the semicolon 
318      return  match_word.group(1), match_word.end() 
319   
320   
321   
322  """ 
323  See function semicolon_block_collapse that calls this one 
324  """ 
328   
329   
330  """ 
331  This function should be called (not semicolon_block_replace) 
332  Putting all semicolon separated values on one line 
333  by replacing the eol within with a unique key value 
334  that is to be remove later on by it's sibling method: 
335  semicolon_block_expand. 
336  SPEED:  0.6 cpu seconds for a 5 Mb file with 31 blocks and 
337          1.3 "                10 "            64 ". 
338  """ 
340       
341      count = 0 
342      startpos = 0 
343       
344      # TODO: this is not good - since text[startpos:] is used it's always the start of a line, so if string starts with ;... 
345      pattern_semicolon_only = re.compile("^\;", re.MULTILINE) 
346      pattern_semicolon_only_end = re.compile("(^\;\s*)", re.MULTILINE) 
347       
348      semicolon_start = pattern_semicolon_only.search(text[startpos:]) 
349       
350      while(semicolon_start): 
351   
352        count += 1 
353         
354        startpos = startpos + semicolon_start.start() 
355        semicolon_end = pattern_semicolon_only_end.search(text[startpos+1:]) 
356        try: 
357          endpos = startpos + 1 + semicolon_end.end() - len(semicolon_end.group(1)) + 1 
358        except: 
359          print("ERROR in semicolon_block_collapse for text starting at: ["+ text[startpos:startpos+100]+ "]")             
360          raise 
361       
362        text_replace = re.sub("\n", eol_string, text[startpos:endpos]) 
363   
364        # This is bulky and not very elegant but works 
365        text= text[0:startpos] + text_replace + text[endpos:] 
366       
367        startpos = startpos + len(text_replace) 
368       
369        semicolon_start = pattern_semicolon_only.search(text[startpos:]) 
370      
371      # Original code: can't handle re matches that are too long 
372      #text, count = pattern_semicolon_block.subn( semicolon_block_replace, text ) 
373      if verbosity >= 9: 
374          print('Done [%s] subs with semicolon blocks' % count) 
375      return text 
376   
378      return pattern_eol_string.sub('\n', text ) 
379   
380  """ 
381  Adds semicolons, single quotes or double quotes depending on 
382  need according to star syntax. 
383  Does not assume that no quotes exist initially and will strip them if 
384  present in pairs only. 
385   
386  If the possible_bad_char parameter is set (to 1 or higher) then 
387  strings that would normally end up in a semicolon delimited blob will 
388  have a string inserted at the beginning to it. The string can be the 'p' 
389  argument to this function. [TODO] 
390  """ 
392   
393      preferred_quote='"' # This info should be in a more central spot 
394       
395      if pattern_eoline_etcet.search( text ): 
396          return semicolons_add( text ) 
397   
398      if pattern_single_qoute.search( text ): 
399          single_qoute_match = 1 
400      else: 
401          single_qoute_match = 0 
402   
403      if pattern_double_qoute.search( text ): 
404          double_qoute_match = 1 
405      else: 
406          double_qoute_match = 0 
407   
408      if single_qoute_match and double_qoute_match: 
409          return semicolons_add( text ) 
410   
411      if single_qoute_match: 
412          return '"' + text + '"' 
413      # Commented out because it leads to the same behaviour 
414      if double_qoute_match: 
415          return "'" + text + "'" 
416   
417      ## Space other than end of line, or # sign etc. 
418      return preferred_quote + text + preferred_quote 
419   
420   
421  "Strips quotes in pairs and returns new/old string" 
423   
424      ## Can it be containing quotes? 
425      if len(text) <= 1: 
426          return text 
427      for quote_symbol in [ "\'", '\"' ]: 
428          if ( text[0]  == quote_symbol and 
429               text[-1] == quote_symbol ): 
430              return text[1:-1] 
431      return text 
432   
433   
434  """ 
435  Returns the input with ; delimited, possibly with a string inserted at the beginning. 
436  The string value should always be ended by a eol, otherwise 
437  the second semicolon can not be the first char on a line. 
438  """ 
440      if possible_bad_char: 
441          lines       = text.split('\n') 
442          text   = '' 
443          for line in lines: 
444              text = text + prepending_string + line + '\n' 
445  ##        ## Code repeated for speed 
446  ##        return "\n;" + text + ";\n" 
447  ##    else: 
448  ##        return "\n;" + text + ";\n" 
449  #JFD updates 5/23/2006; apparently the text does not always end with an eol. 
450      if not text.endswith('\n'): 
451         text = text + '\n' 
452      return "\n;\n" + text + ";\n" 
453   
454  """ 
455  Strip the STAR comments new style 
456  """ 
def comments_strip( text ):
    """
    Strip the STAR comments from a multi-line text.

    The text is split on the newline character and handled line by line:
    - empty lines are left alone;
    - a line starting with ';' opens a semicolon block; that line and every
      following line up to and including the next line starting with ';'
      are skipped untouched;
    - every other line is handed to _comments_strip_line and replaced by
      the result when its length changed.

    A semicolon block that is never closed is taken to run to the end of
    the text, so nothing in it is stripped. (This used to raise an
    IndexError.)

    Returns the text with the lines joined by the newline character again.
    When verbosity is 9 or higher the number of changed lines is printed.
    """
    lines = text.split("\n")
    line_count = len(lines)
    count = 0
    i = 0
    while i < line_count:
        line = lines[i]
        if not line:
            i += 1
            continue
        if line[0] == ';':
            ## Scan past the semicolon block: stop on the closing ';' line,
            ## or at the end of the text when the block is never closed.
            i += 1
            while i < line_count and not lines[i].startswith(';'):
                i += 1
        else:
            stripped = _comments_strip_line(line)
            ## A stripped comment always shortens the line.
            if len(stripped) != len(line):
                lines[i] = stripped
                count += 1
        ## Step over the line just handled (or over the closing ';' line).
        i += 1

    if verbosity >= 9:
        print('Done [%s] comment subs' % count)
    return "\n".join(lines)
494   
495  """ 
496  Strip the STAR comments for a single line. 
497  """ 
499      c=0 
500      state = FREE # like to start out free which is possible after donning semicolon blocks. 
501      l = len(line) 
502      while c < l: # parse range [0,n> where n is length and exclusive.         
503          ch=line[c] 
504  #        print "DEBUG: Processing char '"+ch+"' at "+`c`+" in state:", state 
505          if ( ch == sharp and state == FREE and    # A sharp in FREE state 
506                  (c==0 or line[c-1].isspace())):   # behind a space or at beginning of a line. 
507  #            print "DEBUG: Found sharpie" 
508              if c==0: 
509                  return '' 
510              return line[0:c] # this is fast. 
511          if c==l-1: # c is the last character; leave it alone if it's not a sharpie 
512              return line 
513           
514          if ch == doubleq: 
515              if (state == FREE and                # new " behind space or at beginning of line 
516                  (c==0 or line[c-1].isspace())): 
517                  state = DOUBLE 
518              elif state == DOUBLE: 
519                  if line[c+1].isspace(): # garanteed to exist now.                 
520                      state = FREE                 
521          elif ch == singleq: 
522              if (state == FREE and 
523                      (c==0 or line[c-1].isspace())): 
524                  state = SINGLE 
525              elif state == SINGLE: 
526                  if line[c+1].isspace():                     
527                      state = FREE            
528          c += 1 
529      return line 
530   
531  #def comments_stripOld( text ): 
532  #    # split for profiling 
533  #    text = _comments_strip1(text) 
534  #    text = _comments_strip2(text) 
535  #    return text 
536  # 
537  #def _comments_strip1( text ): 
538  #    text, count = pattern_comment_begin.subn( '', text ) 
539  #    if verbosity >= 9: 
540  #        print 'Done [%s] subs with comment at beginning of line' % count 
541  #    return text 
542  # 
543  #def _comments_strip2( text ): 
544  #    text, count = pattern_comment_middle.subn( '\g<1>', text ) 
545  #    if verbosity >= 9: 
546  #        print 'Done [%s] subs with comment not at beginning of line' % count 
547  #    return text 
548       
550   
551      text, count = pattern_nmrView_compress_empty.subn( '{}', text )     
552      print('Compressed [%s] nmrView empty { } tags' % count) 
553   
554      text, count = pattern_nmrView_compress_questionmark.subn( '{?}', text )     
555      print('Compressed [%s] nmrView question mark { ?} tags' % count) 
556       
557      return text 
558   
| Trees | Indices | Help | 
 | 
|---|
| Generated by Epydoc 3.0.1 on Fri Oct 28 15:38:42 2016 | http://epydoc.sourceforge.net |