| Trees | Indices | Help |
|
|---|
|
|
1 """
2 Classes for dealing with STAR syntax
3 """
4
5 __author__ = "$Author: jurgenfd $"
6 ___revision__ = "$Revision: 13 $"
7 ___date__ = "$Date: 2007-08-22 20:59:28 +0200 (Wed, 22 Aug 2007) $"
8
9 ## Standard modules
10 import re
11
12 """
13 Some handy patterns and functions for dealing with text in the STAR syntax.
14 Some are complicated because in Python the non-greedy pattern matching
15 gets too recursive and will actually bomb on larger strings. Like the
16 following code causes a bomb:
17 re.search( 'a.*?c', 'a' + 99999*'b' + 'c' )
18 Produces: 'RuntimeError: maximum recursion limit exceeded'
19 """
20
21 ## Since there are only functions and no classes in this module
22 ## the verbosity may be changed by changing the variable directly.
23 ## I know that's vague but I don't know how to do it yet... todo.
24 verbosity = 2
25
26 ## When not sure if text can have a ; at start of line use
27 ## this string prepended to each line.
28 prepending_string = '[raw] '
29
30 FREE = 0
31 SINGLE = 1
32 DOUBLE = 2
33 singleq = "'"
34 doubleq = '"'
35 sharp = '#'
36 space = ' '
37 ## Following string will be replacing the eol in a semicolon block where needed
38 ## It may not contain any funny characters and shouldn't have underscores
39 ## because it will make parsing slower. Parentheses, if used, should be of the
40 ## square type.
41 eol_string = '<eol-string>'
42 eol_string_length = len(eol_string)
43 # Redefined below curiously found this bug with code analysis from pydev extensions
44 # changing the wild import to specific import; that sounds like bad python if it matters.
45 #pattern_tagtable_loop = re.compile(r"""
46 #^\s* loop_ \s* # Begin of loop
47 #( ^\s* (?P<tagname>_\S+) \s*\n )+ # Tag names with some spaces
48 # (?P<rawtext>.+?) # Tag table raw text
49 #^\s* stop_ \s*\n # End of loop
50 # """, re.DOTALL | re.MULTILINE | re.VERBOSE )
51
52 pattern_semicolon_block = re.compile(r"""
53 ^; # semicolon at begin, any text and then eol
54 .+? # Raw text for match object but not greedy
55 ^; # semicolon at begin, that's it
56 """, re.DOTALL | re.MULTILINE | re.VERBOSE )
57
58 pattern_eol_string = re.compile( eol_string, re.MULTILINE )
59
60 ## Next pattern tells when search for on ONE tagvalue if it needs quotes
61 pattern_quotes_needed = re.compile( r'[\s\'\"]|^_|^\#' )
62
63 ## Next pattern tells when search for on MANY tagvalues if it needs quotes
64 ## The values should be joined by a comma. A value: 'bla,_bla' will be
65 ## mentioned as needing quotes unnecessarily but that's dealt with in the code by further checking
66 pattern_quotes_needed_2= re.compile( r'[\s\'\"]|^_|,_|,\#' )
67
68 pattern_eoline_etcet = re.compile( r'[\n\r\v\f]' )
69 # If the quote character is at the end of the word then it is falsely considered to need a
70 # different quote style; this happens frequently for e.g. H1' and all nucleic acid sugar atoms.
71 pattern_single_qoute = re.compile( r"'" )
72 pattern_double_qoute = re.compile( r'"' )
73
# Patterns locating the STAR keywords. All of them are raw strings so that
# the regex escapes (\s, \S) reach the re module untouched instead of being
# treated as (invalid) string escape sequences by the Python parser.
pattern_save_begin = re.compile(r'save_(\S+)\s+')
pattern_save_end = re.compile(r'save_\s*')
pattern_tagtable_loop = re.compile(r"loop_\s*")
pattern_tagtable_stop = re.compile(r"stop_\s*")
# Same thing but not eating all white space chars, just a minimal match
pattern_save_begin_nws = re.compile(r'save_\S')
# Pattern extended to include matches to "save_" as the last characters in a file.
# in other words; without a end of line.
pattern_save_end_nws = re.compile(r'(?:save_\s)|(?:save_$)')
#pattern_save_end_nws = re.compile(r'save_\s')
pattern_tagtable_loop_nws = re.compile(r'loop_\s')
pattern_tag_name_nws = re.compile(r'_\S')
# Same thing but requiring a prefixed white space char:
##pattern_sf_begin_or_end = re.compile(r'\ssave_')
pattern_tagtable_loop_2 = re.compile(r'\sloop_\s+')
pattern_tagtable_stop_2 = re.compile(r'\sstop_\s+')
pattern_tagname_2 = re.compile(r'\s_\S+\s+')
91
92 pattern_tag_name = re.compile(r"""(_\S+) \s+
93 """, re.DOTALL | re.MULTILINE | re.VERBOSE )
94 pattern_tags_loop = re.compile(r"""(?: (_\S+) \s* )+
95 """, re.MULTILINE | re.VERBOSE )
96 pattern_tags_loop_2 = re.compile(r""" (_\S+) \s*
97 """, re.MULTILINE | re.VERBOSE )
98
99 ## Get any number of non-white space characters followed by any white space
100 pattern_word = re.compile(r"""(\S+)\s*""", re.MULTILINE )
101
102 pattern_quoted = re.compile(r"""
103 ['"] | # single or double quote
104 (?: ^ ; ) # semicolon at the beginning of a line
105 """, re.MULTILINE | re.VERBOSE )
106
107 pattern_quoted_2 = re.compile(r"""(?: \b [\'\"] ) | (?: ^ \; )""", re.MULTILINE | re.VERBOSE )
108
109 pattern_s_quote = re.compile(r"""\'\s+""", re.MULTILINE )
110 pattern_d_quote = re.compile(r"""\"\s+""", re.MULTILINE )
111 pattern_e_semicolon = re.compile( eol_string + r"""\;\s*""", re.MULTILINE ) # Added \n for better parsing Wim 01/11/05
112
113 # Set beginning of line BEFORE whitespace - Wim 06/03/2003
114 #pattern_comment_begin = re.compile (r"""^\s*\#.*\n # A string starting a line with a sharp
115 # """, re.MULTILINE | re.VERBOSE)
116
117 pattern_nmrView_compress_empty = re.compile(r""" \{(\s+)\}
118 """, re.MULTILINE | re.VERBOSE)
119 pattern_nmrView_compress_questionmark = re.compile(r""" \{(\s+\?)\}
120 """, re.MULTILINE | re.VERBOSE)
121 # JFD old's
122 #pattern_comment_middle = re.compile (r"""(^[^;^\n] .*? ) # Any string beginning a line other than with a semicolon
123 # (\s \# .* $ ) # Any string ending a line and starting with a sharp
124 # """, re.MULTILINE | re.VERBOSE)
125
126 # Wim's:
127 #pattern_comment_middle = re.compile (
128 # r""" ( # start group 1 that will be captured for replay.
129 # ^[^;^\n] # not a what?
130 # (?: # start a non-capturing group
131 # ( # start group 2 (capturing?)
132 # [\'][^\']*\#[^\']*[\'] | # get '<text>#<text>'
133 # [\"][^\"]*\#[^\"]*[\"] # get "<text>#<text>"
134 # ) |
135 # [^\#.]
136 # )*?
137 # )
138 # # Any string beginning a line other than with a semicolon and with no quotes in it
139 # (\s+\#.*)? $ # the comment to be deleted.
140 # # Any string ending a line and starting with a sharp
141 # """, re.MULTILINE | re.VERBOSE)
142 # # Hashes in quotes don't count!
143 # # (?:[\'\"][^\'^\".]*\#[^\'^\".]*[\'\"]|[^\#.])*? ) expression gets '<text>#<text>' blocks,
144 # # is now built into multiline search, seems to be working... (Wim 11/02)
145 # # Changed \s* to \s+ - comments can only start with a ' ' before the '#' (Wim 05/03)
146 # # Removed . from [^\'^\".] in regular expression described above: more generic (Wim 05/03)
147 # doesn't catch"""H# # comment""" see testcomments_strip3a
148 # doesn't catch"""
149 #;
150 #foo # comment
151 #;"""
152
153
154 """
155 Searches for a regular expression in text.
156 The text may not be STAR quoted and must have semicolon blocks collapsed
157 such that the semicolon starts at the beginning of the line.
158 Returns the start position of the match or -1 if it was not found or
159 None if there was an error.
160
161 The function will search the text from given position onwards
162 and checks the chars preceding (up to the line it's in) for quote style.
163
164 WARNINGS:
165 - Don't call it for a text that has no \n and at least 1 other
166 character in it before pos (not fully tested; perhaps possible).
167 - I have not put in extra checks because of needed speed.
168 - No requirements set on what follows the pattern.
169 """
170
172 while True:
173 match = pattern.search( text, pos)
174 if not match:
175 ## No match at all
176 return -1
177
178 pos = match.start()
179
180 ## Is it the beginning of the string
181 if pos == 0:
182 return 0
183
184 ## Is the first character matched an eol it self
185 if text[pos]=='\n':
186 if verbosity >= 9:
187 print('Found pattern: [%s] at the beginning of a line' % pattern.pattern)
188 return pos
189
190 ## I hope the rfind is optimized to stroll backwards from pos
191 pos_end_of_previous_line = text.rfind('\n', 0, pos)
192 if pos_end_of_previous_line == -1:
193 pos_end_of_previous_line = -1 ## Dangerous rewind?
194
195 line = text[pos_end_of_previous_line+1:pos]
196 # Some dummy value but continue with the test below.
197 if line == '':
198 line = ' '
199
200 # Not the one
201 if line[0] == ';':
202 if verbosity > 9:
203 print('WARNING: (1) found pattern: [%s] preceded by: [%s]' % (
204 pattern.pattern, line ))
205 pos = pos + 1
206 continue
207
208 squoted = None
209 dquoted = None
210 for i in line:
211 if i == "'":
212 if not dquoted:
213 squoted = not squoted
214 elif i == '"':
215 if not squoted:
216 dquoted = not dquoted
217 if squoted or dquoted:
218 ## if squoted and dquoted:
219 ## ## Should not be possible to occur, delete when confident
220 ## print "ERROR: code error, mixing of quote styles in line:"
221 ## print "ERROR: [%s]" % line
222 ## return None
223 if verbosity > 1:
224 print('WARNING: (2) found pattern: [%s] preceded by: [%s]' % (
225 pattern.pattern, line ))
226
227 # Not the one
228 pos = pos + 1
229 continue
230
231 return pos
232
233
def tag_value_quoted_parse(text, pos):
    """
    Parse one quoted tag value beginning from position: pos

    text[pos] has to be the opening delimiter: a double quote, a single
    quote or the semicolon opening a semicolon block (a block whose end of
    lines were already replaced by eol_string).

    Return the value and the position of the 'cursor' behind the value,
    i.e. behind the closing delimiter and the white space matched after it.
    In case of error a message is printed and (None, None) is returned; the
    position value of None signals the failure.
    """
    # A position at or past the end can not hold a delimiter; report it the
    # same way as any other parse failure instead of raising an IndexError.
    if pos >= len(text):
        print("ERROR: Position in text:", pos)
        print("ERROR: lies beyond the end of the text with length:", len(text))
        return None, None

    first_char = text[pos]

    if first_char == '"':
        match_d_quote = pattern_d_quote.search(text, pos + 1)
        if not match_d_quote:
            # Report the real offset of the unmatched quote.
            print("ERROR: No matching double quote char found for double quote char at offset:", pos)
            print("ERROR: Next 70 chars are: [%s]" % text[pos:pos + 70])
            return None, None
        return text[pos + 1:match_d_quote.start()], match_d_quote.end()

    if first_char == "'":
        match_s_quote = pattern_s_quote.search(text, pos + 1)
        if not match_s_quote:
            print("ERROR: No matching single quote char found for single quote char at offset:", pos)
            print("ERROR: Next 70 chars are: [%s]" % text[pos:pos + 70])
            return None, None
        return text[pos + 1:match_s_quote.start()], match_s_quote.end()

    if first_char == ";":
        match_e_semicolon = pattern_e_semicolon.search(text, pos + 1)
        if not match_e_semicolon:
            print("ERROR: No matching semicolon found for semicolon char at offset:", pos)
            print("ERROR: Next 70 chars are: [%s]" % text[pos:pos + 70])
            return None, None
        ## Include the first eol and the eol before the semicolon; the match
        ## starts at that last (collapsed) eol so its length is added back.
        value = text[pos + 1:match_e_semicolon.start() + eol_string_length]
        ## Expansion relatively cheap here and harmless if unique string as
        ## defined in eol_string is indeed unique
        value = semicolon_block_expand(value)
        return value, match_e_semicolon.end()

    print("ERROR: Position in text:", pos)
    print("""ERROR: should contain a ', ", or a ; but was not found:""")
    print("ERROR: Next 70 chars are: [%s]" % text[pos:pos + 70])
    return None, None
292
293
294 """
295 From text on position pos, read a tag value and return the value and
296 position of the next non-space char. This is the slow parsing method
297 that should only be used for free tags.
298 """
300
301 match_quoted = pattern_quoted.search( text, pos )
302 if match_quoted:
303 if match_quoted.start() == pos:
304
305 return tag_value_quoted_parse( text, pos ) # Better speed with this code
306
307 match_word = pattern_word.search( text, pos )
308 if not match_word:
309 print("ERROR: No match for a 'word' at offset:", pos)
310 print("ERROR: Next 70 chars are:", text[ pos:pos+70 ])
311 return None, None
312 if match_word.start() != pos:
313 print("ERROR: Match for a 'word' at wrong offset:", match_word.start() - pos)
314 print("ERROR: Next 70 chars are:", text[ pos:pos+70 ])
315 return None, None
316
317 ## Return the matched word (without trailing white space) and the position just past that white space
318 return match_word.group(1), match_word.end()
319
320
321
322 """
323 See function semicolon_block_collapse that calls this one
324 """
328
329
def semicolon_block_collapse(text):
    """
    This function should be called (not semicolon_block_replace)
    Putting all semicolon separated values on one line
    by replacing the eol within with a unique key value (eol_string)
    that is to be removed later on by its sibling method:
    semicolon_block_expand.

    A block runs from a semicolon that is the first char of a line up to
    and including the next semicolon that is the first char of a line.
    Returns the text with the eols inside every block replaced.
    Raises AttributeError (after printing the start of the offending block)
    when an opening semicolon has no closing semicolon.
    """
    # Searching the full text with a start position (instead of a slice of
    # it) keeps '^' tied to real line starts: a ';' directly behind another
    # delimiter is then no longer mistaken for a delimiter itself.
    pattern_semicolon_only = re.compile(r"^;", re.MULTILINE)

    pieces = []  # Unchanged and collapsed stretches, joined once at the end
    count = 0
    cursor = 0   # Everything before this position was copied to pieces

    semicolon_start = pattern_semicolon_only.search(text, cursor)
    while semicolon_start:
        count += 1
        startpos = semicolon_start.start()
        semicolon_end = pattern_semicolon_only.search(text, startpos + 1)
        try:
            # Position just behind the closing semicolon.
            endpos = semicolon_end.start() + 1
        except AttributeError:
            # No closing semicolon found; semicolon_end is None.
            print("ERROR in semicolon_block_collapse for text starting at: [" + text[startpos:startpos + 100] + "]")
            raise

        pieces.append(text[cursor:startpos])
        pieces.append(text[startpos:endpos].replace("\n", eol_string))
        cursor = endpos

        semicolon_start = pattern_semicolon_only.search(text, cursor)

    pieces.append(text[cursor:])
    text = "".join(pieces)

    if verbosity >= 9:
        print('Done [%s] subs with semicolon blocks' % count)
    return text
376
def semicolon_block_expand(text):
    """
    Put the end of lines back into a collapsed semicolon block.

    Every occurrence matched by pattern_eol_string in text is replaced by
    a real eol; the resulting text is returned.
    """
    expanded = pattern_eol_string.sub('\n', text)
    return expanded
379
380 """
381 Adds semicolons, single quotes or double quotes depending on
382 need according to star syntax.
383 Does not assume that no quotes exist initially and will strip them if
384 present in pairs only.
385
386 If the possible_bad_char parameter is set (to 1 or higher) then
387 strings that would normally end up in a semicolon delimited blob will
388 have a string inserted at the beginning to it. The string can be the 'p'
389 argument to this function. [TODO]
390 """
392
393 preferred_quote='"' # This info should be in a more central spot
394
395 if pattern_eoline_etcet.search( text ):
396 return semicolons_add( text )
397
398 if pattern_single_qoute.search( text ):
399 single_qoute_match = 1
400 else:
401 single_qoute_match = 0
402
403 if pattern_double_qoute.search( text ):
404 double_qoute_match = 1
405 else:
406 double_qoute_match = 0
407
408 if single_qoute_match and double_qoute_match:
409 return semicolons_add( text )
410
411 if single_qoute_match:
412 return '"' + text + '"'
413 # NOTE: the single-quote branch above gives the same result as the default below while preferred_quote is '"'.
414 if double_qoute_match:
415 return "'" + text + "'"
416
417 ## Space other than end of line, or # sign etc.
418 return preferred_quote + text + preferred_quote
419
420
421 "Strips quotes in pairs and returns new/old string"
423
424 ## Can it be containing quotes?
425 if len(text) <= 1:
426 return text
427 for quote_symbol in [ "\'", '\"' ]:
428 if ( text[0] == quote_symbol and
429 text[-1] == quote_symbol ):
430 return text[1:-1]
431 return text
432
433
def semicolons_add(text, possible_bad_char=None):
    """
    Returns the input with ; delimited, possibly with a string inserted at
    the beginning of each line.

    text              -- the value to wrap.
    possible_bad_char -- when true, prepending_string is put in front of
                         every line (the pieces of text split on eol) and
                         every line is ended by an eol.

    The string value should always be ended by a eol, otherwise
    the second semicolon can not be the first char on a line; a missing
    final eol is therefore added.
    """
    if possible_bad_char:
        # One join instead of growing the string line by line.
        text = ''.join(prepending_string + line + '\n'
                       for line in text.split('\n'))
    #JFD updates 5/23/2006; apparently the text does not always end with an eol.
    if not text.endswith('\n'):
        text = text + '\n'
    return "\n;\n" + text + ";\n"
453
def comments_strip(text):
    """
    Strip the STAR comments new style

    The text is handled line by line. Lines inside a semicolon block (from
    a line starting with ';' up to and including the next line starting
    with ';') are left untouched; every other non-empty line is passed to
    _comments_strip_line. Returns the text with the comments removed.

    A semicolon block that is never closed extends to the end of the text;
    it is left untouched (a warning is printed when verbosity > 1).
    """
    lines = text.split("\n")
    count = 0
    total = len(lines)
    i = 0
    while i < total:
        line = lines[i]
        if not line:
            i += 1
            continue
        if line[0] == ';':  # start a semicolon block
            # Scan past the block; stop on the line with the closing
            # semicolon or, for an unterminated block, at the end of text.
            i += 1
            while i < total and not lines[i].startswith(';'):
                i += 1
            if i >= total and verbosity > 1:
                print('WARNING: semicolon block without closing semicolon in comments_strip')
        else:
            stripped = _comments_strip_line(line)
            # Stripping can only shorten a line so a changed length tells
            # that a comment was removed.
            if len(stripped) != len(line):
                lines[i] = stripped
                count += 1
        i += 1

    if verbosity >= 9:
        print('Done [%s] comment subs' % count)
    return "\n".join(lines)
494
def _comments_strip_line(line):
    """
    Strip the STAR comments for a single line.

    A sharp starts a comment only outside quotes and only at the beginning
    of the line or behind a white space char. A quote opens a quoted value
    only at the beginning of the line or behind a white space char and
    closes it only when a white space char follows.
    Returns the line up to the comment or the unchanged line if there is
    no comment.
    """
    state = FREE  # like to start out free which is possible after donning semicolon blocks.
    last = len(line) - 1
    for idx, char in enumerate(line):
        at_token_start = idx == 0 or line[idx - 1].isspace()

        if char == sharp and state == FREE and at_token_start:
            # Everything before the sharp; empty when the sharp is first.
            return line[:idx]

        if idx == last:
            # The last character is not a comment start; leave line alone.
            return line

        if char == doubleq:
            if state == FREE and at_token_start:
                state = DOUBLE
            elif state == DOUBLE and line[idx + 1].isspace():  # idx+1 exists here
                state = FREE
        elif char == singleq:
            if state == FREE and at_token_start:
                state = SINGLE
            elif state == SINGLE and line[idx + 1].isspace():
                state = FREE
    return line
530
531 #def comments_stripOld( text ):
532 # # split for profiling
533 # text = _comments_strip1(text)
534 # text = _comments_strip2(text)
535 # return text
536 #
537 #def _comments_strip1( text ):
538 # text, count = pattern_comment_begin.subn( '', text )
539 # if verbosity >= 9:
540 # print 'Done [%s] subs with comment at beginning of line' % count
541 # return text
542 #
543 #def _comments_strip2( text ):
544 # text, count = pattern_comment_middle.subn( '\g<1>', text )
545 # if verbosity >= 9:
546 # print 'Done [%s] subs with comment not at beginning of line' % count
547 # return text
548
550
551 text, count = pattern_nmrView_compress_empty.subn( '{}', text )
552 print('Compressed [%s] nmrView empty { } tags' % count)
553
554 text, count = pattern_nmrView_compress_questionmark.subn( '{?}', text )
555 print('Compressed [%s] nmrView question mark { ?} tags' % count)
556
557 return text
558
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Tue Nov 26 19:05:27 2013 | http://epydoc.sourceforge.net |