1 """
2 Classes for dealing with STAR syntax
3 """
4
5 __author__ = "$Author: jurgenfd $"
6 ___revision__ = "$Revision: 13 $"
7 ___date__ = "$Date: 2007-08-22 20:59:28 +0200 (Wed, 22 Aug 2007) $"
8
9
10 import re
11
12 """
13 Some handy patterns and functions for dealing with text in the STAR syntax.
14 Some are complicated because in Python the non-greedy pattern matching
15 gets too recursive and will actually fail on larger strings. For example,
16 the following code fails:
17 re.search( 'a.*?c', 'a' + 99999*'b' + 'c' )
18 It produces: 'RuntimeError: maximum recursion limit exceeded'
19 """
20
21
22
23
# Module-wide verbosity level. The diagnostic print statements in this module
# compare against it (e.g. ``verbosity >= 9`` / ``verbosity > 1``); a higher
# value lets more messages through.
verbosity = 2

# NOTE(review): presumably the marker that can be inserted in front of values
# written as semicolon blocks -- confirm against the quoting code.
prepending_string = '[raw] '

# NOTE(review): these look like codes for the quoting style of a value
# (unquoted, single quoted, double quoted) -- confirm against the callers.
FREE = 0
SINGLE = 1
DOUBLE = 2

# Single-character constants.
singleq = "'"
doubleq = '"'
sharp = '#'
space = ' '

# Placeholder substituted for the newlines inside a semicolon block while the
# block is collapsed onto one line, and turned back into '\n' on expansion.
eol_string = '<eol-string>'
eol_string_length = len(eol_string)

# All regular expressions below are written as raw strings so that regex
# escapes such as \s and \S are never interpreted (and warned about) as
# Python string escapes.

# A complete semicolon block: from a ';' at the start of a line up to and
# including the next ';' at the start of a line.
pattern_semicolon_block = re.compile(r"""
^; # semicolon at begin, any text and then eol
.+? # Raw text for match object but not greedy
^; # semicolon at begin, that's it
""", re.DOTALL | re.MULTILINE | re.VERBOSE)

# The literal newline placeholder.
pattern_eol_string = re.compile(eol_string, re.MULTILINE)

# Any whitespace or quote character anywhere, or a '_' or '#' as the very
# first character of the string.
pattern_quotes_needed = re.compile(r'[\s\'\"]|^_|^\#')

# Variant: any whitespace or quote character anywhere, a leading '_', or a
# ',_' / ',#' sequence anywhere.
pattern_quotes_needed_2 = re.compile(r'[\s\'\"]|^_|,_|,\#')

# Any line-breaking control character (LF, CR, VT, FF).
pattern_eoline_etcet = re.compile(r'[\n\r\v\f]')

# A single quote / double quote character. (The misspelt names are kept
# because other code refers to them.)
pattern_single_qoute = re.compile(r"'")
pattern_double_qoute = re.compile(r'"')

# 'save_<name>' followed by whitespace; group 1 is the name.
pattern_save_begin = re.compile(r'save_(\S+)\s+')
# A bare 'save_' plus any trailing whitespace.
pattern_save_end = re.compile(r'save_\s*')
# The 'loop_' / 'stop_' keywords plus any trailing whitespace.
pattern_tagtable_loop = re.compile(r"loop_\s*")
pattern_tagtable_stop = re.compile(r"stop_\s*")

# 'save_' immediately followed by a non-whitespace character.
pattern_save_begin_nws = re.compile(r'save_\S')

# 'save_' immediately followed by one whitespace character or by the end of
# the string.
pattern_save_end_nws = re.compile(r'(?:save_\s)|(?:save_$)')

# 'loop_' followed by one whitespace character, and '_' followed by one
# non-whitespace character.
pattern_tagtable_loop_nws = re.compile(r'loop_\s')
pattern_tag_name_nws = re.compile(r'_\S')

# Variants that require a whitespace character in front of the keyword or
# tag name and at least one whitespace character behind it.
pattern_tagtable_loop_2 = re.compile(r'\sloop_\s+')
pattern_tagtable_stop_2 = re.compile(r'\sstop_\s+')
pattern_tagname_2 = re.compile(r'\s_\S+\s+')

# One tag name (group 1) followed by at least one whitespace character.
pattern_tag_name = re.compile(r"""(_\S+) \s+
""", re.DOTALL | re.MULTILINE | re.VERBOSE)
# One or more tag names, each optionally followed by whitespace; group 1
# holds the last name matched.
pattern_tags_loop = re.compile(r"""(?: (_\S+) \s* )+
""", re.MULTILINE | re.VERBOSE)
# A single tag name (group 1) optionally followed by whitespace.
pattern_tags_loop_2 = re.compile(r""" (_\S+) \s*
""", re.MULTILINE | re.VERBOSE)

# A run of non-whitespace characters (group 1) plus any trailing whitespace.
pattern_word = re.compile(r"""(\S+)\s*""", re.MULTILINE)

# The start of a quoted value: a quote character, or a ';' at line start.
pattern_quoted = re.compile(r"""
['"] | # single or double quote
(?: ^ ; ) # semicolon at the beginning of a line
""", re.MULTILINE | re.VERBOSE)

# Variant: a quote character at a word boundary, or a ';' at line start.
pattern_quoted_2 = re.compile(r"""(?: \b [\'\"] ) | (?: ^ \; )""", re.MULTILINE | re.VERBOSE)

# Terminators of quoted values: a quote character followed by whitespace ...
pattern_s_quote = re.compile(r"""\'\s+""", re.MULTILINE)
pattern_d_quote = re.compile(r"""\"\s+""", re.MULTILINE)
# ... or the newline placeholder followed by ';' and optional whitespace,
# which closes a collapsed semicolon block.
pattern_e_semicolon = re.compile(eol_string + r"""\;\s*""", re.MULTILINE)

# Braces that enclose only whitespace (group 1), or whitespace followed by a
# question mark (group 1).
pattern_nmrView_compress_empty = re.compile(r""" \{(\s+)\}
""", re.MULTILINE | re.VERBOSE)
pattern_nmrView_compress_questionmark = re.compile(r""" \{(\s+\?)\}
""", re.MULTILINE | re.VERBOSE)
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154 """
155 Searches for a regular expression in text.
156 The text may not be STAR quoted and must have semicolon blocks collapsed
157 such that the semicolon starts at the beginning of the line.
158 Returns the start position of the match or -1 if it was not found or
159 None if there was an error.
160
161 The function will search the text from given position onwards
162 and checks the chars preceding (up to the line it's in) for quote style.
163
164 WARNINGS:
165 - Don't call it for a text that has no \n and at least 1 other
166 character in it before pos (not fully tested; perhaps possible).
167 - I have not put in extra checks because of needed speed.
168 - No requirements set on what follows the pattern.
169 """
170
172 while True:
173 match = pattern.search( text, pos)
174 if not match:
175
176 return -1
177
178 pos = match.start()
179
180
181 if pos == 0:
182 return 0
183
184
185 if text[pos]=='\n':
186 if verbosity >= 9:
187 print('Found pattern: [%s] at the beginning of a line' % pattern.pattern)
188 return pos
189
190
191 pos_end_of_previous_line = text.rfind('\n', 0, pos)
192 if pos_end_of_previous_line == -1:
193 pos_end_of_previous_line = -1
194
195 line = text[pos_end_of_previous_line+1:pos]
196
197 if line == '':
198 line = ' '
199
200
201 if line[0] == ';':
202 if verbosity > 9:
203 print('WARNING: (1) found pattern: [%s] preceded by: [%s]' % (
204 pattern.pattern, line ))
205 pos = pos + 1
206 continue
207
208 squoted = None
209 dquoted = None
210 for i in line:
211 if i == "'":
212 if not dquoted:
213 squoted = not squoted
214 elif i == '"':
215 if not squoted:
216 dquoted = not dquoted
217 if squoted or dquoted:
218
219
220
221
222
223 if verbosity > 1:
224 print('WARNING: (2) found pattern: [%s] preceded by: [%s]' % (
225 pattern.pattern, line ))
226
227
228 pos = pos + 1
229 continue
230
231 return pos
232
233
234 """
235 Parse one quoted tag value beginning at position: pos
236 Return the value and the position of the 'cursor' behind the
237 value, i.e. at the first non-whitespace char that follows it.
238 In case of error, None is returned for the position (and the value) to signal failure.
239 """
241
242
243 if text[ pos ] == '"':
244 match_d_quote = pattern_d_quote.search( text, pos+1)
245 if not match_d_quote:
246 print("ERROR: No matching double quote char found for double quote char at offset:", 0)
247 print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ])
248 return None, None
249
250
251
252 return text[ pos+1:match_d_quote.start() ], match_d_quote.end()
253
254 if text[ pos ] == "'":
255 match_s_quote = pattern_s_quote.search( text, pos+1)
256 if not match_s_quote:
257 print("ERROR: No matching single quote char found for single quote char at offset:", 0)
258 print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ])
259 return None, None
260 value = text[ pos+1:match_s_quote.start() ]
261
262
263
264 return value, match_s_quote.end()
265
266
267
268 if text[ pos ] == ";":
269 match_e_semicolon = pattern_e_semicolon.search( text, pos+1)
270 if not match_e_semicolon:
271 print("ERROR: No matching semicolon found for semicolon char at offset:", 0)
272 print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ])
273 return None, None
274
275
276 value = text[ pos+1:match_e_semicolon.start()+eol_string_length ]
277
278
279
280
281
282
283 value = semicolon_block_expand( value )
284
285
286 return value, match_e_semicolon.end()
287
288 print("ERROR: Position in text:", pos)
289 print("""ERROR: should contain a ', ", or a ; but was not found:""")
290 print("ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ])
291 return None, None
292
293
294 """
295 From text on position pos, read a tag value and return the value and
296 position of the next non-space char. This is the slow parsing method
297 that should only be used for free tags.
298 """
300
301 match_quoted = pattern_quoted.search( text, pos )
302 if match_quoted:
303 if match_quoted.start() == pos:
304
305 return tag_value_quoted_parse( text, pos )
306
307 match_word = pattern_word.search( text, pos )
308 if not match_word:
309 print("ERROR: No match for a 'word' at offset:", pos)
310 print("ERROR: Next 70 chars are:", text[ pos:pos+70 ])
311 return None, None
312 if match_word.start() != pos:
313 print("ERROR: Match for a 'word' at wrong offset:", match_word.start() - pos)
314 print("ERROR: Next 70 chars are:", text[ pos:pos+70 ])
315 return None, None
316
317
318 return match_word.group(1), match_word.end()
319
320
321
322 """
323 See function semicolon_block_collapse that calls this one
324 """
326
327 return re.sub( '\n', eol_string, matchobj.group() )
328
329
330 """
331 This function should be called (not semicolon_block_replace)
332 Puts all semicolon-delimited values on one line
333 by replacing each eol within them with a unique key value
334 that is to be removed later on by its sibling function:
335 semicolon_block_expand.
336 SPEED: 0.6 cpu seconds for a 5 Mb file with 31 blocks and
337 1.3 cpu seconds for a 10 Mb file with 64 blocks.
338 """
340
341 count = 0
342 startpos = 0
343
344
345 pattern_semicolon_only = re.compile("^\;", re.MULTILINE)
346 pattern_semicolon_only_end = re.compile("(^\;\s*)", re.MULTILINE)
347
348 semicolon_start = pattern_semicolon_only.search(text[startpos:])
349
350 while(semicolon_start):
351
352 count += 1
353
354 startpos = startpos + semicolon_start.start()
355 semicolon_end = pattern_semicolon_only_end.search(text[startpos+1:])
356 try:
357 endpos = startpos + 1 + semicolon_end.end() - len(semicolon_end.group(1)) + 1
358 except:
359 print("ERROR in semicolon_block_collapse for text starting at: ["+ text[startpos:startpos+100]+ "]")
360 raise
361
362 text_replace = re.sub("\n", eol_string, text[startpos:endpos])
363
364
365 text= text[0:startpos] + text_replace + text[endpos:]
366
367 startpos = startpos + len(text_replace)
368
369 semicolon_start = pattern_semicolon_only.search(text[startpos:])
370
371
372
373 if verbosity >= 9:
374 print('Done [%s] subs with semicolon blocks' % count)
375 return text
376
378 return pattern_eol_string.sub('\n', text )
379
380 """
381 Adds semicolons, single quotes or double quotes depending on
382 need according to star syntax.
383 Does not assume that no quotes exist initially and will strip them if
384 present in pairs only.
385
386 If the possible_bad_char parameter is set (to 1 or higher) then
387 strings that would normally end up in a semicolon-delimited blob will
388 have a string inserted at their beginning. The string can be the 'p'
389 argument to this function. [TODO]
390 """
419
420
421 "Strips quotes in pairs and returns new/old string"
423
424
425 if len(text) <= 1:
426 return text
427 for quote_symbol in [ "\'", '\"' ]:
428 if ( text[0] == quote_symbol and
429 text[-1] == quote_symbol ):
430 return text[1:-1]
431 return text
432
433
434 """
435 Returns the input delimited by semicolons, possibly with a string inserted at the beginning.
436 The string value should always end with an eol, otherwise
437 the second semicolon cannot be the first char on a line.
438 """
453
454 """
455 Strip the STAR comments new style
456 """
494
495 """
496 Strip the STAR comments for a single line.
497 """
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
558