# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license

# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy

from ply import lex

from soc.decoder.selectable_int import SelectableInt
## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line. It flags the check to
# see if the new line has changed indentation level.
# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent

NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
# convert pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
def python_colonify(lexer, tokens):

    fake_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            # emit ELSE followed by a synthesised COLON
            yield token
            ctok = copy(token)
            ctok.type = "COLON"
            yield ctok
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH', 'CASE', 'DEFAULT']:
            # these statements need a colon at the end of their line
            fake_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if fake_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                fake_colon_needed = False
            yield token
        else:
            yield token
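
# A minimal sketch (not part of the original file) of what python_colonify
# does: THEN becomes COLON, so "if x then" lexes like python's "if x:".
# The _demo_colonify name and the SimpleNamespace stand-in tokens are
# assumptions made purely for illustration.
def _demo_colonify():
    from types import SimpleNamespace
    toks = [SimpleNamespace(type='IF', value='if'),
            SimpleNamespace(type='NAME', value='x'),
            SimpleNamespace(type='THEN', value='then'),
            SimpleNamespace(type='NEWLINE', value='\n')]
    print([t.type for t in python_colonify(None, iter(toks))])
    # prints: ['IF', 'NAME', 'COLON', 'NEWLINE']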
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start
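
# Sketch (illustration only) of how track_tokens_filter tags a stream: after
# COLON NEWLINE the next real token comes out with must_indent=True.  The
# _demo_track name and the stub objects (a fake lexer needs only a lexignore
# attribute) are assumptions for this demo, not part of the original file.
def _demo_track():
    from types import SimpleNamespace
    lx = SimpleNamespace(lexignore='')
    toks = [SimpleNamespace(type=ty) for ty in
            ('IF', 'COLON', 'NEWLINE', 'NAME')]
    for t in track_tokens_filter(lx, iter(toks)):
        print(t.type, t.at_line_start, t.must_indent)
    # NAME is reported with must_indent=True; everything else False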
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)
            print ("")

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level
        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")
            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                pass  # at the same level
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
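
# End-to-end sketch (illustration only): chaining track_tokens_filter into
# indentation_filter over hand-built tokens for "if x :\n    y\n" emits a
# matching INDENT / DEDENT pair.  The _demo_indent name and the stub tokens
# are assumptions for this demo, not part of the original file.
def _demo_indent():
    from types import SimpleNamespace
    def tok(ty, value='', lineno=1):
        return SimpleNamespace(type=ty, value=value, lineno=lineno)
    lx = SimpleNamespace(lexignore='')
    toks = [tok('IF'), tok('NAME'), tok('COLON'), tok('NEWLINE', '\n'),
            tok('WS', '    '), tok('NAME'), tok('NEWLINE', '\n')]
    stream = indentation_filter(track_tokens_filter(lx, iter(toks)))
    print([t.type for t in stream])
    # prints: ['IF', 'NAME', 'COLON', 'NEWLINE', 'INDENT', 'NAME',
    #          'NEWLINE', 'DEDENT']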
# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)
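
# Usage sketch (illustration only): filter() chains the generators in order
#   raw ply tokens -> python_colonify -> track_tokens_filter
#                  -> indentation_filter -> optional ENDMARKER
# and is normally driven through IndentLexer (defined below), e.g.:
#   lexer = IndentLexer()
#   lexer.input("x <- 1\n")     # installs filter() as the token stream
#   while lexer.token():        # drain tokens until None
#       pass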
        'NUMBER',  # Python decimals
        'BINARY',  # Python binary
        'STRING',  # single quoted strings only; syntax of raw strings
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t
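
    # Worked example (illustrative, not in the original): "0b1010" gives
    # int("0b1010", 2) == 10 with len("0b1010")-2 == 4 bits, i.e. the
    # lexer produces SelectableInt(10, 4).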
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t
    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print (repr(t.value))
        t.value = t.value[1:-1]
        return t
    t_ASSIGNEA = r'<-iea'
    # Ply nicely documented how to do this.
        "default": "DEFAULT",
    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t
    # Putting this before t_WS lets it consume lines with only comments in
    # them so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t
    # Don't generate newline tokens when inside of parenthesis, eg
    #   a = (1,
    #        2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t
    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t
    def t_RBRACK(self, t):
        r'\]'
        # check for underflow? should be the job of the parser
        t.lexer.brack_count -= 1
        return t
    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t
    def t_RPAR(self, t):
        r'\)'
        # check for underflow? should be the job of the parser
        t.lexer.paren_count -= 1
        return t
    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        # unreachable while the raise above is in place:
        # skip-and-continue alternative, kept for reference
        print ("Skipping", repr(t.value[0]))
        t.lexer.skip(1)
# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None
    def input(self, s, add_endmarker=True):
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None
if __name__ == '__main__':

    # quick demo: lex a fragment of POWER pseudo-code
    code = """\
if (RS)[63-n] = 0b1 then
"""

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    lexer.input(code)