# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start", which is True for WS
# and for the first non-WS/non-NEWLINE token on a line.  It is used to
# check whether the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
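
# These three filters are chained together in filter() near the bottom of
# this file, in this order:
#     raw ply tokens -> python_colonify -> track_tokens_filter
#                    -> indentation_filter (-> ENDMARKER appended by filter)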

# turn pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
def python_colonify(lexer, tokens):

    fake_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH', 'CASE', 'DEFAULT']:
            fake_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if fake_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                fake_colon_needed = False
            yield token
        else:
            yield token


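# For example (a sketch of the intended behaviour, token types only):
#     "if a then b"    becomes  IF NAME COLON NAME
#                               (THEN is rewritten in place as COLON)
#     "do while a"     becomes  DO WHILE NAME COLON NEWLINE
#                               (a COLON is synthesised before the NEWLINE)

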
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
    lexer.at_line_start = at_line_start

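# For example (a sketch of intended behaviour, token types only): for the
# stream COLON NEWLINE NAME, the NAME token leaves track_tokens_filter with
# must_indent=True: COLON moves the state to MAY_INDENT and the following
# NEWLINE promotes it to MUST_INDENT.
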
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)
            print ()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

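# For example (a sketch): if the indent stack "levels" is [0, 4, 8] and a
# token at column 0 starts a new line, levels.index(0) finds entry 0 and the
# loop above emits two DEDENTs, popping the 8 and 4 levels off the stack.
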

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker = True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

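# Rough sketch of the whole pipeline on one pseudo-code fragment
# (token types only):
#
#     if a then
#         b <- 1
#
# comes out of filter() roughly as:
#
#     IF NAME COLON NEWLINE INDENT NAME ASSIGN NUMBER NEWLINE DEDENT ENDMARKER
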
##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'NUMBER',  # Python decimals
        'BINARY',  # Python binary
        'STRING',  # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

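    # e.g. the "0b0101" literal above becomes SelectableInt(value=5, bits=4):
    # the value is parsed in base 2 and the bit width is the literal's length
    # minus the two characters of the "0b" prefix.
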
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print (repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments in
    # them, so the later code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parentheses, eg
    #     a = (1,
    #          2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        # unreachable alternative policy: skip the bad character and continue
        #print ("Skipping", repr(t.value[0]))
        #t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None

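# Typical usage (a sketch): build an IndentLexer, feed it pseudo-code via
# input(), then call token() repeatedly until it returns None - or hand the
# IndentLexer instance directly to a ply.yacc parser, which will call
# token() itself.
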
if __name__ == '__main__':

    # quick test/demo
    cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

    code = cnttzd

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print ("code")
    print (code)
    lexer.input(code)
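
    # demo sketch: drain the filtered token stream and print each token, so
    # running this module directly shows the COLON / INDENT / DEDENT /
    # ENDMARKER handling in action
    while True:
        tok = lexer.token()
        if tok is None:
            break
        print (tok)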