# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start", which is True for WS
# and for the first non-WS/non-NEWLINE token on a line.  It is used to
# check whether the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
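
# These three filters are chained together in filter() near the bottom of
# this file, in this order:
#     raw ply tokens -> python_colonify -> track_tokens_filter
#                    -> indentation_filter (-> ENDMARKER appended by filter)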

# turn pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
def python_colonify(lexer, tokens):

    fake_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH', 'CASE', 'DEFAULT']:
            fake_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if fake_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                fake_colon_needed = False
            yield token
        else:
            yield token


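# For example (a sketch of the intended behaviour, token types only):
#     "if a then b"    becomes  IF NAME COLON NAME
#                               (THEN is rewritten in place as COLON)
#     "do while a"     becomes  DO WHILE NAME COLON NEWLINE
#                               (a COLON is synthesised before the NEWLINE)

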
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
    lexer.at_line_start = at_line_start

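# For example (a sketch of intended behaviour, token types only): for the
# stream COLON NEWLINE NAME, the NAME token leaves track_tokens_filter with
# must_indent=True: COLON moves the state to MAY_INDENT and the following
# NEWLINE promotes it to MUST_INDENT.
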
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)
            print ()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

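# For example (a sketch): if the indent stack "levels" is [0, 4, 8] and a
# token at column 0 starts a new line, levels.index(0) finds entry 0 and the
# loop above emits two DEDENTs, popping the 8 and 4 levels off the stack.
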

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker = True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

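# Rough sketch of the whole pipeline on one pseudo-code fragment
# (token types only):
#
#     if a then
#         b <- 1
#
# comes out of filter() roughly as:
#
#     IF NAME COLON NEWLINE INDENT NAME ASSIGN NUMBER NEWLINE DEDENT ENDMARKER
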
##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'NUMBER',  # Python decimals
        'BINARY',  # Python binary
        'STRING',  # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

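    # e.g. the "0b0101" literal above becomes SelectableInt(value=5, bits=4):
    # the value is parsed in base 2 and the bit width is the literal's length
    # minus the two characters of the "0b" prefix.
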
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print (repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments in
    # them, so the later code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parentheses, eg
    #     a = (1,
    #          2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        # unreachable alternative policy: skip the bad character and continue
        #print ("Skipping", repr(t.value[0]))
        #t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None

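# Typical usage (a sketch): build an IndentLexer, feed it pseudo-code via
# input(), then call token() repeatedly until it returns None - or hand the
# IndentLexer instance directly to a ply.yacc parser, which will call
# token() itself.
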
if __name__ == '__main__':

    # quick test/demo
    cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

    code = cnttzd

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print ("code")
    print (code)
    lexer.input(code)
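
    # demo sketch: drain the filtered token stream and print each token, so
    # running this module directly shows the COLON / INDENT / DEDENT /
    # ENDMARKER handling in action
    while True:
        tok = lexer.token()
        if tok is None:
            break
        print (tok)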