Source code for uppaal2jetracer.parser.declarationparser.declaration_lexer

"""
This module provides a lexer for the UPPAAL declaration language. It leverages
the PLY (Python Lex-Yacc) library to tokenize input strings according to the 
specific syntax rules of UPPAAL declarations. The lexer is implemented in the 
`DeclarationLexer` class, which includes support for handling custom errors, 
tracking file-related metadata, and identifying specific tokens such as 
keywords, identifiers, constants, operators, and string literals.

Based on the c_lexer.py from the pycparser project by Eli Bendersky (https://eli.thegreenplace.net)
under the BSD license.

Classes:
    * DeclarationLexer: Implements a custom lexer for UPPAAL declarations with methods for input handling, token generation, and error handling.

Key Features:
    * Recognizes reserved keywords, custom identifiers, constants, and various operators.
    * Provides hooks for handling specific events like opening and closing braces.
    * Offers a type lookup hook so identifiers previously defined with typedef can be recognized as type identifiers.

Usage:
    This lexer can be used to tokenize UPPAAL declaration files by creating an 
    instance of `DeclarationLexer`, providing necessary callback functions, and 
    calling its `build`, `input`, and `token` methods as needed.
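
    A minimal usage sketch (the callback functions below are placeholders;
    a real parser would supply callbacks tied to its own symbol table):

        lexer = DeclarationLexer(
            error_func = lambda msg, line, column: print(msg, line, column),
            on_lbrace_func = lambda: None,
            on_rbrace_func = lambda: None,
            type_lookup_func = lambda name: False)
        lexer.build()
        lexer.input("clock t; const int n = 3;")
        token = lexer.token()  # repeat until token() returns None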
"""

import re

from uppaal2jetracer.parser.declarationparser.ply import lex
from uppaal2jetracer.parser.declarationparser.ply.lex import TOKEN


class DeclarationLexer:
    """
    A lexer for the UPPAAL declaration language. After building it, set the
    input text with input(), and call token() to get new tokens.

    The public attribute filename can be set to the name of the file being
    lexed.

    :ivar error_func: An error function. Will be called with an error message,
        line, and column as arguments, in case of an error during lexing.
    :type error_func: Callable[[str, int, int], None]
    :ivar on_lbrace_func: Function triggered when an opening brace '{' is encountered.
    :type on_lbrace_func: Callable[[], None]
    :ivar on_rbrace_func: Function triggered when a closing brace '}' is encountered.
    :type on_rbrace_func: Callable[[], None]
    :ivar type_lookup_func: A type lookup function. Given a string, it must
        return True if this string is a name of a type that was defined with
        a typedef earlier.
    :type type_lookup_func: Callable[[str], bool]
    :ivar filename: Name of the file being lexed. May be set by the caller.
    :type filename: str
    :ivar last_token: Keeps track of the last token returned from self.token().
    :type last_token: Any
    :ivar lexer: The PLY lexer object created after calling the build() method.
    :type lexer: ply.lex.Lexer
    """

    def __init__(self, error_func, on_lbrace_func, on_rbrace_func, type_lookup_func):
        """
        Create a new Lexer.

        error_func:
            An error function. Will be called with an error message, line
            and column as arguments, in case of an error during lexing.

        on_lbrace_func, on_rbrace_func:
            Called when an LBRACE or RBRACE is encountered
            (likely to push/pop type_lookup_func's scope).

        type_lookup_func:
            A type lookup function. Given a string, it must return True IFF
            this string is a name of a type that was defined with a typedef
            earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ""

        # Keeps track of the last token returned from self.token()
        self.last_token = None
    def build(self, **kwargs):
        """
        Builds the lexer from the specification. Must be called after the
        lexer object is created.

        This method exists separately, because the PLY manual warns
        against calling lex.lex inside __init__.
        """
        self.lexer = lex.lex(object = self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer. """
        self.lexer.lineno = 1
    def input(self, text):
        """ Feeds the given text into the lexer. """
        self.lexer.input(text)

    def token(self):
        """ Returns the next token from the input, remembering it in last_token. """
        self.last_token = self.lexer.token()
        return self.last_token
    def find_tok_column(self, token):
        """ Find the column of the token in its line. """
        last_cr = self.lexer.lexdata.rfind("\n", 0, token.lexpos)
        return token.lexpos - last_cr
    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        return token.lineno, self.find_tok_column(token)

    ##
    ## Reserved keywords
    ##
    keywords = (
        "BOOL", "BROADCAST", "CLOCK", "CHAN", "CHAR", "STRING", "CONST",
        "DO", "DOUBLE", "ELSE", "FOR", "IF", "INT", "RETURN",
        "STRUCT", "TYPEDEF", "VOID", "WHILE",
    )

    keyword_map = {}
    for keyword in keywords:
        keyword_map[keyword.lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + (
        # Identifiers
        "ID",

        # Type identifiers (identifiers previously defined as types with typedef)
        "TYPEID",

        # constants
        "INT_CONST_DEC", "DOUBLE_CONST", "CHAR_CONST", "BOOL_CONST",

        # String literal
        "STRING_LITERAL",

        # Operators
        "PLUS", "MINUS", "TIMES", "DIVIDE", "MOD",
        "OR", "AND", "NOT", "XOR", "LSHIFT", "RSHIFT",
        "LOR", "LAND", "LNOT",
        "LT", "LE", "GT", "GE", "EQ", "NE",

        # Assignment
        "EQUALS", "TIMESEQUAL", "DIVEQUAL", "MODEQUAL",
        "PLUSEQUAL", "MINUSEQUAL",
        "LSHIFTEQUAL", "RSHIFTEQUAL", "ANDEQUAL", "XOREQUAL", "OREQUAL",

        # Structure dereference (->)
        "ARROW",

        # Delimiters
        "LPAREN", "RPAREN",        # ( )
        "LBRACKET", "RBRACKET",    # [ ]
        "LBRACE", "RBRACE",        # { }
        "COMMA", "PERIOD",         # . ,
        "SEMI", "COLON",           # ; :
    )

    ##
    ## Regexes for use in tokens
    ##

    # valid identifiers
    IDENTIFIER = r"[a-zA-Z_][0-9a-zA-Z_]*"

    # integer constants
    decimal_constant = "(0)|([1-9][0-9]*)"

    # character constants
    #
    # The following modifications were made to avoid the ambiguity that allowed backtracking:
    # (https://github.com/eliben/pycparser/issues/61)
    #
    # - decimal_escape allows one or more decimal characters,
    #   but requires that the next character (if any) is not a decimal
    # - bad_escape does not allow any decimals (8-9),
    #   to avoid conflicting with the permissive decimal_escape.
    #
    # Without this change, python's `re` module would recursively try parsing
    # each ambiguous escape sequence in multiple ways.
    # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.
    SIMPLE_ESCAPE = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
    DECIMAL_ESCAPE = r"""(\d+)(?!\d)"""
    BAD_ESCAPE = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = fr"""(\\({SIMPLE_ESCAPE}|{DECIMAL_ESCAPE}))"""

    # This complex regex with lookahead can be slow for strings.
    # To optimize, valid escapes (including \x) that allow zero or more non-escaped characters
    # after the first character were simplified into a combination of
    # simple_escape and decimal_escape.
    ESCAPE_SEQUENCE_START_IN_STRING = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

    CCONST_CHAR = fr"""([^'\\\n]|{escape_sequence})"""
    char_const = f"'{CCONST_CHAR}'"
    unmatched_quote = f"('{CCONST_CHAR}*\\n)|('{CCONST_CHAR}*$)"
    BAD_CHAR_CONST = fr"""('{CCONST_CHAR}[^'\n]+')|('')|('{BAD_ESCAPE}[^'\n]*')"""

    # bool constants
    BOOL_CONST = r"""(true|false)"""

    # string literals
    STRING_CHAR = fr"""([^"\\\n]|{ESCAPE_SEQUENCE_START_IN_STRING})"""
    string_literal = f'"{STRING_CHAR}*"'
    bad_string_literal = f'"{STRING_CHAR}*{BAD_ESCAPE}{STRING_CHAR}*"'

    # double constants
    EXPONENT_PART = r"""([eE][-+]?[0-9]+)"""
    FRACTIONAL_CONSTANT = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    double_constant = (f"(((( {FRACTIONAL_CONSTANT} ){EXPONENT_PART}?)"
                       f"|([0-9]+{EXPONENT_PART}))[FfLl]?)")

    t_ignore = " \t"

    # Newlines
    def t_NEWLINE(self, t):
        r"""\n+"""
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS = r"\+"
    t_MINUS = r"-"
    t_TIMES = r"\*"
    t_DIVIDE = r"/"
    t_MOD = r"%"
    t_OR = r"\|"
    t_AND = r"&"
    t_NOT = r"~"
    t_XOR = r"\^"
    t_LSHIFT = r"<<"
    t_RSHIFT = r">>"
    t_LOR = r"\|\|"
    t_LAND = r"&&"
    t_LNOT = r"!"
    t_LT = r"<"
    t_GT = r">"
    t_LE = r"<="
    t_GE = r">="
    t_EQ = r"=="
    t_NE = r"!="

    # Assignment operators
    t_EQUALS = r"="
    t_TIMESEQUAL = r"\*="
    t_DIVEQUAL = r"/="
    t_MODEQUAL = r"%="
    t_PLUSEQUAL = r"\+="
    t_MINUSEQUAL = r"-="
    t_LSHIFTEQUAL = r"<<="
    t_RSHIFTEQUAL = r">>="
    t_ANDEQUAL = r"&="
    t_OREQUAL = r"\|="
    t_XOREQUAL = r"\^="

    # ->
    t_ARROW = r"->"

    # Delimiters
    t_LPAREN = r"\("
    t_RPAREN = r"\)"
    t_LBRACKET = r"\["
    t_RBRACKET = r"\]"
    t_COMMA = r","
    t_PERIOD = r"\."
    t_SEMI = r";"
    t_COLON = r":"

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter. The trouble begins with yacc's
    # lookahead token. If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID. So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.

    @TOKEN(r"\{")
    def t_LBRACE(self, t):
        """
        Handles the opening brace "{" token.
        Triggers the on_lbrace_func to update the scope.
        """
        self.on_lbrace_func()
        return t

    @TOKEN(r"\}")
    def t_RBRACE(self, t):
        """
        Handles the closing brace "}" token.
        Triggers the on_rbrace_func to update the scope.
        """
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    @TOKEN(double_constant)
    def t_DOUBLE_CONST(self, t):
        """ Matches and returns a floating-point constant token. """
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        """ Matches and returns a decimal integer constant token. """
        return t

    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        """ Matches and returns a regular character constant token. """
        return t

    @TOKEN(BOOL_CONST)
    def t_BOOL_CONST(self, t):
        """ Matches and returns a boolean constant token. """
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        """ Handles unmatched single-quote characters by raising an error. """
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(BAD_CHAR_CONST)
    def t_BAD_CHAR_CONST(self, t):
        """ Detects invalid character constants and raises an error. """
        msg = f"Invalid char constant {t.value}"
        self._error(msg, t)

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        """ Matches invalid string literals that contain invalid escape codes and raises an error. """
        msg = "String contains invalid escape code"
        self._error(msg, t)

    @TOKEN(IDENTIFIER)
    def t_ID(self, t):
        """
        Matches identifiers and checks if they are keywords or type identifiers.
        Updates the token type accordingly.
        """
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == "ID" and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t
    def t_error(self, t):
        """ Handles illegal character errors encountered during lexical analysis. """
        msg = f"Illegal character {repr(t.value[0])}"
        self._error(msg, t)
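
if __name__ == "__main__":
    # Illustrative sketch only, not part of the lexer itself: it shows how the
    # callbacks expected by DeclarationLexer might cooperate. The simple scope
    # stack below is a hypothetical stand-in for the symbol table a real
    # parser would maintain via on_lbrace_func / on_rbrace_func /
    # type_lookup_func.
    scopes = [{"myType"}]  # pretend "myType" was declared with typedef earlier

    def _push_scope():
        scopes.append(set())

    def _pop_scope():
        scopes.pop()

    def _is_type(name):
        return any(name in scope for scope in scopes)

    def _report(msg, line, column):
        print(f"error at {line}:{column}: {msg}")

    demo_lexer = DeclarationLexer(
        error_func = _report,
        on_lbrace_func = _push_scope,
        on_rbrace_func = _pop_scope,
        type_lookup_func = _is_type)
    demo_lexer.build()
    demo_lexer.input("const myType x = 1;")
    while True:
        tok = demo_lexer.token()
        if tok is None:
            break
        print(tok)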