# Spaces (Running): text-generation-webui/installer_files/conda/lib/python3.10/site-packages/pycparser/c_lexer.py
#------------------------------------------------------------------------------
# pycparser: c_lexer.py
#
# CLexer class: lexer for the C language
#
# Eli Bendersky [https://eli.thegreenplace.net/]
# License: BSD
#------------------------------------------------------------------------------
import re

from .ply import lex
from .ply.lex import TOKEN
class CLexer(object):
    """ A lexer for the C language. After building it, set the
        input text with input(), and call token() to get new
        tokens.

        The public attribute filename can be set to an initial
        filename, but the lexer will update it upon #line
        directives.

        Note for maintainers: PLY uses the docstring of every
        ``t_*`` method as that rule's regular expression (unless
        the rule is decorated with @TOKEN). Never add a
        descriptive docstring to a ``t_*`` method; use ``#``
        comments instead.
    """
    def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
                 type_lookup_func):
        """ Create a new Lexer.

            error_func:
                An error function. Will be called with an error
                message, line and column as arguments, in case of
                an error during lexing.

            on_lbrace_func, on_rbrace_func:
                Called when an LBRACE or RBRACE is encountered
                (likely to push/pop type_lookup_func's scope)

            type_lookup_func:
                A type lookup function. Given a string, it must
                return True IFF this string is a name of a type
                that was defined with a typedef earlier.
        """
        self.error_func = error_func
        self.on_lbrace_func = on_lbrace_func
        self.on_rbrace_func = on_rbrace_func
        self.type_lookup_func = type_lookup_func
        self.filename = ''

        # Keeps track of the last token returned from self.token()
        self.last_token = None

        # Allow either "# line" or "# <num>" to support GCC's
        # cpp output
        #
        self.line_pattern = re.compile(r'([ \t]*line\W)|([ \t]*\d+)')
        self.pragma_pattern = re.compile(r'[ \t]*pragma\W')

    def build(self, **kwargs):
        """ Builds the lexer from the specification. Must be
            called after the lexer object is created.

            This method exists separately, because the PLY
            manual warns against calling lex.lex inside
            __init__

            kwargs are forwarded unchanged to lex.lex().
        """
        self.lexer = lex.lex(object=self, **kwargs)

    def reset_lineno(self):
        """ Resets the internal line number counter of the lexer.
        """
        self.lexer.lineno = 1

    def input(self, text):
        """ Sets the text to be tokenized by subsequent token() calls.
        """
        self.lexer.input(text)

    def token(self):
        """ Returns the next token from the underlying lexer and
            remembers it in self.last_token.
        """
        self.last_token = self.lexer.token()
        return self.last_token

    def find_tok_column(self, token):
        """ Find the column of the token in its line.

            The column is 1-based: when no newline precedes the
            token, rfind returns -1 and the result is lexpos + 1.
        """
        last_cr = self.lexer.lexdata.rfind('\n', 0, token.lexpos)
        return token.lexpos - last_cr

    ######################--   PRIVATE   --######################

    ##
    ## Internal auxiliary methods
    ##
    def _error(self, msg, token):
        """ Reports msg through error_func with the (line, column)
            of token, then skips one input character so that lexing
            can continue past the offending position.
        """
        location = self._make_tok_location(token)
        self.error_func(msg, location[0], location[1])
        self.lexer.skip(1)

    def _make_tok_location(self, token):
        """ Returns the (line number, column) pair of token.
        """
        return (token.lineno, self.find_tok_column(token))

    ##
    ## Reserved keywords
    ##
    keywords = (
        'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
        'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
        'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
        'REGISTER', 'OFFSETOF',
        'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
        'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
        'VOLATILE', 'WHILE', '__INT128',
    )

    keywords_new = (
        '_BOOL', '_COMPLEX',
        '_NORETURN', '_THREAD_LOCAL', '_STATIC_ASSERT',
        '_ATOMIC', '_ALIGNOF', '_ALIGNAS',
    )

    # Maps the source spelling of each keyword to its token type.
    keyword_map = {}

    # Classic keywords are spelled entirely in lower case ("while").
    for keyword in keywords:
        keyword_map[keyword.lower()] = keyword

    # Newer keywords keep an upper-case letter after the leading
    # underscore ("_Bool", "_Static_assert").
    for keyword in keywords_new:
        keyword_map[keyword[:2].upper() + keyword[2:].lower()] = keyword

    ##
    ## All the tokens recognized by the lexer
    ##
    tokens = keywords + keywords_new + (
        # Identifiers
        'ID',

        # Type identifiers (identifiers previously defined as
        # types with typedef)
        'TYPEID',

        # constants
        'INT_CONST_DEC', 'INT_CONST_OCT', 'INT_CONST_HEX', 'INT_CONST_BIN', 'INT_CONST_CHAR',
        'FLOAT_CONST', 'HEX_FLOAT_CONST',
        'CHAR_CONST',
        'WCHAR_CONST',
        'U8CHAR_CONST',
        'U16CHAR_CONST',
        'U32CHAR_CONST',

        # String literals
        'STRING_LITERAL',
        'WSTRING_LITERAL',
        'U8STRING_LITERAL',
        'U16STRING_LITERAL',
        'U32STRING_LITERAL',

        # Operators
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL',
        'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL',
        'OREQUAL',

        # Increment/decrement
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters
        'LPAREN', 'RPAREN',         # ( )
        'LBRACKET', 'RBRACKET',     # [ ]
        'LBRACE', 'RBRACE',         # { }
        'COMMA', 'PERIOD',          # , .
        'SEMI', 'COLON',            # ; :

        # Ellipsis (...)
        'ELLIPSIS',

        # pre-processor
        'PPHASH',       # '#'
        'PPPRAGMA',     # 'pragma'
        'PPPRAGMASTR',
    )

    ##
    ## Regexes for use in tokens
    ##
    ##

    # valid C identifiers (K&R2: A.2.3), plus '$' (supported by some compilers)
    identifier = r'[a-zA-Z_$][0-9a-zA-Z_$]*'

    hex_prefix = '0[xX]'
    hex_digits = '[0-9a-fA-F]+'
    bin_prefix = '0[bB]'
    bin_digits = '[01]+'

    # integer constants (K&R2: A.2.5.1)
    integer_suffix_opt = r'(([uU]ll)|([uU]LL)|(ll[uU]?)|(LL[uU]?)|([uU][lL])|([lL][uU]?)|[uU])?'
    decimal_constant = '(0'+integer_suffix_opt+')|([1-9][0-9]*'+integer_suffix_opt+')'
    octal_constant = '0[0-7]*'+integer_suffix_opt
    hex_constant = hex_prefix+hex_digits+integer_suffix_opt
    bin_constant = bin_prefix+bin_digits+integer_suffix_opt

    bad_octal_constant = '0[0-7]*[89]'

    # character constants (K&R2: A.2.5.2)
    # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
    # directives with Windows paths as filenames (..\..\dir\file)
    # For the same reason, decimal_escape allows all digit sequences. We want to
    # parse all correct code, even if it means to sometimes parse incorrect
    # code.
    #
    # The original regexes were taken verbatim from the C syntax definition,
    # and were later modified to avoid worst-case exponential running time.
    #
    #   simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
    #   decimal_escape = r"""(\d+)"""
    #   hex_escape = r"""(x[0-9a-fA-F]+)"""
    #   bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
    #
    # The following modifications were made to avoid the ambiguity that allowed backtracking:
    # (https://github.com/eliben/pycparser/issues/61)
    #
    # - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
    # - hex_escape allows one or more hex characters, but requires that the next character (if any) is not hex
    # - decimal_escape allows one or more decimal characters, but requires that the next character (if any) is not a decimal
    # - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
    #
    # Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
    # e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.

    simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
    decimal_escape = r"""(\d+)(?!\d)"""
    hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
    bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

    escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

    # This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
    # 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to

    escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

    cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
    char_const = "'"+cconst_char+"'"
    wchar_const = 'L'+char_const
    u8char_const = 'u8'+char_const
    u16char_const = 'u'+char_const
    u32char_const = 'U'+char_const
    multicharacter_constant = "'"+cconst_char+"{2,4}'"
    unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
    bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

    # string literals (K&R2: A.2.6)
    string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
    string_literal = '"'+string_char+'*"'
    wstring_literal = 'L'+string_literal
    u8string_literal = 'u8'+string_literal
    u16string_literal = 'u'+string_literal
    u32string_literal = 'U'+string_literal
    bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

    # floating constants (K&R2: A.2.5.3)
    exponent_part = r"""([eE][-+]?[0-9]+)"""
    fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
    floating_constant = '(((('+fractional_constant+')'+exponent_part+'?)|([0-9]+'+exponent_part+'))[FfLl]?)'
    binary_exponent_part = r'''([pP][+-]?[0-9]+)'''
    hex_fractional_constant = '((('+hex_digits+r""")?\."""+hex_digits+')|('+hex_digits+r"""\.))"""
    hex_floating_constant = '('+hex_prefix+'('+hex_digits+'|'+hex_fractional_constant+')'+binary_exponent_part+'[FfLl]?)'

    ##
    ## Lexer states: used for preprocessor \n-terminated directives
    ##
    states = (
        # ppline: preprocessor line directives
        #
        ('ppline', 'exclusive'),

        # pppragma: pragma
        #
        ('pppragma', 'exclusive'),
    )

    # A '#' starts a preprocessor directive. Peek at what follows it:
    # "line"/<number> switches to the ppline state, "pragma" switches to
    # the pppragma state (neither emits a token here); any other '#' is
    # returned as a PPHASH token.
    def t_PPHASH(self, t):
        r'[ \t]*\#'
        if self.line_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('ppline')
            self.pp_line = self.pp_filename = None
        elif self.pragma_pattern.match(t.lexer.lexdata, pos=t.lexer.lexpos):
            t.lexer.begin('pppragma')
        else:
            t.type = 'PPHASH'
            return t

    ##
    ## Rules for the ppline state
    ##

    # The quoted file name of a #line directive; it is only valid after
    # the line number has been seen.
    @TOKEN(string_literal)
    def t_ppline_FILENAME(self, t):
        if self.pp_line is None:
            self._error('filename before line number in #line', t)
        else:
            self.pp_filename = t.value.lstrip('"').rstrip('"')

    # The first number of a #line directive is the new line number.
    @TOKEN(decimal_constant)
    def t_ppline_LINE_NUMBER(self, t):
        if self.pp_line is None:
            self.pp_line = t.value
        else:
            # Ignore: GCC's cpp sometimes inserts a numeric flag
            # after the file name
            pass

    # End of the directive: apply the collected line number / file name
    # and return to the normal state.
    def t_ppline_NEWLINE(self, t):
        r'\n'
        if self.pp_line is None:
            self._error('line number missing in #line', t)
        else:
            self.lexer.lineno = int(self.pp_line)

            if self.pp_filename is not None:
                self.filename = self.pp_filename

        t.lexer.begin('INITIAL')

    # The optional "line" keyword itself carries no information.
    def t_ppline_PPLINE(self, t):
        r'line'
        pass

    t_ppline_ignore = ' \t'

    def t_ppline_error(self, t):
        self._error('invalid #line directive', t)

    ##
    ## Rules for the pppragma state
    ##

    # End of the pragma line: count it and return to the normal state.
    def t_pppragma_NEWLINE(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.begin('INITIAL')

    def t_pppragma_PPPRAGMA(self, t):
        r'pragma'
        return t

    t_pppragma_ignore = ' \t'

    # Everything after "pragma" up to the end of the line is returned
    # verbatim as a single PPPRAGMASTR token.
    def t_pppragma_STR(self, t):
        '.+'
        t.type = 'PPPRAGMASTR'
        return t

    def t_pppragma_error(self, t):
        self._error('invalid #pragma directive', t)

    ##
    ## Rules for the normal state
    ##
    t_ignore = ' \t'

    # Newlines
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Operators
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_TIMES = r'\*'
    t_DIVIDE = r'/'
    t_MOD = r'%'
    t_OR = r'\|'
    t_AND = r'&'
    t_NOT = r'~'
    t_XOR = r'\^'
    t_LSHIFT = r'<<'
    t_RSHIFT = r'>>'
    t_LOR = r'\|\|'
    t_LAND = r'&&'
    t_LNOT = r'!'
    t_LT = r'<'
    t_GT = r'>'
    t_LE = r'<='
    t_GE = r'>='
    t_EQ = r'=='
    t_NE = r'!='

    # Assignment operators
    t_EQUALS = r'='
    t_TIMESEQUAL = r'\*='
    t_DIVEQUAL = r'/='
    t_MODEQUAL = r'%='
    t_PLUSEQUAL = r'\+='
    t_MINUSEQUAL = r'-='
    t_LSHIFTEQUAL = r'<<='
    t_RSHIFTEQUAL = r'>>='
    t_ANDEQUAL = r'&='
    t_OREQUAL = r'\|='
    t_XOREQUAL = r'\^='

    # Increment/decrement
    t_PLUSPLUS = r'\+\+'
    t_MINUSMINUS = r'--'

    # ->
    t_ARROW = r'->'

    # ?
    t_CONDOP = r'\?'

    # Delimiters
    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_LBRACKET = r'\['
    t_RBRACKET = r'\]'
    t_COMMA = r','
    t_PERIOD = r'\.'
    t_SEMI = r';'
    t_COLON = r':'
    t_ELLIPSIS = r'\.\.\.'

    # Scope delimiters
    # To see why on_lbrace_func is needed, consider:
    #   typedef char TT;
    #   void foo(int TT) { TT = 10; }
    #   TT x = 5;
    # Outside the function, TT is a typedef, but inside (starting and ending
    # with the braces) it's a parameter.  The trouble begins with yacc's
    # lookahead token.  If we open a new scope in brace_open, then TT has
    # already been read and incorrectly interpreted as TYPEID.  So, we need
    # to open and close scopes from within the lexer.
    # Similar for the TT immediately outside the end of the function.
    #
    @TOKEN(r'\{')
    def t_LBRACE(self, t):
        self.on_lbrace_func()
        return t

    @TOKEN(r'\}')
    def t_RBRACE(self, t):
        self.on_rbrace_func()
        return t

    t_STRING_LITERAL = string_literal

    # The following floating and integer constants are defined as
    # functions to impose a strict order (otherwise, decimal
    # is placed before the others because its regex is longer,
    # and this is bad)
    #
    @TOKEN(floating_constant)
    def t_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_floating_constant)
    def t_HEX_FLOAT_CONST(self, t):
        return t

    @TOKEN(hex_constant)
    def t_INT_CONST_HEX(self, t):
        return t

    @TOKEN(bin_constant)
    def t_INT_CONST_BIN(self, t):
        return t

    @TOKEN(bad_octal_constant)
    def t_BAD_CONST_OCT(self, t):
        msg = "Invalid octal constant"
        self._error(msg, t)

    @TOKEN(octal_constant)
    def t_INT_CONST_OCT(self, t):
        return t

    @TOKEN(decimal_constant)
    def t_INT_CONST_DEC(self, t):
        return t

    # Must come before bad_char_const, to prevent it from
    # catching valid char constants as invalid
    #
    @TOKEN(multicharacter_constant)
    def t_INT_CONST_CHAR(self, t):
        return t

    @TOKEN(char_const)
    def t_CHAR_CONST(self, t):
        return t

    @TOKEN(wchar_const)
    def t_WCHAR_CONST(self, t):
        return t

    @TOKEN(u8char_const)
    def t_U8CHAR_CONST(self, t):
        return t

    @TOKEN(u16char_const)
    def t_U16CHAR_CONST(self, t):
        return t

    @TOKEN(u32char_const)
    def t_U32CHAR_CONST(self, t):
        return t

    @TOKEN(unmatched_quote)
    def t_UNMATCHED_QUOTE(self, t):
        msg = "Unmatched '"
        self._error(msg, t)

    @TOKEN(bad_char_const)
    def t_BAD_CHAR_CONST(self, t):
        msg = "Invalid char constant %s" % t.value
        self._error(msg, t)

    @TOKEN(wstring_literal)
    def t_WSTRING_LITERAL(self, t):
        return t

    @TOKEN(u8string_literal)
    def t_U8STRING_LITERAL(self, t):
        return t

    @TOKEN(u16string_literal)
    def t_U16STRING_LITERAL(self, t):
        return t

    @TOKEN(u32string_literal)
    def t_U32STRING_LITERAL(self, t):
        return t

    # unmatched string literals are caught by the preprocessor

    @TOKEN(bad_string_literal)
    def t_BAD_STRING_LITERAL(self, t):
        msg = "String contains invalid escape code"
        self._error(msg, t)

    # An identifier is a keyword if keyword_map knows its spelling, a
    # TYPEID if type_lookup_func reports it as a typedef name, and a
    # plain ID otherwise.
    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.keyword_map.get(t.value, "ID")
        if t.type == 'ID' and self.type_lookup_func(t.value):
            t.type = "TYPEID"
        return t

    def t_error(self, t):
        msg = 'Illegal character %s' % repr(t.value[0])
        self._error(msg, t)