-
-
Save dcolish/484731 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ----------------------------------------------------------------------
# phplex.py
#
# A lexer for PHP.
# ----------------------------------------------------------------------
import ply.lex as lex
# TODO: literal html
# TODO: double-quoted strings
# TODO: number literals (LNUMBER, DNUMBER)
# TODO: heredocs
# TODO: backticks
# TODO: namespaces
# TODO: casts
# TODO: "die" as alias for "exit"
# TODO: BAD_CHARACTER
# TODO: CURLY_OPEN, DOLLAR_OPEN_CURLY_BRACES, STRING_VARNAME
# TODO: <script> syntax (does anyone use this?)
# TODO: HALT_COMPILER (??)

# Reserved words
# Each entry is the token type emitted for the PHP keyword of the same
# (case-insensitive) spelling; t_STRING performs the lookup.
reserved = (
    'ARRAY', 'AS', 'BREAK', 'CASE', 'CLASS', 'CONST', 'CONTINUE', 'DECLARE',
    'DEFAULT', 'DO', 'ECHO', 'ELSE', 'ELSEIF', 'EMPTY', 'ENDDECLARE',
    'ENDFOR', 'ENDFOREACH', 'ENDIF', 'ENDSWITCH', 'ENDWHILE', 'EVAL', 'EXIT',
    'EXTENDS', 'FOR', 'FOREACH', 'FUNCTION', 'GLOBAL', 'IF', 'INCLUDE',
    'INCLUDE_ONCE', 'INSTANCEOF', 'ISSET', 'LIST', 'NEW', 'PRINT', 'REQUIRE',
    'REQUIRE_ONCE', 'RETURN', 'STATIC', 'SWITCH', 'UNSET', 'USE', 'VAR',
    'WHILE', 'FINAL', 'INTERFACE', 'IMPLEMENTS', 'PUBLIC', 'PRIVATE',
    'PROTECTED', 'ABSTRACT', 'CLONE', 'TRY', 'CATCH', 'THROW', 'CFUNCTION',
    'OLD_FUNCTION',
)

# Every token type this lexer can produce: the keywords above followed by
# the operator, comment, tag, identifier and literal tokens.
tokens = reserved + (
    # Generic
    'WHITESPACE', 'OP',

    # Operators
    'SL', 'SR', 'BOOLEAN_OR', 'BOOLEAN_AND', 'IS_SMALLER_OR_EQUAL',
    'IS_GREATER_OR_EQUAL', 'IS_EQUAL', 'IS_NOT_EQUAL', 'IS_IDENTICAL',
    'IS_NOT_IDENTICAL',

    # Assignment operators
    'MUL_EQUAL', 'DIV_EQUAL', 'MOD_EQUAL', 'PLUS_EQUAL', 'MINUS_EQUAL',
    'SL_EQUAL', 'SR_EQUAL', 'AND_EQUAL', 'OR_EQUAL', 'XOR_EQUAL',
    'CONCAT_EQUAL',

    # Increment/decrement
    'INC', 'DEC',

    # Arrows
    'OBJECT_OPERATOR', 'DOUBLE_ARROW', 'DOUBLE_COLON',

    # Comments
    'COMMENT', 'DOC_COMMENT',

    # Escaping from HTML
    # NOTE: the comma after 'CLOSE_TAG' is essential. Without it Python
    # joins the adjacent literals into the single name 'CLOSE_TAGDIR' and
    # neither CLOSE_TAG nor DIR is declared.
    'OPEN_TAG', 'OPEN_TAG_WITH_ECHO', 'CLOSE_TAG',

    # Identifiers and reserved words
    'DIR', 'FILE', 'LINE', 'FUNC_C', 'CLASS_C', 'METHOD_C', 'NS_C',
    'LOGICAL_AND', 'LOGICAL_OR', 'LOGICAL_XOR',
    'STRING', 'VARIABLE',
    'LNUMBER', 'DNUMBER',
    'CONSTANT_ENCAPSED_STRING',
)
# Newlines | |
def t_WHITESPACE(t):
    r'[ \t\r\n]+'
    # The docstring above is the rule's regular expression (PLY reads it
    # from __doc__), so explanatory text has to live in comments.
    # Whitespace is returned as a token rather than discarded; the line
    # counter is advanced by the number of newlines in the run.
    t.lexer.lineno += t.value.count("\n")
    return t
# Assignment operators | |
# Compound assignment operators. Each rule's docstring is its regular
# expression and the token is returned unchanged. The three-character
# shift assignments come first so they are tried before any shorter
# operator that shares their prefix.
def t_SL_EQUAL(t): r'<<='; return t
def t_SR_EQUAL(t): r'>>='; return t
def t_AND_EQUAL(t): r'&='; return t
def t_OR_EQUAL(t): r'\|='; return t
def t_XOR_EQUAL(t): r'\^='; return t
def t_MUL_EQUAL(t): r'\*='; return t
def t_DIV_EQUAL(t): r'/='; return t
def t_MOD_EQUAL(t): r'%='; return t
def t_PLUS_EQUAL(t): r'\+='; return t
def t_MINUS_EQUAL(t): r'-='; return t
def t_CONCAT_EQUAL(t): r'\.='; return t
# Operators | |
# Multi-character comparison, shift and boolean operators. Order matters:
# '===' and '!==' are defined ahead of '==' and '!=' so the longer
# spelling is tried first.
def t_SL(t): r'<<'; return t
def t_SR(t): r'>>'; return t
def t_BOOLEAN_AND(t): r'&&'; return t
def t_BOOLEAN_OR(t): r'\|\|'; return t
def t_IS_SMALLER_OR_EQUAL(t): r'<='; return t
def t_IS_GREATER_OR_EQUAL(t): r'>='; return t
def t_IS_IDENTICAL(t): r'==='; return t
def t_IS_NOT_IDENTICAL(t): r'!=='; return t
def t_IS_EQUAL(t): r'=='; return t
# '<>' is accepted as an alternative spelling of '!='.
def t_IS_NOT_EQUAL(t): r'(!=)|(<>)'; return t
# Increment/decrement | |
# '++' and '--'; the docstring of each rule is its regular expression.
def t_INC(t): r'\+\+'; return t
def t_DEC(t): r'--'; return t
# Arrows | |
# Member access '->', key/value '=>' and scope resolution '::'.
def t_OBJECT_OPERATOR(t): r'->'; return t
def t_DOUBLE_ARROW(t): r'=>'; return t
def t_DOUBLE_COLON(t): r'::'; return t
# Comments | |
def t_DOC_COMMENT(t):
    r'/\*\*(?=[ \t\r\n])(.|\n)*?\*/'
    # A doc comment is '/**' followed by whitespace, up to the first '*/'.
    # The lookahead stops the empty comment '/**/' from being taken as the
    # start of a doc comment that would then swallow everything up to the
    # next '*/' in the file; such input is left for t_COMMENT.
    t.lexer.lineno += t.value.count("\n")
    return t
def t_COMMENT(t):
    r'(/\*(.|\n)*?\*/)|(//.*?(\n|$))|(\#.*?(\n|$))'
    # Three comment forms: '/* ... */' blocks, and '//' or '#' comments
    # running to the end of the line. The line forms also end at the end
    # of input, so a comment on the last line is recognised even without
    # a trailing newline. '#' is escaped because the pattern is compiled
    # in verbose mode, where a bare '#' starts a regex comment.
    t.lexer.lineno += t.value.count("\n")
    return t
# Escaping from HTML | |
def t_OPEN_TAG(t):
    r'<[?%]((php)|=)?\n?'
    # Matches '<?', '<%', '<?php', '<?=' and '<%=' plus one optional
    # newline. The echo forms are retagged OPEN_TAG_WITH_ECHO; the
    # optional newline is stripped before the test so that '<?=' followed
    # directly by a line break is still recognised.
    if t.value.rstrip('\n').endswith('='):
        t.type = 'OPEN_TAG_WITH_ECHO'
    t.lexer.lineno += t.value.count("\n")
    return t
def t_CLOSE_TAG(t):
    r'[?%]>\n?'
    # '?>' or '%>' together with at most one directly following newline,
    # which is counted towards the current line number.
    t.lexer.lineno += t.value.count("\n")
    return t
# Identifiers and reserved words | |
# Lookup table used by t_STRING: upper-cased identifier text -> token type.
# Magic constants and the word operators have token names that differ
# from their spelling, so they are listed explicitly.
reserved_map = {
    '__DIR__': 'DIR',
    '__FILE__': 'FILE',
    '__LINE__': 'LINE',
    '__FUNCTION__': 'FUNC_C',
    '__CLASS__': 'CLASS_C',
    '__METHOD__': 'METHOD_C',
    '__NAMESPACE__': 'NS_C',
    'AND': 'LOGICAL_AND',
    'OR': 'LOGICAL_OR',
    'XOR': 'LOGICAL_XOR',
}

# Every plain keyword maps to the token type of the same name. update()
# is used instead of a for-loop so no loop variable is left behind in the
# module namespace.
reserved_map.update((word, word) for word in reserved)
# Identifier | |
def t_STRING(t):
    r'[A-Za-z_][\w_]*'
    # Identifiers and keywords share one rule. The text is upper-cased
    # before the lookup, so keywords and magic constants are recognised in
    # any letter case; anything not in reserved_map stays a plain STRING.
    t.type = reserved_map.get(t.value.upper(), 'STRING')
    return t
# Variable | |
def t_VARIABLE(t):
    r'\$[A-Za-z_][\w_]*'
    # '$' followed by an identifier; the token value keeps the '$'.
    return t
# Floating literal (todo)
# Defined BEFORE t_LNUMBER on purpose: PLY tries function rules in the
# order they appear in the file, and the integer pattern also matches the
# leading digits of a float. With the integer rule first, '1.5' would be
# split into LNUMBER '1', OP '.', LNUMBER '5'.
def t_DNUMBER(t):
    r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
    # The blanks around '|' are insignificant: PLY compiles rule patterns
    # in verbose mode.
    return t

# Integer literal (todo)
def t_LNUMBER(t):
    r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'
    return t
# String literal | |
def t_CONSTANT_ENCAPSED_STRING(t):
    r'(\"([^\\\n]|(\\.))*?\")|(\'([^\\\n]|(\\.))*?\')'
    # A double- or single-quoted string on one line. Inside the quotes a
    # backslash always consumes the following character, so an escaped
    # quote does not end the literal. The token value includes the quotes.
    return t
# Simple operator | |
def t_OP(t):
    r'[\(\)\{\}\[\]+\-/*%^&|~=<>.!,?:;@$]'
    # Catch-all for single-character punctuation, defined after every
    # multi-character operator so those are tried first.
    # '-' is escaped so the class lists it explicitly rather than through
    # an accidental '+' .. '/' range. '$' is included so that a lone '$'
    # (as in '$$name') is tokenised rather than reported as illegal;
    # ordinary variables are still claimed earlier by t_VARIABLE.
    t.type = 'OP'
    return t
def t_error(t):
    # Invoked when no rule matches at the current position: report the
    # offending character and step over it so lexing can continue.
    print("Illegal character %r" % (t.value[0],))
    t.lexer.skip(1)
# Build the lexer from the token rules defined in this module.
# NOTE(review): optimize=1 selects PLY's optimized mode; see the PLY
# documentation for its effect on rule validation and table caching.
lexer = lex.lex(optimize=1)

if __name__ == "__main__":
    # Run as a script: hand the lexer to PLY's built-in driver.
    lex.runmain(lexer)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment