Quick and dirty Objective-C tokenizer in Python
import re
from collections import deque, namedtuple

# Tokenize per the C preprocessor, more or less...
TOKENIZER = re.compile(r'''
    (?P<WHITESPACE>\s+) |
    (?P<COMMENT>(?://[^\n]* | /\*.*?\*/)) |
    (?P<PREP>\#\s*[a-z][^\n]*) |
    (?P<OBJC>@[a-z]+) |
    (?P<NUMBER>(?:\.?\d(?:[eEpP][+-]|[a-zA-Z0-9_.])*)) |
    (?P<STRING>[L@]?"(?:[^"\n\\]|\\.)*") |
    (?P<CHAR>L?'(?:[^'\\]|\\.)') |
    (?P<OP>(?:
        @(?:YES|NO|[{(\[]) | \^{ |
        \.\.\. | <<= | >>= | != | \#\# | %= | && | &= | \*= |
        \+\+ | \+= | -- | -= | -> | /= | << | <= | == |
        \|\| | >= | >> | \^= | \|= |
        [][!#%&()*+,-./:;<=>?^{|}~]
    )) |
    (?P<IDENT>[A-Za-z_][A-Za-z0-9_]*) |
    (?P<OTHER>.)
    ''', re.DOTALL | re.VERBOSE)
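# Each match fills exactly one named group; for example, TOKENIZER.match('@"hi"')
# should report 'STRING' as its lastgroup, and '0x1.8p3f' should match NUMBER.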
BRACKETS = {
    '[':']', '{':'}', '(':')', '@[':']', '@{':'}', '@(':')', '^{':'}'
}
CLOSE_BRACKETS = BRACKETS.values()
Token = namedtuple("Token", "key val lnum")
def replace_trigraphs(text):
    # Replace the nine ISO trigraph sequences (??=, ??(, ??/, ??', ??<,
    # ??!, ??>, ??-, ??)) with the single characters they stand for.
    def repl(match):
        return {
            '(':'[', ')':']', '<':'{', '>':'}',
            "'":'^', '!':'|', '-':'~', '=':'#', '/':'\\',
        }[match.group(0)[-1]]
    return re.sub(r"\?\?[()<>/'!\-=]", repl, text)
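# For example, replace_trigraphs("??(0??)") should return "[0]", since ??( and
# ??) stand for '[' and ']'.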
def splice_lines(text):
    # Join lines ending in a backslash with their continuation lines, and
    # record the original (physical) line number of each logical line.
    line_iter = enumerate(text.splitlines(), 1)
    lines = []
    line_nums = []
    for line_num, line in line_iter:
        line_nums.append(line_num)
        while line.endswith('\\'):
            line = line[:-1] + next(line_iter)[1]
        lines.append(line)
    return '\n'.join(lines), line_nums
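# For example, splice_lines("a \\\nb\nc") should return ('a b\nc', [1, 3]):
# the backslash continuation collapses physical lines 1-2 into one logical line.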
def tokenize(text):
    text, line_nums = splice_lines(replace_trigraphs(text))
    num_nls = 0
    for match in re.finditer(TOKENIZER, text):
        items = [(k, v) for (k, v) in match.groupdict().items()
                 if v is not None]
        assert len(items) == 1
        key, val = items[0]
        if key != 'WHITESPACE':
            yield Token(key, val.replace('\\\n', ''), line_nums[num_nls])
        # Every newline seen so far (whitespace included) advances the
        # index into line_nums, which maps logical lines back to physical ones.
        num_nls += val.count('\n')
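A minimal usage sketch, assuming an illustrative Objective-C snippet as input: tokenize() yields Token(key, val, lnum) tuples, one per non-whitespace token.

if __name__ == '__main__':
    # Hypothetical sample input, just to exercise the tokenizer.
    sample = '''\
#import <Foundation/Foundation.h>
// greet the world
NSString *s = @"hello";  /* a string literal */
NSLog(@"%@", s);
'''
    for tok in tokenize(sample):
        print("%3d %-10s %r" % (tok.lnum, tok.key, tok.val))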