Pre-tokenize source code for searching
import re
import itertools
import sys


def snake_case_split(ident):
    """
    Split a snake_case identifier into words, returning the original ident and its splits as a list.
    """
    splits = list(filter(None, re.split('_', ident)))
    if len(splits) <= 1:
        return [ident]
    splits.append(ident)
    return splits


def camel_case_split(ident):
    """
    Split a camelCase identifier into words, returning the original ident and its splits as a list.
    """
    # Break at lower->Upper and Upper->UpperLower transitions, or at the end of the string
    matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', ident)
    splits = [m.group(0) for m in matches]
    if len(splits) <= 1:
        return [ident]
    splits.append(ident)
    return splits


def tokenize_code(fileName):
    """
    Open a file and return its expanded, tokenized lines: non-alphanumeric characters are removed,
    and camelCase/snake_case identifiers are split into their component words.
    """
    with open(fileName) as f:
        lines = []
        for line in f:
            # Split on non-word characters and drop empty tokens
            toks = list(filter(None, re.split(r'\W', line)))
            if not toks:
                continue
            # Expand camelCase tokens, then snake_case tokens, keeping the originals alongside
            toks = list(itertools.chain(*(camel_case_split(x) for x in toks)))
            toks = list(itertools.chain(*(snake_case_split(x) for x in toks)))
            lines.append(toks)
        return lines


if __name__ == '__main__':
    print('\n'.join(' '.join(line) for line in tokenize_code(sys.argv[1])))
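
As a quick illustration of the output, consider a small, hypothetical input file (the file name, its contents, and the script name tokenize.py below are assumptions for the example):

def getUserName(user_id):
    return lookup_user(user_id)

Running python tokenize.py example.py prints one space-joined line of tokens per source line, keeping each original identifier next to its expanded parts:

def get User Name getUserName user id user_id
return lookup user lookup_user user id user_id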