@dvirsky
Last active June 28, 2017 08:19
Pre-tokenize source code for searching
import re
import itertools
import sys


def snake_case_split(ident):
    """
    Split a snake_case identifier into words, returning the original ident and
    its splits as a list.
    """
    splits = list(filter(None, re.split('_', ident)))
    if len(splits) <= 1:
        return [ident]
    splits.append(ident)
    return splits
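
# Illustrative examples (not part of the original gist) of what snake_case_split
# returns, based on the function above:
#   snake_case_split('tokenize_code')  ->  ['tokenize', 'code', 'tokenize_code']
#   snake_case_split('plain')          ->  ['plain']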


def camel_case_split(ident):
    """
    Split a camelCase identifier into words, returning the original ident and
    its splits.
    """
    matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', ident)
    splits = [m.group(0) for m in matches]
    if len(splits) <= 1:
        return [ident]
    splits.append(ident)
    return splits
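
# Illustrative examples (not part of the original gist) of what camel_case_split
# returns, based on the regex above:
#   camel_case_split('fileName')    ->  ['file', 'Name', 'fileName']
#   camel_case_split('HTTPServer')  ->  ['HTTP', 'Server', 'HTTPServer']
#   camel_case_split('lower')       ->  ['lower']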


def tokenize_code(fileName):
    """
    Open a file and return its expanded tokenized version: non-alphanumeric
    characters are removed and camelCase/snake_case identifiers are split.
    """
    with open(fileName) as f:
        lines = []
        for line in f:
            toks = list(filter(None, re.split(r'\W', line)))
            if not toks:
                continue
            toks = list(itertools.chain(*(camel_case_split(x) for x in toks)))
            toks = list(itertools.chain(*(snake_case_split(x) for x in toks)))
            lines.append(toks)
        return lines


if __name__ == '__main__':
    print('\n'.join(' '.join(line) for line in tokenize_code(sys.argv[1])))
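
The script takes a single file name on the command line and prints one line of expanded tokens per non-empty source line. Because every split identifier is emitted alongside its parts, a search for either "snake" or "snake_case_split" hits the same line. Below is a minimal sketch of using the functions directly; the module name "pretokenize" and the file "example.py" are placeholders, not names from the gist:

# Hypothetical usage sketch: print the expanded lines whose tokens contain "case".
from pretokenize import tokenize_code

for toks in tokenize_code('example.py'):
    if 'case' in toks:
        print(' '.join(toks))

For reference, the listing that follows is the script's own source after tokenization (the output of running it on itself):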
import re
import itertools
import sys
def snake case split snake_case_split ident
Split a snake case identifier into words returning the original ident and its splits as a list
splits filter None re split _ ident
if len splits 1
return ident
splits append ident
return splits
def camel case split camel_case_split ident
Split a camel case identifier into words returning the original ident and its splits
matches re finditer a z A Z A Z A Z a z ident
splits m group 0 for m in matches
if len splits 1
return ident
splits append ident
return splits
def tokenize code tokenize_code file Name fileName
Open a file name and return it s expanded tokenized version by removing non alphanumeric stuff
and splitting camel snake case
with open file Name fileName as f
lines
for line in f
toks filter None re split W line
if not toks
continue
toks itertools chain camel case split camel_case_split x for x in toks
toks itertools chain snake case split snake_case_split x for x in toks
lines append toks
return lines
if __name__ __main__
print n join join line for line in tokenize code tokenize_code sys argv 1