Pre-tokenize source code for searching
import re
import itertools
import sys


def snake_case_split(ident):
    """
    Split a snake_case identifier into words, returning the original ident and its splits as a list.
    """
    splits = list(filter(None, re.split('_', ident)))
    if len(splits) <= 1:
        return [ident]
    splits.append(ident)
    return splits


def camel_case_split(ident):
    """
    Split a camelCase identifier into words, returning the original ident and its splits as a list.
    """
    # Break at lower->Upper and Upper->UpperLower transitions, or at the end of the string
    matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', ident)
    splits = [m.group(0) for m in matches]
    if len(splits) <= 1:
        return [ident]
    splits.append(ident)
    return splits


def tokenize_code(fileName):
    """
    Open a file and return its expanded, tokenized lines: non-alphanumeric characters are removed,
    and camelCase/snake_case identifiers are split into their component words.
    """
    with open(fileName) as f:
        lines = []
        for line in f:
            # Split on non-word characters and drop empty tokens
            toks = list(filter(None, re.split(r'\W', line)))
            if not toks:
                continue
            # Expand camelCase tokens, then snake_case tokens, keeping the originals alongside
            toks = list(itertools.chain(*(camel_case_split(x) for x in toks)))
            toks = list(itertools.chain(*(snake_case_split(x) for x in toks)))
            lines.append(toks)
        return lines


if __name__ == '__main__':
    print('\n'.join(' '.join(line) for line in tokenize_code(sys.argv[1])))
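
As a quick illustration of the output, consider a small, hypothetical input file (the file name, its contents, and the script name tokenize.py below are assumptions for the example):

def getUserName(user_id):
    return lookup_user(user_id)

Running python tokenize.py example.py prints one space-joined line of tokens per source line, keeping each original identifier next to its expanded parts:

def get User Name getUserName user id user_id
return lookup user lookup_user user id user_id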