Created
May 26, 2017 10:39
-
-
Save vmarkovtsev/6371e34cd6b7e895fbb31376d4eedbfa to your computer and use it in GitHub Desktop.
Identifier splitting algorithm from the paper "Topic modeling of public repositories at scale using names in source code"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Any run of characters other than ASCII letters (digits, underscores,
# punctuation, whitespace, ...) separates the parts of an identifier.
NAME_BREAKUP_RE = re.compile(r"[^a-zA-Z]+")


def extract_names(token):
    """Split an identifier into lowercase sub-names.

    The token is stripped, cut on every run of non-letter characters, and
    each resulting part is further cut on case transitions (camelCase and
    upper-case runs such as ``XMLHttp``).

    Sub-names shorter than three characters are never yielded on their own.
    Instead, the most recent short sub-name is remembered and, right after
    the next sub-name of three or more characters is yielded, the
    concatenation ``short + long`` is yielded as well. A short sub-name that
    is followed only by other short sub-names (or by nothing) is dropped.

    Args:
        token: The identifier to split (a ``str``).

    Yields:
        Lowercase sub-names, in order of appearance.

    Examples:
        >>> list(extract_names("foo_BAR"))
        ['foo', 'bar']
        >>> list(extract_names("methodBase"))
        ['method', 'base']
        >>> list(extract_names("is_valid"))
        ['valid', 'isvalid']
    """
    token = token.strip()
    # Most recent sub-name that was too short to be yielded by itself.
    pending = ""

    def emit(name):
        nonlocal pending
        lowered = name.lower()
        if len(name) >= 3:
            yield lowered
            if pending:
                # Glue the remembered short prefix onto this sub-name.
                yield pending + lowered
                pending = ""
        else:
            pending = lowered

    for part in NAME_BREAKUP_RE.split(token):
        if not part:
            # re.split produces empty strings at the edges of the token.
            continue
        prev = part[0]
        pos = 0  # start index of the sub-name currently being scanned
        for i, this in enumerate(part[1:], 1):
            if prev.islower() and this.isupper():
                # lower -> UPPER: the capital starts a new sub-name.
                yield from emit(part[pos:i])
                pos = i
            elif prev.isupper() and this.islower():
                # UPPER -> lower: `prev` is a capital followed by lowercase.
                before_capital = i - 1 - pos
                if 0 < before_capital <= 3:
                    # One to three characters precede the capital: emit them
                    # and let the capital begin the next sub-name.
                    yield from emit(part[pos:i - 1])
                    pos = i - 1
                elif before_capital > 0:
                    # More than three characters precede the capital: the
                    # capital stays with them and the next sub-name starts
                    # at the lowercase letter.
                    yield from emit(part[pos:i])
                    pos = i
                # before_capital == 0: the capital already starts the
                # current sub-name, nothing to cut.
            prev = this
        last = part[pos:]
        if last:
            yield from emit(last)
# Demo: print the sub-names extracted from two sample identifiers.
print(list(extract_names("foo_BAR")))  # ['foo', 'bar']
print(list(extract_names("methodBase")))  # ['method', 'base']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment