Skip to content

Instantly share code, notes, and snippets.

@saswata-dutta
Created August 16, 2020 18:40
Show Gist options
  • Save saswata-dutta/6decc2c35ef44b9de627d22d99ead59c to your computer and use it in GitHub Desktop.
Save saswata-dutta/6decc2c35ef44b9de627d22d99ead59c to your computer and use it in GitHub Desktop.
import re
import sys
def getWords(subject):
    """Split *subject* into a list of word tokens.

    Every character matching the regex class ``\\W`` (anything that is not
    a letter, digit, or underscore) is turned into a space, and the result
    is split on whitespace, so punctuation never ends up inside a token
    and no empty tokens are produced.
    """
    spaced = re.sub(r"\W", " ", subject)
    return spaced.split()
def isDigit(ch):
    """Return whether *ch* is a digit, as decided by its ``isdigit()`` method."""
    verdict = ch.isdigit()
    return verdict
def isIdChar(ch):
    """Return True when *ch* lies in ``'A'..'Z'`` or ``'0'..'9'``.

    Lowercase letters, underscores, and punctuation are all rejected.
    """
    if 'A' <= ch <= 'Z':
        return True
    return '0' <= ch <= '9'
def maybeAccountId(word):
    """Return True when *word* has the shape of an account id.

    A word qualifies when all three hold:
      * its length is strictly between 7 and 15 (i.e. 8 to 14 characters);
      * every character satisfies ``isIdChar``;
      * at least one character satisfies ``isDigit``.
    """
    if not 7 < len(word) < 15:
        return False
    if not all(isIdChar(ch) for ch in word):
        return False
    return any(isDigit(ch) for ch in word)
def getPrefixBiGram(index, items):
    """Return the (up to) two items preceding ``items[index]`` as one string.

    The preceding items are joined with a single space, stripped, and
    lower-cased. Fewer than two items are used when *index* is 0 or 1,
    so the result can be a single word or the empty string.
    """
    # Clamp the window start so it never reaches before the first item.
    begin = index - 2 if index > 2 else 0
    preceding = items[begin:index]
    joined = " ".join(preceding)
    return joined.strip().lower()
def getPrefixBiGrams(needle, haystack):
    """Return the prefix bi-gram for every occurrence of *needle* in *haystack*.

    The result is a list with one entry per position where an item of
    *haystack* equals *needle*, in positional order; each entry is what
    ``getPrefixBiGram`` returns for that position.
    """
    return [
        getPrefixBiGram(position, haystack)
        for position, word in enumerate(haystack)
        if word == needle
    ]
def process(data):
    """Print one ``<is-truth>,<prefix bi-gram>`` line per account-id candidate.

    *data* is a single record of the form ``<truth>,<thread subject>``.
    The subject is tokenised with ``getWords``, every token accepted by
    ``maybeAccountId`` is treated as a candidate, and for each occurrence
    of each candidate a line is printed containing whether the candidate
    equals *truth* (``True``/``False``) followed by the bi-gram that
    precedes that occurrence.

    Raises:
        ValueError: if *data* contains no comma separating the truth
            value from the subject.
    """
    # Split on the first comma only: the subject may itself contain
    # commas, which previously broke the two-name unpacking.
    truth, separator, threadSubject = data.partition(",")
    if not separator:
        raise ValueError(
            f"expected '<truth>,<subject>' but found no comma: {data!r}"
        )

    words = getWords(threadSubject)

    # De-duplicate candidates while keeping first-occurrence order, so the
    # output order is the same on every run (set iteration order is not).
    candidates = dict.fromkeys(filter(maybeAccountId, words))

    for item in candidates:
        for biGram in getPrefixBiGrams(item, words):
            print(f"{item == truth},{biGram}")
def main():
    """Run ``process`` on every non-blank line of the file named in argv[1].

    Each line is stripped of surrounding whitespace first; lines that are
    empty after stripping are skipped. If no file path was supplied on the
    command line, the script exits with a usage message instead of failing
    with an IndexError.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "script"
        # sys.exit with a string writes it to stderr and exits with status 1.
        sys.exit(f"usage: {prog} <input-file>")

    filepath = sys.argv[1]
    with open(filepath) as fp:
        for line in fp:
            data = line.strip()
            if data:
                process(data)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment