Last active
November 8, 2015 14:01
-
-
Save ningsuhen/3a1189db064fd1e351b2 to your computer and use it in GitHub Desktop.
Dan Jurafsky's NLP SpamLord
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import re | |
import pprint | |
# First-draft pattern for plain ``user@host.edu`` addresses. process_file
# does not use it.
my_first_pat = r'(\w+)@(\w+).edu'

# Main e-mail pattern (process_file compiles it with re.IGNORECASE).
#   group 1      local part: plain letters/digits/dots, or letters
#                interleaved with dashes ("d-l-w-h-"); the lookbehind rejects
#                candidates ending in "Server" or "cience".
#   separator    "@", the word "at" fenced by spaces/parens/"%20" characters,
#                "WHERE", or "(followed by ...@".
#   groups 2-4   two or three domain labels joined by ".", ";", "dot",
#                "DOM" or " dt ". Group 4 is optional.
#   fallback     the literal "cs stanford edu", which leaves groups 2-4 empty.
# NOTE(review): "@" appears twice in the separator alternation, so the second
# one is redundant as written. It may have been an HTML entity such as
# "&#x40;" before this text was copied -- confirm against the original.
emails_pat_str = r'(?:^|[ :">]+)((?:-?[a-z]-?){2,}|[a-z0-9\.]+)(?<!Server|cience) ?(?:@|[%20\( ]+at[\) %20]+|WHERE|@|\(followed by .*@) ?(?:((?:-?[a-z]-?){2,}|[a-z0-9-]+) ?(?:\.|;|[\(%20]*dot[\(%20]*|DOM| dt ) ?((?:-?[a-z]-?){2,}|[a-z]+)(?: ?(?:\.|;|\(?dot[\)]?) ?((?:-?[a-z]-?){2,}|[a-z]+))?|cs stanford edu)'

# Phone pattern: optional "+NN" prefix, optionally parenthesised 3-digit area
# code, then 3 and 4 digits. Captures the three digit groups.
phones_pat_str = r'(?:\+[0-9]{1,2})? ?\(?([0-9]{3})[\)\- ]+([0-9]{3})[- ]([0-9]{4})'

# JavaScript-obfuscated e-mails: <script> obfuscate('domain','user'); </script>.
# Captures both quoted arguments in source order; process_file reverses them.
emails_pat_str2 = r"<script>[ ]?(?:obfuscate)\('([a-z\.]+)','([a-z\.]+)'\);?[ ]?</script>"


def process_file(name, f):
    """Scan a file line by line for e-mail addresses and phone numbers.

    Args:
        name: File name to record in every returned tuple.
        f: Iterable of text lines (an open file; a StringIO object at
            submission time).

    Returns:
        A list of ``(name, type, value)`` tuples, where ``type`` is ``'e'``
        for an e-mail or ``'p'`` for a phone number. Values are rendered in
        the canonical formats::

            (name, 'p', '###-###-####')
            (name, 'e', 'someone@something')

        Within each line, matches of the main e-mail pattern come first, then
        the ``obfuscate(...)`` script matches, then phone numbers.

    NOTE: ***don't change this interface***, as it is called directly by the
    submit script.
    """
    # note that debug info should be printed to stderr
    # sys.stderr.write('[process_file]\tprocessing file: %s\n' % (path))
    emails_pat = re.compile(emails_pat_str, re.IGNORECASE)
    phones_pat = re.compile(phones_pat_str, re.IGNORECASE)
    emails_pat2 = re.compile(emails_pat_str2, re.IGNORECASE)

    res = []
    for line in f:
        for m in emails_pat.findall(line):
            # Collapse dash-interleaved spellings ("-s-t-a-n-" -> "stan").
            m = tuple(re.sub(r'-?([a-z])-+', r'\1', s) for s in m)
            if m[3] != "":
                # Three domain labels, e.g. user@cs.stanford.edu.
                email = '%s@%s.%s.%s' % m
            elif m[1] == "":
                # Only the literal "cs stanford edu" fallback leaves the
                # domain groups empty, so the domain is fixed.
                email = '%s@cs.stanford.edu' % m[0]
            else:
                # Two domain labels, e.g. user@stanford.edu.
                email = '%s@%s.%s' % m[:3]
            res.append((name, 'e', email))

        for m in emails_pat2.findall(line):
            # obfuscate('domain','user') lists the domain first.
            email = '%s@%s' % m[::-1]
            res.append((name, 'e', email))

        for m in phones_pat.findall(line):
            res.append((name, 'p', '%s-%s-%s' % m))
    return res
""" | |
You should not need to edit this function, nor should you alter | |
its interface as it will be called directly by the submit script | |
""" | |
def process_dir(data_path):
    """Run process_file over every non-hidden entry in a directory.

    Args:
        data_path: Path of the directory holding the files to scan.

    Returns:
        The concatenated list of ``(filename, type, value)`` guesses from all
        files, in the order ``os.listdir`` yields them.

    You should not need to alter this interface, as it is called directly by
    the submit script.
    """
    # get candidates
    guess_list = []
    for fname in os.listdir(data_path):
        # Skip entries whose name starts with a dot.
        if fname[0] == '.':
            continue
        path = os.path.join(data_path, fname)
        # The context manager closes each file once it has been scanned,
        # rather than leaving one open handle per file.
        with open(path, 'r') as f:
            guess_list.extend(process_file(fname, f))
    return guess_list
""" | |
You should not need to edit this function. | |
Given a path to a tsv file of gold e-mails and phone numbers | |
this function returns a list of tuples of the canonical form: | |
(filename, type, value) | |
""" | |
def get_gold(gold_path):
    """Load the gold answers from a tab-separated file.

    Args:
        gold_path: Path to the gold file, one answer per line with
            tab-separated fields.

    Returns:
        A list with one tuple per line, built by stripping surrounding
        whitespace from the line and splitting it on tabs. Lines are expected
        to have the canonical form ``(filename, type, value)``.
    """
    # get gold answers
    # The context manager closes the file even if reading fails part-way.
    with open(gold_path, 'r') as f_gold:
        return [tuple(line.strip().split('\t')) for line in f_gold]
""" | |
You should not need to edit this function. | |
Given a list of guessed contacts and gold contacts, this function | |
computes the intersection and set differences, to compute the true | |
positives, false positives and false negatives. Importantly, it | |
converts all of the values to lower case before comparing | |
""" | |
def score(guess_list, gold_list):
    """Print true positives, false positives and false negatives.

    Both lists are turned into sets of ``(filename, type, value)`` tuples
    with the value lower-cased, so comparison is case-insensitive on the
    value and duplicates count once.

    Args:
        guess_list: Guessed ``(filename, type, value)`` tuples.
        gold_list: Gold ``(filename, type, value)`` tuples.

    Returns:
        None. The three sets and a one-line summary are written to stdout.
    """
    guess_set = set((fname, _type, value.lower())
                    for (fname, _type, value) in guess_list)
    gold_set = set((fname, _type, value.lower())
                   for (fname, _type, value) in gold_list)

    tp = guess_set.intersection(gold_set)
    fp = guess_set - gold_set
    fn = gold_set - guess_set

    pp = pprint.PrettyPrinter()

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('True Positives (%d): ' % len(tp))
    pp.pprint(tp)
    print('False Positives (%d): ' % len(fp))
    pp.pprint(fp)
    print('False Negatives (%d): ' % len(fn))
    pp.pprint(fn)
    print('Summary: tp=%d, fp=%d, fn=%d' % (len(tp), len(fp), len(fn)))
""" | |
You should not need to edit this function. | |
It takes in the string path to the data directory and the | |
gold file | |
""" | |
def main(data_path, gold_path):
    """Extract contacts from ``data_path`` and score them against the gold file.

    Args:
        data_path: Directory passed to process_dir.
        gold_path: Gold file passed to get_gold.
    """
    # Arguments are evaluated left to right: guesses first, then gold.
    score(process_dir(data_path), get_gold(gold_path))
""" | |
commandline interface takes a directory name and gold file. | |
It then processes each file within that directory and extracts any | |
matching e-mails or phone numbers and compares them to the gold file | |
""" | |
if __name__ == '__main__':
    # No arguments: use the default relative dev-set paths.
    # Two arguments: <data_dir> <gold_file>. Anything else prints usage.
    if len(sys.argv) == 1:
        main('../data/dev', '../data/devGOLD')
    elif len(sys.argv) == 3:
        main(sys.argv[1], sys.argv[2])
    else:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('usage:\tSpamLord.py <data_dir> <gold_file>')
        sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment