Created
March 25, 2012 14:58
-
-
Save karlicoss/2196847 to your computer and use it in GitHub Desktop.
Spamlord
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// CS124 HW1 SpamLord | |
import java.util.regex.*; | |
import java.util.Collections; | |
import java.util.Arrays; | |
import java.util.List; | |
import java.util.ArrayList; | |
import java.util.Set; | |
import java.util.HashSet; | |
import java.io.*; | |
public class SpamLord { | |
/* | |
* You do not need to modify anything in the Contact class. | |
* This class encapsulates the basic information associated with | |
* an e-mail or phone number for this assignment. It has three | |
* data members: | |
* filename // the name of the file in which the contact item was found | |
* type // the type of contact information found: either "e" or "p" | |
* value // the actual string representatino of the e-mail or phone number | |
* // see assignment description for details | |
* you can ignore the other functions which are just necessary for correct | |
* behavior when used an element of a java.uitl.Set | |
*/ | |
class Contact implements Comparable<Contact>{ | |
private String fileName; | |
private String type; | |
private String value; | |
public Contact() {} | |
public Contact(String fileName,String type,String value) { | |
this.fileName = fileName; | |
this.type = type; | |
// automatically change value to lower case upon construction; | |
this.value = value.toLowerCase(); | |
} | |
public String getFileName() {return fileName;} | |
public String getType() {return type;} | |
public String getValue() {return value;} | |
@Override | |
public boolean equals(Object o) { | |
Contact c = (Contact) o; | |
return (fileName.equals(c.fileName) && type.equals(c.type) && value.equals(c.value)); | |
} | |
@Override | |
public int hashCode() { | |
return 31*fileName.hashCode() + 17*type.hashCode() + value.hashCode(); | |
} | |
public int compareTo(Contact c) { | |
int fileNameCmp = fileName.compareTo(c.fileName); | |
if (fileNameCmp != 0) { | |
return fileNameCmp; | |
} | |
int typeCmp = type.compareTo(c.type); | |
if (typeCmp != 0) { | |
return typeCmp; | |
} | |
return value.compareTo(c.value); | |
} | |
@Override public String toString() { | |
return fileName + "\t" + type + "\t" + value; | |
} | |
} | |
// Pattern pieces for extracting e-mail addresses.
// dotAlternative is a non-capturing regex group matching any of the spellings
// accepted as the "." between the parts of an e-mail domain.
private String dotAlternative = null;
{
    // Every entry is a regex fragment, not a literal: a real dot has to be
    // written "\\.". A bare "." would match any character, and the
    // replaceAll(dotAlternative + "+", ".") in processFile would then
    // collapse the whole domain into a single ".".
    String[] dotCandidates = {"\\.",
                              ";",
                              " dot ",
                              "\\(dot\\)",
                              " d-o-t ",
                              " d o t ",
                              " dom ",
                              "dt",
                              " "};
    StringBuilder alternation = new StringBuilder("(?:");
    for (int i = 0; i < dotCandidates.length; i++) {
        if (i > 0) {
            alternation.append('|');
        }
        alternation.append(dotCandidates[i]);
    }
    alternation.append(')');
    dotAlternative = alternation.toString();
}
// A single whitespace character, tolerated around the "@" and "." separators.
// TODO: exclude \n
private String whitespaceSep = "\\s";
// Case-insensitive e-mail matcher: group 1 is the local part (name),
// group 2 is the domain exactly as written in the text (still obfuscated).
private Pattern emailPattern = null;
{
    // Top-level domains accepted as the last label of the domain.
    String[] topCandidates = { "com", "gov", "edu", "mil", "tv", "info",
        "xxx", "travel", "org", "ac", "ad", "ae", "aero", "af", "ag", "ai",
        "al", "an", "ao", "aq", "ar", "arpa", "as", "asia", "at",
        "au", "aw", "ax", "az", "ba", "bb", "bd", "be", "bf", "bg",
        "bh", "bi", "biz", "bj", "bm", "bn", "bo", "br", "bs", "bt",
        "bv", "bw", "by", "bz", "ca", "cat", "cc", "cd", "cf", "cg",
        "ch", "ci", "ck", "cl", "cm", "cn", "co", "coop", "cr",
        "cu", "cv", "cw", "cx", "cy", "cz", "de", "dj", "dk", "dm",
        "do", "dz", "ec", "ee", "eg", "er", "es", "et", "eu",
        "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gb", "gd", "ge",
        "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq",
        "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr",
        "ht", "hu", "id", "ie", "il", "im", "int", "io",
        "iq", "ir", "it", "je", "jm", "jo", "jobs", "jp", "ke",
        "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz",
        "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv",
        "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml",
        "mm", "mn", "mo", "mobi", "mp", "mq", "mr", "ms", "mt", "mu",
        "museum", "mv", "mw", "mx", "mz", "na", "name", "nc",
        "ne", "net", "nf", "ng", "ni", "nl", "np", "nr", "nu",
        "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl",
        "pm", "pn", "pr", "pro", "ps", "pt", "pw", "py", "qa", "re",
        "ro", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg",
        "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st",
        "su", "sv", "sx", "sy", "sz", "tc", "td", "tel", "tf", "tg",
        "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr",
        "tt", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz",
        "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws",
        "ye", "yt", "za", "zm", "zw" };
    // Spellings accepted as the "@" sign (regex fragments).
    String[] atCandidates = {"@",
                             " at ",
                             "\\(at\\)",
                             " a-t ",
                             " a t ",
                             " where "};
    String atAlternative = joinAsAlternation(atCandidates);
    String tlDomain = joinAsAlternation(topCandidates);

    String domain = "\\w{2,}";
    String name = "\\w+(?:\\.\\w+)?";
    String contactName = "(" + name + ")";
    // One to three "label + dot" pairs followed by a top-level domain.
    // The closing (?!\\w) only asserts that no word character follows; unlike
    // a consuming [^\\w] it also succeeds at the very end of the line, so an
    // address that ends the line is still found.
    String domainName = "("
            + "(?:" + domain
            + "(?:" + dotAlternative + "|" + whitespaceSep + dotAlternative + ")"
            + "){1,3}"
            + tlDomain + ")" + "(?!\\w)";
    emailPattern = Pattern.compile("(?i)" + contactName
            + whitespaceSep + "?" + atAlternative + whitespaceSep + "?"
            + domainName);
}

// Joins regex fragments into one non-capturing alternation: (?:a|b|c).
private static String joinAsAlternation(String[] fragments) {
    StringBuilder alternation = new StringBuilder("(?:");
    for (int i = 0; i < fragments.length; i++) {
        if (i > 0) {
            alternation.append('|');
        }
        alternation.append(fragments[i]);
    }
    return alternation.append(')').toString();
}
// Regex fragments for the separators allowed between the digit groups of a
// phone number. "\\)" lets the closing parenthesis of "(650)723-0293" act as
// the first separator.
private String[] delimCandidates = {" ",
                                    "-",
                                    "\\)"};
// Non-capturing alternation built from delimCandidates.
String delimAlternative = null;
// Phone matcher: group 1 is the whole number as written, groups 2, 3 and 4
// are the 3-digit area code, the 3-digit exchange and the 4-digit line number.
private Pattern telPattern = null;
{
    StringBuilder alternation = new StringBuilder("(?:");
    for (int i = 0; i < delimCandidates.length; i++) {
        if (i > 0) {
            alternation.append('|');
        }
        alternation.append(delimCandidates[i]);
    }
    alternation.append(')');
    delimAlternative = alternation.toString();

    // The lookbehind/lookahead require that no digit touches the number on
    // either side. An optional [^\\d]? cannot enforce that: being optional, it
    // still lets 10 digits taken from the middle of a longer digit run match.
    String prefix = "(?<!\\d)(\\(?(\\d{3})\\)?";
    String suffix1 = "(\\d{3})";
    String suffix2 = "(\\d{4}))";
    String suffix3 = "(?!\\d)";
    telPattern = Pattern.compile(prefix + delimAlternative + suffix1 + delimAlternative + suffix2 + suffix3);
}
/* | |
* TODO | |
* This should return a list of Contact objects found in the input. | |
* You can change anything internal to this function but make sure you | |
* leave the interface (arguments and return value) unchanged because | |
* it will be directly called by the submission script. | |
*/ | |
public List<Contact> processFile(String fileName, BufferedReader input) { | |
//System.err.println(fileName); | |
List<Contact> contacts = new ArrayList<Contact>(); | |
// for each line | |
Matcher m; | |
try { | |
for (String line = input.readLine(); line != null; line = input | |
.readLine()) { | |
m = emailPattern.matcher(line); | |
while (m.find()) { | |
String name = m.group(1); | |
String suffix = m.group(2); | |
suffix = suffix.replaceAll(dotAlternative + "+", "."); | |
suffix = suffix.replaceAll(whitespaceSep, ""); | |
//post-processing block | |
suffix = suffix.toLowerCase(); | |
name = name.toLowerCase(); | |
{ | |
if (name.contains("server")) { | |
continue; | |
} | |
Contact contact = new Contact(fileName, "e", name + "@" + suffix); | |
contacts.add(contact); | |
} | |
} | |
m = telPattern.matcher(line); | |
while (m.find()) { | |
String full = m.group(1); | |
System.out.println(full); | |
boolean containsAny = false; | |
for (String s: delimCandidates) { | |
containsAny |= full.contains(s); | |
} | |
if (!containsAny) { | |
continue; | |
} | |
String prefix = m.group(2); | |
String suffix1 = m.group(3); | |
String suffix2 = m.group(4); | |
prefix = prefix.replaceAll("[\\)\\(]", ""); | |
prefix = prefix.replaceAll(delimAlternative, ""); | |
suffix1 = suffix1.replaceAll(delimAlternative, ""); | |
suffix2 = suffix2.replaceAll(delimAlternative, ""); | |
Contact contact = new Contact(fileName, "p", prefix + "-" + | |
suffix1 + "-" + suffix2); | |
contacts.add(contact); | |
} | |
} | |
input.close(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
System.exit(1); | |
} | |
return contacts; | |
} | |
/* | |
* You should not need to edit this, nor should you alter it's interface | |
* because it will also be called direclty by the submission program | |
*/ | |
public List<Contact> processDir(String dirName) { | |
List<Contact> contacts = new ArrayList<Contact>(); | |
for(File f: new File(dirName).listFiles()) { | |
if (f.getName().startsWith(".")) | |
continue; | |
try { | |
BufferedReader input = new BufferedReader(new FileReader(f)); | |
contacts.addAll(processFile(f.getName(), input)); | |
} catch(IOException e) { | |
e.printStackTrace(); | |
System.exit(1); | |
} | |
} | |
return contacts; | |
} | |
/* | |
* You should not need to edit this function | |
* It simply reads in a tsv gold file and returns a list of | |
* Contacts | |
*/ | |
private List<Contact> loadGold(String goldPath) { | |
List<Contact> gold = new ArrayList<Contact>(); | |
try { | |
BufferedReader input = new BufferedReader(new FileReader(goldPath)); | |
String[] toks; | |
for(String line = input.readLine(); line != null; line = input.readLine()) { | |
toks = line.split("\t"); | |
Contact contact = new Contact(toks[0],toks[1],toks[2]); | |
gold.add(contact); | |
} | |
input.close(); | |
} catch(IOException e) { | |
e.printStackTrace(); | |
System.exit(1); | |
} | |
return gold; | |
} | |
/* | |
* You should not need to edit this. | |
* This is just a utility function which turns a Set into | |
* a sorted list for convenience when looking at the output. | |
*/ | |
private List<Contact> asSortedList(Set<Contact> set) { | |
Contact[] c = new Contact[0]; | |
List<Contact> list = Arrays.asList(set.toArray(c)); | |
Collections.sort(list); | |
return list; | |
} | |
/* | |
* You should not need to edit this. | |
* This takes in two Lists of Contacts and prints out the intersection | |
* and differences, which can be thought of as true positives, false | |
* positives and false negatives. | |
*/ | |
private void score(List<Contact> guesses, List<Contact> gold) { | |
Set<Contact> guess_set = new HashSet<Contact>(); | |
guess_set.addAll(guesses); | |
Set<Contact> gold_set = new HashSet<Contact>(); | |
gold_set.addAll(gold); | |
Set<Contact> tp = new HashSet<Contact>(guess_set); | |
System.out.println("guess_set.size()="+guess_set.size()+"\tgold_set.size()="+gold_set.size()); | |
tp.retainAll(gold_set); | |
List<Contact> tp_list = asSortedList(tp); | |
System.out.println("True Positives (" + tp_list.size() +")\t###############################"); | |
for (Contact contact : tp_list) { | |
System.out.println(contact); | |
} | |
Set<Contact> fp = new HashSet<Contact>(guess_set); | |
fp.removeAll(gold_set); | |
List<Contact> fp_list = asSortedList(fp); | |
System.out.println("False Positives (" + fp_list.size() +")\t###############################"); | |
for (Contact contact : fp_list) { | |
System.out.println(contact); | |
} | |
Set<Contact> fn = new HashSet<Contact>(gold_set); | |
fn.removeAll(guess_set); | |
List<Contact> fn_list = asSortedList(fn); | |
System.out.println("False Negatives (" + fn_list.size() +")\t###############################"); | |
for (Contact contact : fn_list) { | |
System.out.println(contact); | |
} | |
System.out.println("Summary: tp=" + tp.size() + "\tfp=" + fp.size() + "\tfn=" + fn.size()); | |
} | |
/* | |
* main takes a directory and a file with the Gold contacts. | |
* it processes each file in the directory, extracting any contacts | |
* and compares them to the contacts listed in the gold file | |
*/ | |
public static void main(String[] args) { | |
if (args.length != 2) { | |
System.err.println("usage:\tSpamLord <data_dir> <gold_file>"); | |
System.exit(0); | |
} | |
SpamLord vader = new SpamLord(); | |
List<Contact> guesses = vader.processDir(args[0]); | |
List<Contact> gold = vader.loadGold(args[1]); | |
vader.score(guesses,gold); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment