Text-iq
EmailParsingFiniteStateMachine.java
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.HashSet;

/**
 * Labels each line of an email with a short tag (Date, Subject, From, To,
 * Cc, Content, or Unknown) by running a small finite state machine over the
 * header prefixes.
 */
public class EmailParsingFiniteStateMachine {
  private String[] lines;
  private HashMap<Integer, String> labelMapping;
  private HashSet<State> metaStatesSeen;
  private boolean hasUnknown;

  private enum State {
    DATE("D"),
    SUBJECT("S"),
    FROM("F"),
    TO("T"),
    CC("CC"),
    CONTENT("C"),
    UNKNOWN("U");

    private final String label;

    private State(String label) {
      this.label = label;
    }

    @Override
    public String toString() {
      return label;
    }
  }

  public EmailParsingFiniteStateMachine(String document) {
    lines = document.split("\n");
    labelMapping = new HashMap<Integer, String>();
    metaStatesSeen = new HashSet<State>();
    hasUnknown = false;
    parseLines();
  }

  public String getLabelForLineNum(int lineNum) {
    return labelMapping.get(lineNum);
  }

  public int getNumOfLines() {
    return lines.length;
  }

  public String getLine(int lineNum) {
    return lines[lineNum];
  }

  public boolean containsUnknown() {
    return hasUnknown;
  }

  private void parseLines() {
    State state = State.UNKNOWN;
    for (int i = 0; i < lines.length; i++) {
      String line = lines[i];
      state = getLineState(line, state);
      if (state == State.UNKNOWN) {
        hasUnknown = true;
      }
      labelMapping.put(i, state.toString());
    }
  }

  // Transition function: a recognized header prefix forces its state; a blank
  // line after a complete header block (From, To, and Subject all seen) opens
  // the content; To, Cc, and Content absorb continuation lines; anything else
  // resets to Unknown.
  private State getLineState(String line, State curState) {
    String trimmedLine = line.trim();
    if (trimmedLine.startsWith("Subject: ")) {
      metaStatesSeen.add(State.SUBJECT);
      return State.SUBJECT;
    } else if (trimmedLine.startsWith("Date: ")) {
      metaStatesSeen.add(State.DATE);
      return State.DATE;
    } else if (trimmedLine.startsWith("From: ")) {
      metaStatesSeen.add(State.FROM);
      return State.FROM;
    } else if (trimmedLine.startsWith("To: ")) {
      metaStatesSeen.add(State.TO);
      return State.TO;
    } else if (trimmedLine.startsWith("Cc: ")) {
      metaStatesSeen.add(State.CC);
      return State.CC;
    } else if (trimmedLine.equals("") && curState != State.UNKNOWN &&
               isMetaComplete()) {
      metaStatesSeen.clear();
      return State.CONTENT;
    } else if (curState == State.CC || curState == State.TO ||
               curState == State.CONTENT) {
      return curState;
    }
    metaStatesSeen.clear();
    return State.UNKNOWN;
  }

  private boolean isMetaComplete() {
    return (metaStatesSeen.contains(State.FROM) &&
            metaStatesSeen.contains(State.TO) &&
            metaStatesSeen.contains(State.SUBJECT));
  }

  // Usage: java EmailParsingFiniteStateMachine <inputDir> <outputDir>
  public static void main(String[] args) {
    File inputDir = new File(args[0]);
    File outputDir = new File(args[1]);
    File[] inputFiles = inputDir.listFiles();
    if (!outputDir.exists()) {
      outputDir.mkdir();
    }
    for (File inputFile : inputFiles) {
      String inputText = null;
      try {
        inputText = new String(Files.readAllBytes(inputFile.toPath()));
      } catch (IOException e) {
        e.printStackTrace();
        continue;  // skip unreadable files instead of crashing on null text
      }
      EmailParsingFiniteStateMachine fsm =
          new EmailParsingFiniteStateMachine(inputText);
      StringBuilder output = new StringBuilder();
      for (int i = 0; i < fsm.getNumOfLines(); i++) {
        // Files with any unrecognized line become test data, so every label
        // in them is masked as U.
        if (fsm.containsUnknown()) {
          output.append("U:" + fsm.getLine(i) + "\n");
        } else {
          output.append(
              fsm.getLabelForLineNum(i) + ":" + fsm.getLine(i) + "\n");
        }
      }
      String outputName = (fsm.containsUnknown() ? "TEST_" : "TRAIN_") +
          inputFile.getName();
      File outputFile = new File(outputDir, outputName);
      try {
        // TRUNCATE_EXISTING so a rerun does not leave stale bytes behind.
        Files.write(outputFile.toPath(), output.toString().getBytes(),
            StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
      } catch (IOException e) {
        e.printStackTrace();
      }
      System.out.println("Parsed file " + inputFile.getName());
    }
  }
}
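Each emitted line pairs a label with the original text, colon-separated, and any file containing an Unknown line is masked and written with a TEST_ prefix instead of TRAIN_. For illustration, a hypothetical TRAIN_ output (all header values invented) would look like:

D:Date: Wed, 25 Nov 2015 10:04:00 -0800
F:From: alice@example.com
T:To: bob@example.com
S:Subject: Quarterly report
C:
C:Hi Bob,
C:The draft numbers are attached.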
CRF line tagger (Python)
import sys

import pycrfsuite
# train_test_split moved here from sklearn.cross_validation in scikit-learn 0.18.
from sklearn.model_selection import train_test_split

from utils import get_train_raw, add_noise, to_bow


def preprocess(train_raw, train_meta):
    """Regroup the flat (text, label) list into one sequence per document,
    using the recorded start offsets as split points."""
    text_train = [x[0] for x in train_raw]
    label_train = [x[1] for x in train_raw]
    seq_train_X = []
    seq_train_y = []
    splits = train_meta + [len(train_raw)]
    for i in range(len(splits) - 1):
        seq_train_X.append(text_train[splits[i]:splits[i + 1]])
        seq_train_y.append(label_train[splits[i]:splits[i + 1]])
    return seq_train_X, seq_train_y


def evaluate(tagger, valid_X, valid_y, valid_raw):
    """Per-line tagging accuracy; prints each misclassified line along with
    the predicted and true labels for inspection."""
    corr_cnt = 0
    total_cnt = 0
    for seqx, seqy, seq_raw in zip(valid_X, valid_y, valid_raw):
        pred = tagger.tag(seqx)
        for i, (y_p, y_t) in enumerate(zip(pred, seqy)):
            total_cnt += 1
            if y_p == y_t:
                corr_cnt += 1
            else:
                print(seq_raw[i], y_p, y_t)
    return float(corr_cnt) / total_cnt


def main(args):
    dir_path = args[0]
    train_raw, train_meta = get_train_raw(dir_path)
    seq_train_X, seq_train_y = preprocess(train_raw, train_meta)
    train_X, valid_X, train_y, valid_y = train_test_split(
        seq_train_X, seq_train_y, test_size=0.3)
    # Corrupt the held-out lines in place to check robustness to noisy input.
    add_noise(valid_X)
    bow_train_X = [pycrfsuite.ItemSequence([to_bow(line) for line in doc])
                   for doc in train_X]
    bow_valid_X = [pycrfsuite.ItemSequence([to_bow(line) for line in doc])
                   for doc in valid_X]
    trainer = pycrfsuite.Trainer()
    for seqx, seqy in zip(bow_train_X, train_y):
        trainer.append(seqx, seqy)
    trainer.train('line_tagger')
    tagger = pycrfsuite.Tagger()
    tagger.open('line_tagger')
    print(evaluate(tagger, bow_valid_X, valid_y, valid_X))


if __name__ == '__main__':
    main(sys.argv[1:])
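Once the line_tagger model file is on disk, the same model can be applied to the unlabeled TEST_ documents. A minimal sketch, assuming the get_test helper from the utils module at the end of this gist and a parsed_dir directory of labeled output files (both names illustrative):

import pycrfsuite
from utils import get_test

test, test_meta = get_test('parsed_dir')
tagger = pycrfsuite.Tagger()
tagger.open('line_tagger')

# Rebuild per-document sequences from the flat feature list, then tag each.
splits = test_meta + [len(test)]
for i in range(len(test_meta)):
    doc = test[splits[i]:splits[i + 1]]
    print(tagger.tag(pycrfsuite.ItemSequence(doc)))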
Logistic regression baseline (Python)
import sys

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV

from utils import get_train


def main(args):
    dir_path = args[0]
    train, train_meta = get_train(dir_path)
    vec = DictVectorizer()
    train_y = [x[1] for x in train]
    train_X = vec.fit_transform([x[0] for x in train])
    # Cross-validate the regularization strength C over a log-spaced grid.
    clf = LogisticRegressionCV(Cs=[10 ** i for i in range(-3, 4)])
    clf.fit(train_X, train_y)
    print(clf.scores_)


if __name__ == '__main__':
    main(sys.argv[1:])
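clf.scores_ maps each class label to a grid of cross-validation scores of shape (n_folds, n_Cs), so averaging over folds gives one accuracy per candidate C. A sketch of what could be appended to main after clf.fit, with an invented sample line:

import numpy as np
from utils import to_bow

# Mean cross-validated accuracy for each candidate C, per class.
for label, grid in clf.scores_.items():
    print(label, np.mean(grid, axis=0))

# Classify a new line with the fitted vectorizer and model.
x = vec.transform([to_bow('Subject: lunch tomorrow?')])
print(clf.predict(x))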
utils.py
import os
import random
import string
from collections import Counter

import numpy as np
from nltk.tokenize import word_tokenize


def get_train(dir_path):
    """Load every TRAIN_ file as (bag-of-words, label) pairs, plus the start
    index of each document so callers can recover document boundaries."""
    train = []
    files = os.listdir(dir_path)
    start = 0
    train_meta = []
    for fname in files:
        if fname.startswith('TRAIN_'):
            with open(os.path.join(dir_path, fname), 'r') as f:
                lines = f.readlines()
            for line in lines:
                label, text = line.split(':', 1)
                train.append((to_bow(text), label))
            train_meta.append(start)
            start = len(train)
    return train, train_meta


def get_test(dir_path):
    """Like get_train, but for TEST_ files, whose labels are all U masks and
    are therefore dropped."""
    test = []
    files = os.listdir(dir_path)
    start = 0
    test_meta = []
    for fname in files:
        if fname.startswith('TEST_'):
            with open(os.path.join(dir_path, fname), 'r') as f:
                lines = f.readlines()
            for line in lines:
                _, text = line.split(':', 1)
                test.append(to_bow(text))
            test_meta.append(start)
            start = len(test)
    return test, test_meta


def to_bow(text):
    """Lowercased token counts; needs NLTK's 'punkt' tokenizer data."""
    words = [word.lower() for word in word_tokenize(text)]
    return Counter(words)


def get_train_raw(dir_path):
    """Like get_train, but keeps the raw line text instead of its bag of
    words, for feature extraction further downstream."""
    train = []
    files = os.listdir(dir_path)
    start = 0
    train_meta = []
    for fname in files:
        if fname.startswith('TRAIN_'):
            with open(os.path.join(dir_path, fname), 'r') as f:
                lines = f.readlines()
            for line in lines:
                label, text = line.split(':', 1)
                train.append((text, label))
            train_meta.append(start)
            start = len(train)
    return train, train_meta


def get_random_string(length):
    return ''.join(random.choice(string.ascii_uppercase + string.digits)
                   for _ in range(length))


def add_noise(X):
    """Corrupt each sequence in place: pick len(seq) // 2 random lines and
    replace one character of each with a random 3-8 character string."""
    for seqx in X:
        seq_len = len(seqx)
        for _ in range(seq_len // 2):  # integer division for Python 3
            index = np.random.randint(seq_len)
            line = seqx[index]
            position = np.random.randint(len(line))
            length = np.random.randint(3, 9)
            seqx[index] = (line[:position] + get_random_string(length) +
                           line[position + 1:])
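To make the helper semantics concrete, a quick sketch with invented sample strings (the exact tokens depend on the NLTK tokenizer):

from utils import to_bow, add_noise

print(to_bow('From: Alice <alice@example.com>'))
# A Counter of lowercased tokens such as 'from' and 'alice'.

doc = ['From: alice@example.com\n', 'Hi Bob,\n', 'See you tomorrow.\n']
add_noise([doc])  # mutates the list of lines in place
print(doc)        # one line now carries a random spliced-in string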