Last active
January 11, 2018 12:50
-
-
Save dustalov/d9830eee6a9350ffac0559abc3a02b1f to your computer and use it in GitHub Desktop.
Extracting and cross-validating version 1.0 of the WCL dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f | |
# Strip annotation markup from colon-separated, tab-tokenised input lines and
# print one plain-text line per input record.
#
# Variables:
#   DEFINIENDUM  when non-empty, every output line is prefixed with the first
#                field ($1) followed by a tab.
BEGIN {
    # Records are split on ':'; $1 is substituted for the TARGET placeholder.
    FS = ":";
}

# Lines starting with '#' produce no output.
/^#/ {
    next;
}

{
    # Drop any leading run of '!' / '$' markers.
    sub(/^[!$]+/, "");
    # Remove opening and closing annotation tags.  '\/?' is used instead of an
    # empty alternative '(|\/)', whose behaviour POSIX leaves undefined.
    gsub(/<\/?(VERB|GENUS|HYPER|LST|PRT|RGET|REST)>/, "");

    if (length(DEFINIENDUM)) printf "%s\t", $1;

    for (i = 2; i <= NF; i++) {
        # Fields after the first are re-joined with " : ".
        if (i > 2) printf " : ";

        # Split once per field rather than on every loop-condition check.
        n = split($i, words, "\t");
        for (j = 1; j <= n; j++) {
            # Keep only what follows the last underscore of the token.
            sub(/^.+_/, "", words[j]);
            if (words[j] == "") continue;
            if (words[j] == "TARGET") words[j] = $1;
            # The separator depends on the token position, not on whether an
            # earlier token was printed, so a skipped first token still leaves
            # a leading space (kept for output compatibility).
            if (j > 1) printf " ";
            printf "%s", words[j];
        }
    }

    print "";
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f | |
# Select two-line records (a comment line followed by a sentence line).
#
# Variables:
#   GOOD  when non-empty, keep records whose sentence line starts with any
#         character other than '!'.
#   BAD   when non-empty, keep records whose sentence line starts with '!'.

# Odd-numbered lines are remembered and emitted only with a kept sentence.
NR % 2 {
    header = $0;
    next;
}

{
    keep_positive = ($0 ~ /^[^!]/) && (length(GOOD) > 0);
    keep_negative = ($0 ~ /^!/) && (length(BAD) > 0);

    if (keep_positive || keep_negative) {
        print header;
        print $0;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from collections import namedtuple | |
# A record with three positional fields: the comment line, the token line,
# and a boolean flag named ``good``.
Sentence = namedtuple('Sentence', ['comment', 'tokens', 'good'])
def read_sentences(f):
    """Parse alternating comment/token lines from *f* into Sentence records.

    Lines are consumed in pairs: the first line of each pair is stored as
    the comment and the second as the tokens, both exactly as read.  A
    record is flagged ``good`` unless its token line starts with ``'!'``.
    A trailing line without a partner is dropped.

    Returns a list of Sentence tuples in input order.
    """
    lines = iter(f)
    # Zipping one iterator with itself yields consecutive line pairs.
    return [
        Sentence(comment, tokens, not tokens.startswith('!'))
        for comment, tokens in zip(lines, lines)
    ]
def write_fold(filename, sentences, indices):
    """Write the sentences selected by *indices* to *filename*.

    For every index, in the order given, the sentence's comment is written
    followed by its tokens, both verbatim.  The file is created as UTF-8
    with ``newline=''`` so line endings are passed through untranslated.
    """
    with open(filename, 'w', encoding='UTF-8', newline='') as out:
        out.writelines(
            part
            for index in indices
            for part in (sentences[index].comment, sentences[index].tokens)
        )
if __name__ == '__main__':
    # Split a WCL file into k shuffled cross-validation folds and write each
    # fold's training and test parts to '<prefix><n>-train.pos' and
    # '<prefix><n>.test', with n counted from 1.
    import argparse

    from sklearn.model_selection import KFold

    parser = argparse.ArgumentParser()
    parser.add_argument('-k', type=int, default=10,
                        help='number of folds')
    parser.add_argument('--seed', type=int, default=None,
                        help='random state used for shuffling')
    parser.add_argument('--prefix', default='wcl-kfold-',
                        help='prefix of the output file names')
    parser.add_argument('wcl', type=argparse.FileType('r', encoding='UTF-8'),
                        help='input file of alternating comment/sentence lines')
    args = parser.parse_args()

    # argparse.FileType opens the file but never closes it, so release the
    # handle explicitly once everything has been read.
    with args.wcl:
        sentences = read_sentences(args.wcl)

    kf = KFold(n_splits=args.k, shuffle=True, random_state=args.seed)

    for fold, (train_index, test_index) in enumerate(kf.split(sentences), start=1):
        write_fold('%s%d-train.pos' % (args.prefix, fold), sentences, train_index)
        write_fold('%s%d.test' % (args.prefix, fold), sentences, test_index)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pipeline for filtering, splitting and evaluating the dataset files.
#
# NOTE(review): as a plain make variable LC_COLLATE only reaches the recipes
# if it is already present in the environment; use `export` if the recipes
# are meant to depend on it -- confirm intent.
LC_COLLATE = C
SEED = 1337
# Number of cross-validation folds; passed to the scripts and used by the loops.
K = 10
WCL_WRAPPER = /srv/definitions/wcl-extract

# These targets name actions, not files.
.PHONY: measure kfold eval extract clean

measure:
	./measure.py -k $(K)

kfold: wiki_really_all.txt
	./kfold.py -k $(K) --seed=$(SEED) $<

# NOTE(review): kfold.py writes the training part as wcl-kfold-N-train.pos,
# while -t below is given the name without the .pos suffix; presumably the
# wrapper appends it -- confirm.
eval:
	for i in $$(seq $(K)); do (cd $(WCL_WRAPPER) && java -jar target/wcl-wrapper.jar -l en -t $(CURDIR)/wcl-kfold-$$i-train) < $(CURDIR)/wcl-kfold-$$i-test.tsv > $(CURDIR)/wcl-kfold-$$i-test-eval.tsv 2>/dev/null; done

extract:
	for i in $$(seq $(K)); do ./extract.awk -vDEFINIENDUM=1 wcl-kfold-$$i.test > wcl-kfold-$$i-test.tsv; done

wiki_really_all.txt: wiki_really_good.txt wiki_really_bad.txt
	cat wiki_really_good.txt wiki_really_bad.txt > $@

wiki_really_good.txt: wiki_all.txt
	./filter.awk -vGOOD=1 wiki_all.txt > $@

wiki_really_bad.txt: wiki_all.txt
	./filter.awk -vBAD=1 wiki_all.txt > $@

# The two inputs are listed explicitly: brace expansion is not available in
# every /bin/sh, which is the shell make uses for recipes by default.
wiki_all.txt:
	cat wiki_good.txt wiki_bad.txt > $@

clean:
	rm -fv wiki_really*.txt *.train *.test *.tsv *.pos
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import numpy as np | |
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score | |
from kfold import read_sentences | |
# Compare the predictions in '<prefix><n>-test-eval.tsv' with the gold labels
# in '<prefix><n>.test' for every fold n = 1..k, then print the mean and
# standard deviation of precision, recall, F1 and accuracy across folds.
parser = argparse.ArgumentParser()
parser.add_argument('-k', type=int, default=10)
parser.add_argument('--prefix', default='wcl-kfold-')
args = parser.parse_args()

trues, preds = [], []

for fold in range(1, args.k + 1):
    test_filename = '%s%d.test' % (args.prefix, fold)
    with open(test_filename, encoding='UTF-8') as f:
        gold = [sentence.good for sentence in read_sentences(f)]

    eval_filename = '%s%d-test-eval.tsv' % (args.prefix, fold)
    predicted = []
    with open(eval_filename, encoding='UTF-8') as f:
        for line in f:
            # Everything after the first tab is the label; it counts as a
            # positive prediction when it equals 'true' case-insensitively.
            _, label = line.rstrip().lower().split('\t', 1)
            predicted.append(label == 'true')

    # Fail with a clear message instead of scoring against missing
    # predictions or running past the end of the gold labels.
    if len(predicted) != len(gold):
        raise ValueError('%s has %d predictions, but %s has %d sentences' % (
            eval_filename, len(predicted), test_filename, len(gold)))

    trues.append(gold)
    preds.append(predicted)

metrics = (
    ('Pr', precision_score),
    ('Re', recall_score),
    ('F1', f1_score),
    ('Ac', accuracy_score),
)

for name, metric in metrics:
    scores = [metric(gold, predicted) for gold, predicted in zip(trues, preds)]
    print('%s=%.4f±%.4f' % (name, np.mean(scores), np.std(scores)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment