Created
October 22, 2013 10:10
-
-
Save andreasvc/7098159 to your computer and use it in GitHub Desktop.
Run a set of XPath queries on a corpus of parse trees and compute precision
and recall with respect to a set of hand-picked sentences.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Run a set of XPath queries on a corpus of parse trees and compute precision | |
and recall with respect to a set of hand-picked sentences. """ | |
from __future__ import print_function | |
import io | |
import os | |
import glob | |
import nltk | |
import alpinocorpus | |
PATTERNS = 'patterns.txt'  # a file with XPath queries, one per line
CORPUS = 'disco-dop/web/corpus/*.dact'  # glob pattern selecting the .dact corpus files
MACROS = 'disco-dop/web/static/xpathmacros.txt'  # macros used in the queries
GOLD = 'gold.txt'  # sentences, one per line, of the form 'filename:sentno'
CUTOFF = 500  # only count sentences numbered <= this value; None to include all
def _read_lines(path):
    """ Return the non-blank lines of the UTF-8 encoded text file `path`. """
    # The context manager closes the file handle deterministically instead
    # of leaving it to the garbage collector.
    with io.open(path, encoding='utf-8') as inp:
        return [line for line in inp.read().splitlines() if line.strip()]


def main():
    """ Run queries & print results.

    Reads the XPath queries from PATTERNS and the gold sentence identifiers
    from GOLD (blank lines in either file are skipped), runs every query on
    every corpus file matched by the CORPUS glob, and prints the metrics
    for each pattern followed by the metrics for the union of all patterns.

    Matches are recorded as 'corpusname:sentno', where corpusname is the
    corpus file name without its extension and sentno is the part of the
    entry name before the first '.'. When CUTOFF is not None, matches whose
    sentence number exceeds CUTOFF are ignored. """
    patterns = _read_lines(PATTERNS)
    gold = set(_read_lines(GOLD))
    results = {pattern: set() for pattern in patterns}
    for filename in glob.glob(CORPUS):
        trees = alpinocorpus.CorpusReader(filename, macrosFilename=MACROS)
        text = os.path.split(filename)[1].rsplit('.', 1)[0]
        for pattern in patterns:
            for entry in trees.xpath(pattern):
                # Parse the sentence number once; it is needed both for
                # the cutoff test and for the identifier.
                sentno = entry.name().split('.')[0]
                if CUTOFF is None or int(sentno) <= CUTOFF:
                    results[pattern].add('%s:%s' % (text, sentno))
    combined = set().union(*results.values())
    # Iterate over the list read from the file rather than over the dict:
    # dict order is not guaranteed on older interpreters, and this keeps
    # 'Pattern N' aligned with the N-th query in the patterns file.
    for n, pattern in enumerate(patterns, 1):
        evalset('Pattern %d' % n, gold, results[pattern])
    print()
    evalset('Combined', gold, combined)
def evalset(name, gold, test):
    """ Print metrics given two sets.

    :param name: label printed in front of the scores.
    :param gold: set of reference items.
    :param test: set of items to be evaluated against `gold`.

    Prints precision, recall and F-measure on a single line. When either
    set is empty all three scores are reported as 0. """
    if not gold or not test:
        # nltk returns None for precision when `test` is empty and for
        # recall when `gold` is empty (and for the F-measure in both
        # cases); formatting None with %g would raise a TypeError.
        print('%s: Prec.=0; Recall=0; F-Measure=0' % name)
        return
    print('%s: Prec.=%g; Recall=%g; F-Measure=%g' % (
            name,
            nltk.metrics.precision(gold, test),
            nltk.metrics.recall(gold, test),
            nltk.metrics.f_measure(gold, test)))
# Run the evaluation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment