Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Created October 22, 2013 10:10
Show Gist options
  • Save andreasvc/7098159 to your computer and use it in GitHub Desktop.
Save andreasvc/7098159 to your computer and use it in GitHub Desktop.
Run a set of XPath queries on a corpus of parse trees and compute precision and recall with respect to a set of hand-picked sentences.
""" Run a set of XPath queries on a corpus of parse trees and compute precision
and recall with respect to a set of hand-picked sentences. """
from __future__ import print_function
import io
import os
import glob
import nltk
import alpinocorpus
# Text file with the XPath queries to evaluate, one per line.
PATTERNS = 'patterns.txt'
# Glob pattern matching the .dact corpus files to search.
CORPUS = 'disco-dop/web/corpus/*.dact'
# Macro definitions that the queries may refer to; handed to the corpus reader.
MACROS = 'disco-dop/web/static/xpathmacros.txt'
# Hand-picked sentences, one per line, of the form 'filename:sentno'.
GOLD = 'gold.txt'
# Only consider sentences numbered up to and including this value;
# None to include all sentences.
CUTOFF = 500
def _read_lines(path):
    """ Return the stripped, non-blank lines of the UTF-8 text file `path`. """
    # The context manager closes the file even when reading fails.
    with io.open(path, encoding='utf-8') as infile:
        stripped = (line.strip() for line in infile.read().splitlines())
        return [line for line in stripped if line]


def main():
    """ Run queries & print results.

    Reads the XPath queries from PATTERNS and the hand-picked sentence ids
    from GOLD, runs every query on each corpus file matched by the CORPUS
    glob, and prints one line of scores per query followed by the scores of
    the union of all matches. Sentences are identified as 'text:sentno',
    where `text` is the corpus file name without its extension. """
    patterns = _read_lines(PATTERNS)
    gold = set(_read_lines(GOLD))
    results = {pattern: set() for pattern in patterns}
    for filename in glob.glob(CORPUS):
        trees = alpinocorpus.CorpusReader(filename, macrosFilename=MACROS)
        text = os.path.split(filename)[1].rsplit('.', 1)[0]
        # Iterating the dict runs a query that is listed twice only once.
        for pattern, matches in results.items():
            for entry in trees.xpath(pattern):
                # Entry names start with the sentence number, followed by
                # a dot-separated suffix.
                sentno = entry.name().split('.')[0]
                if CUTOFF is None or int(sentno) <= CUTOFF:
                    matches.add('%s:%s' % (text, sentno))
    combined = set().union(*results.values())
    # Number the queries by their position in the patterns file; dict
    # iteration order is not guaranteed to follow it on older Pythons.
    for n, pattern in enumerate(patterns, 1):
        evalset('Pattern %d' % n, gold, results[pattern])
    print()
    evalset('Combined', gold, combined)
def evalset(name, gold, test):
    """ Print precision, recall and F-measure of `test` against `gold`.

    :param name: label printed in front of the scores.
    :param gold: set of reference items.
    :param test: set of retrieved items.

    When either set is empty, all three scores are reported as 0. """
    # NLTK returns None for precision when `test` is empty and for recall
    # when `gold` is empty (F-measure is then None as well); '%g' cannot
    # format None, so both cases are reported as zero scores instead.
    if not gold or not test:
        print('%s: Prec.=0; Recall=0; F-Measure=0' % name)
        return
    print('%s: Prec.=%g; Recall=%g; F-Measure=%g' % (
            name,
            nltk.metrics.precision(gold, test),
            nltk.metrics.recall(gold, test),
            nltk.metrics.f_measure(gold, test)))
# Run the evaluation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment