These are XPath macros used in our DSH paper on descriptions of physical appearance.
- uiterlijkN = looksN
- uiterlijkA = looksA
- persoon = person
- kleding = clothing
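As an illustration, such a macro can be expanded textually before the query is evaluated. The sketch below is hypothetical: the element names and the input filename are assumptions for illustration, not the actual annotation format used in the paper.

from lxml import etree

# Hypothetical macro table mapping the Dutch names to the English labels above.
MACROS = {
    'uiterlijkN': 'looksN',
    'uiterlijkA': 'looksA',
    'persoon': 'person',
    'kleding': 'clothing',
}

def expand(query):
    """Expand macro names in an XPath expression to their English equivalents."""
    for dutch, english in MACROS.items():
        query = query.replace(dutch, english)
    return query

tree = etree.parse('annotations.xml')  # assumed input file
# For example, count clothing descriptions, assuming <clothing> annotation elements.
print(len(tree.xpath(expand('//kleding'))))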
"""A baseline Bag-of-Words text classification. | |
Usage: python3 classify.py <train.txt> <test.txt> [--svm] [--tfidf] [--bigrams] | |
train.txt and test.txt should contain one "document" per line, | |
first token should be the label. | |
The default is to use regularized Logistic Regression and relative frequencies. | |
Pass --svm to use Linear SVM instead. | |
Pass --tfidf to use tf-idf instead of relative frequencies. | |
Pass --bigrams to use bigrams instead of unigrams. | |
""" |
"""Prepare https://benjaminvdb.github.io/110kDBRD/ for use with fastText. | |
Divide train set into 90% train and 10% dev, balance positive and negative | |
rewiews, and shuffle. Write result in fastText format.""" | |
import os | |
import re | |
import random | |
import glob | |
from syntok.tokenizer import Tokenizer |
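# A rough sketch of the steps described above; the directory layout (train/pos
# and train/neg with one review per .txt file), the output filenames, and the
# use of token.value from syntok are assumptions, not the actual script.
def readreviews(path, label):
    """Read and tokenize reviews; return lines in fastText format."""
    tok = Tokenizer()
    result = []
    for filename in glob.glob(os.path.join(path, '*.txt')):
        with open(filename, encoding='utf8') as inp:
            text = ' '.join(token.value for token in tok.tokenize(inp.read()))
        result.append('__label__%s %s' % (label, re.sub(r'\s+', ' ', text)))
    return result

def prepare(traindir='110kDBRD/train'):
    pos = readreviews(os.path.join(traindir, 'pos'), 'pos')
    neg = readreviews(os.path.join(traindir, 'neg'), 'neg')
    numreviews = min(len(pos), len(neg))  # balance positive and negative reviews
    data = pos[:numreviews] + neg[:numreviews]
    random.shuffle(data)
    split = int(0.9 * len(data))  # 90% train, 10% dev
    with open('train.txt', 'w', encoding='utf8') as out:
        out.write('\n'.join(data[:split]) + '\n')
    with open('dev.txt', 'w', encoding='utf8') as out:
        out.write('\n'.join(data[split:]) + '\n')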
"""Convert XML output of Stanford CoreNLP to CoNLL 2012 format. | |
$ ./corenlp.sh -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref \ | |
-output.printSingletonEntities true \ | |
-file /tmp/example.txt | |
$ python3 corenlpxmltoconll2012.py example.txt.xml > example.conll` | |
""" | |
import re | |
import sys | |
from lxml import etree |
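# A simplified sketch of the conversion; it assumes the usual CoreNLP XML layout
# (document/sentences/sentence/tokens/token with <word> and <POS> children) and
# fills only a handful of CoNLL 2012 columns, leaving out the parse bit, NER,
# and coreference columns that the full script would produce.
def convert(xmlfile):
    tree = etree.parse(xmlfile)
    for sentence in tree.findall('.//sentences/sentence'):
        for n, token in enumerate(sentence.findall('.//token')):
            word = token.findtext('word')
            postag = token.findtext('POS')
            # columns: document id, part number, token number, word, POS tag
            print('%s\t0\t%d\t%s\t%s' % (xmlfile, n, word, postag))
        print()  # blank line between sentences

if __name__ == '__main__':
    convert(sys.argv[1])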
<?xml version='1.0' encoding='UTF-8'?>
<volume id="W13">
<paper id="5700">
<title>Proceedings of The 13th International Conference on Parsing Technologies (IWPT 2013)</title>
<editor><first>Harry</first><last>Bunt</last></editor>
<editor><first>Khalil</first><last>Sima'an</last></editor>
<editor><first>Liang</first><last>Huang</last></editor>
<month>November</month>
<year>2013</year>
<address>Nara, Japan</address>
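Entries like the one above can be read with lxml; the following sketch (the filename is an assumption) extracts the title, editors, and year of a paper:

from lxml import etree

tree = etree.parse('W13.xml')  # assumed filename for the volume shown above
paper = tree.find('.//paper')
title = paper.findtext('title')
year = paper.findtext('year')
editors = ['%s %s' % (ed.findtext('first'), ed.findtext('last'))
        for ed in paper.findall('editor')]
print('%s (%s), edited by %s' % (title, year, ', '.join(editors)))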
"""Compute complexity metrics from Universal Dependencies. | |
Usage: python3 udstyle.py [OPTIONS] FILE... | |
--parse=LANG parse texts with Stanza; provide 2 letter language code | |
--output=FILENAME write result to a tab-separated file. | |
--persentence report per sentence results, not mean per document | |
Reported metrics: | |
- LEN: mean sentence length in words (excluding punctuation). | |
- MDD: mean dependency distance (Gibson, 1998). | |
- NDD: normalized dependency distance (Lei & Jockers, 2018). |
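# The sketch below shows how LEN, MDD, and NDD could be computed for CoNLL-U
# sentences; it is not the actual udstyle.py. MDD is taken as the mean of
# |head - dependent| over non-root, non-punctuation relations, and NDD is
# assumed to follow Lei & Jockers: NDD = |ln(MDD / sqrt(root position * LEN))|.
import math

def conllusentences(filename):
    """Yield sentences as lists of (index, head, deprel) tuples."""
    sentence = []
    with open(filename, encoding='utf8') as inp:
        for line in inp:
            if not line.strip():
                if sentence:
                    yield sentence
                sentence = []
            elif not line.startswith('#'):
                fields = line.rstrip('\n').split('\t')
                if fields[0].isdigit():  # skip multiword tokens and empty nodes
                    sentence.append((int(fields[0]), int(fields[6]), fields[7]))
    if sentence:
        yield sentence

def complexity(sentence):
    words = [tok for tok in sentence if tok[2] != 'punct']
    distances = [abs(head - idx) for idx, head, rel in words if head != 0]
    rootposition = next(idx for idx, head, rel in sentence if head == 0)
    length = len(words)
    mdd = sum(distances) / len(distances)
    ndd = abs(math.log(mdd / math.sqrt(rootposition * length)))
    return length, mdd, ndd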
"""Rename numeric entity labels in .xmi file to text of first mention. | |
Usage: python3 xmientityrename.py <FILE>... | |
Original file is modified in-place. | |
Only non-empty entities with numeric names are changed. | |
See https://github.com/nilsreiter/CorefAnnotator/issues/173""" | |
import os | |
import sys | |
from lxml import etree |
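# A very rough sketch of the renaming step. The element and attribute names used
# here ('Entity' elements with a 'label' attribute, 'Mention' elements with
# 'entity' and 'text' attributes) are placeholders; the actual CorefAnnotator
# XMI schema differs, so treat this only as an outline of the approach.
XMI_ID = '{http://www.omg.org/XMI}id'

def rename(filename):
    tree = etree.parse(filename)
    # Remember the covered text of the first mention of each entity.
    firstmention = {}
    for mention in tree.iter('Mention'):
        ref = mention.get('entity')
        if ref is not None and ref not in firstmention:
            firstmention[ref] = mention.get('text')
    for entity in tree.iter('Entity'):
        label = entity.get('label')
        # Only non-empty entities with numeric names are changed.
        if label and label.isdigit() and firstmention.get(entity.get(XMI_ID)):
            entity.set('label', firstmention[entity.get(XMI_ID)])
    tree.write(filename, xml_declaration=True, encoding='UTF-8')

if __name__ == '__main__':
    for filename in sys.argv[1:]:
        rename(filename)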
"""Preprocess movie review polarity dataset v2.0. | |
http://www.cs.cornell.edu/people/pabo/movie-review-data/ | |
""" | |
import os | |
import re | |
import glob | |
import random | |
from syntok.tokenizer import Tokenizer | |
def process(path, pattern, out): |
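    # A hedged sketch of what this function might do (the original body is not
    # shown): read the files matching pattern under path, tokenize them with
    # syntok, and write one review per line to the open file object out.
    tok = Tokenizer()
    for filename in sorted(glob.glob(os.path.join(path, pattern))):
        with open(filename, encoding='utf8') as inp:
            text = ' '.join(token.value for token in tok.tokenize(inp.read()))
        out.write(re.sub(r'\s+', ' ', text).strip() + '\n')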
"""Run with python -c 'import pyximport; pyximport.install(); import cellbench; cellbench.main()' | |
""" | |
from libc.stdint cimport uint32_t | |
from libc.math cimport sqrt, modf | |
from libc.math cimport round as c_round | |
ctypedef uint32_t Label | |
cdef inline size_t cellidx(short start, short end, short lensent, | |
Label nonterminals): |
"""Advent of Code 2017. http://adventofcode.com/2017 """ | |
import sys | |
import array | |
from collections import Counter, defaultdict | |
from operator import xor | |
from functools import reduce | |
from itertools import count | |
from binascii import hexlify | |
import numpy as np |