These are XPath macros used in our DSH paper on descriptions of physical appearance.
- uiterlijkN = looksN
- uiterlijkA = looksA
- persoon = person
- kleding = clothing
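As an illustration, such a macro can be expanded textually before the query is evaluated. The sketch below is hypothetical: the element names and the input filename are assumptions for illustration, not the actual annotation format used in the paper.

from lxml import etree

# Hypothetical macro table mapping the Dutch names to the English labels above.
MACROS = {
    'uiterlijkN': 'looksN',
    'uiterlijkA': 'looksA',
    'persoon': 'person',
    'kleding': 'clothing',
}

def expand(query):
    """Expand macro names in an XPath expression to their English equivalents."""
    for dutch, english in MACROS.items():
        query = query.replace(dutch, english)
    return query

tree = etree.parse('annotations.xml')  # assumed input file
# For example, count clothing descriptions, assuming <clothing> annotation elements.
print(len(tree.xpath(expand('//kleding'))))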
"""A baseline Bag-of-Words text classification. | |
Usage: python3 classify.py <train.txt> <test.txt> [--svm] [--tfidf] [--bigrams] | |
train.txt and test.txt should contain one "document" per line, | |
first token should be the label. | |
The default is to use regularized Logistic Regression and relative frequencies. | |
Pass --svm to use Linear SVM instead. | |
Pass --tfidf to use tf-idf instead of relative frequencies. | |
Pass --bigrams to use bigrams instead of unigrams. | |
""" |
"""Prepare https://benjaminvdb.github.io/110kDBRD/ for use with fastText. | |
Divide train set into 90% train and 10% dev, balance positive and negative | |
rewiews, and shuffle. Write result in fastText format.""" | |
import os | |
import re | |
import random | |
import glob | |
from syntok.tokenizer import Tokenizer |
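# A rough sketch of the steps described above; the directory layout (train/pos
# and train/neg with one review per .txt file), the output filenames, and the
# use of token.value from syntok are assumptions, not the actual script.
def readreviews(path, label):
    """Read and tokenize reviews; return lines in fastText format."""
    tok = Tokenizer()
    result = []
    for filename in glob.glob(os.path.join(path, '*.txt')):
        with open(filename, encoding='utf8') as inp:
            text = ' '.join(token.value for token in tok.tokenize(inp.read()))
        result.append('__label__%s %s' % (label, re.sub(r'\s+', ' ', text)))
    return result

def prepare(traindir='110kDBRD/train'):
    pos = readreviews(os.path.join(traindir, 'pos'), 'pos')
    neg = readreviews(os.path.join(traindir, 'neg'), 'neg')
    numreviews = min(len(pos), len(neg))  # balance positive and negative reviews
    data = pos[:numreviews] + neg[:numreviews]
    random.shuffle(data)
    split = int(0.9 * len(data))  # 90% train, 10% dev
    with open('train.txt', 'w', encoding='utf8') as out:
        out.write('\n'.join(data[:split]) + '\n')
    with open('dev.txt', 'w', encoding='utf8') as out:
        out.write('\n'.join(data[split:]) + '\n')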
"""Convert XML output of Stanford CoreNLP to CoNLL 2012 format. | |
$ ./corenlp.sh -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref \ | |
-output.printSingletonEntities true \ | |
-file /tmp/example.txt | |
$ python3 corenlpxmltoconll2012.py example.txt.xml > example.conll` | |
""" | |
import re | |
import sys | |
from lxml import etree |
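# A simplified sketch of the conversion; it assumes the usual CoreNLP XML layout
# (document/sentences/sentence/tokens/token with <word> and <POS> children) and
# fills only a handful of CoNLL 2012 columns, leaving out the parse bit, NER,
# and coreference columns that the full script would produce.
def convert(xmlfile):
    tree = etree.parse(xmlfile)
    for sentence in tree.findall('.//sentences/sentence'):
        for n, token in enumerate(sentence.findall('.//token')):
            word = token.findtext('word')
            postag = token.findtext('POS')
            # columns: document id, part number, token number, word, POS tag
            print('%s\t0\t%d\t%s\t%s' % (xmlfile, n, word, postag))
        print()  # blank line between sentences

if __name__ == '__main__':
    convert(sys.argv[1])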
<?xml version='1.0' encoding='UTF-8'?>
<volume id="W13">
<paper id="5700">
<title>Proceedings of The 13th International Conference on Parsing Technologies (IWPT 2013)</title>
<editor><first>Harry</first><last>Bunt</last></editor>
<editor><first>Khalil</first><last>Sima'an</last></editor>
<editor><first>Liang</first><last>Huang</last></editor>
<month>November</month>
<year>2013</year>
<address>Nara, Japan</address>
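Entries like the one above can be read with lxml; the following sketch (the filename is an assumption) extracts the title, editors, and year of a paper:

from lxml import etree

tree = etree.parse('W13.xml')  # assumed filename for the volume shown above
paper = tree.find('.//paper')
title = paper.findtext('title')
year = paper.findtext('year')
editors = ['%s %s' % (ed.findtext('first'), ed.findtext('last'))
        for ed in paper.findall('editor')]
print('%s (%s), edited by %s' % (title, year, ', '.join(editors)))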
"""Compute complexity metrics from Universal Dependencies. | |
Usage: python3 udstyle.py [OPTIONS] FILE... | |
--parse=LANG parse texts with Stanza; provide 2 letter language code | |
--output=FILENAME write result to a tab-separated file. | |
--persentence report per sentence results, not mean per document | |
Reported metrics: | |
- LEN: mean sentence length in words (excluding punctuation). | |
- MDD: mean dependency distance (Gibson, 1998). | |
- NDD: normalized dependency distance (Lei & Jockers, 2018). |
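# The sketch below shows how LEN, MDD, and NDD could be computed for CoNLL-U
# sentences; it is not the actual udstyle.py. MDD is taken as the mean of
# |head - dependent| over non-root, non-punctuation relations, and NDD is
# assumed to follow Lei & Jockers: NDD = |ln(MDD / sqrt(root position * LEN))|.
import math

def conllusentences(filename):
    """Yield sentences as lists of (index, head, deprel) tuples."""
    sentence = []
    with open(filename, encoding='utf8') as inp:
        for line in inp:
            if not line.strip():
                if sentence:
                    yield sentence
                sentence = []
            elif not line.startswith('#'):
                fields = line.rstrip('\n').split('\t')
                if fields[0].isdigit():  # skip multiword tokens and empty nodes
                    sentence.append((int(fields[0]), int(fields[6]), fields[7]))
    if sentence:
        yield sentence

def complexity(sentence):
    words = [tok for tok in sentence if tok[2] != 'punct']
    distances = [abs(head - idx) for idx, head, rel in words if head != 0]
    rootposition = next(idx for idx, head, rel in sentence if head == 0)
    length = len(words)
    mdd = sum(distances) / len(distances)
    ndd = abs(math.log(mdd / math.sqrt(rootposition * length)))
    return length, mdd, ndd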
"""Rename numeric entity labels in .xmi file to text of first mention. | |
Usage: python3 xmientityrename.py <FILE>... | |
Original file is modified in-place. | |
Only non-empty entities with numeric names are changed. | |
See https://github.com/nilsreiter/CorefAnnotator/issues/173""" | |
import os | |
import sys | |
from lxml import etree |
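# A very rough sketch of the renaming step. The element and attribute names used
# here ('Entity' elements with a 'label' attribute, 'Mention' elements with
# 'entity' and 'text' attributes) are placeholders; the actual CorefAnnotator
# XMI schema differs, so treat this only as an outline of the approach.
XMI_ID = '{http://www.omg.org/XMI}id'

def rename(filename):
    tree = etree.parse(filename)
    # Remember the covered text of the first mention of each entity.
    firstmention = {}
    for mention in tree.iter('Mention'):
        ref = mention.get('entity')
        if ref is not None and ref not in firstmention:
            firstmention[ref] = mention.get('text')
    for entity in tree.iter('Entity'):
        label = entity.get('label')
        # Only non-empty entities with numeric names are changed.
        if label and label.isdigit() and firstmention.get(entity.get(XMI_ID)):
            entity.set('label', firstmention[entity.get(XMI_ID)])
    tree.write(filename, xml_declaration=True, encoding='UTF-8')

if __name__ == '__main__':
    for filename in sys.argv[1:]:
        rename(filename)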
"""Preprocess movie review polarity dataset v2.0. | |
http://www.cs.cornell.edu/people/pabo/movie-review-data/ | |
""" | |
import os | |
import re | |
import glob | |
import random | |
from syntok.tokenizer import Tokenizer | |
def process(path, pattern, out): |
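    # A hedged sketch of what this function might do (the original body is not
    # shown): read the files matching pattern under path, tokenize them with
    # syntok, and write one review per line to the open file object out.
    tok = Tokenizer()
    for filename in sorted(glob.glob(os.path.join(path, pattern))):
        with open(filename, encoding='utf8') as inp:
            text = ' '.join(token.value for token in tok.tokenize(inp.read()))
        out.write(re.sub(r'\s+', ' ', text).strip() + '\n')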
"""Run with python -c 'import pyximport; pyximport.install(); import cellbench; cellbench.main()' | |
""" | |
from libc.stdint cimport uint32_t | |
from libc.math cimport sqrt, modf | |
from libc.math cimport round as c_round | |
ctypedef uint32_t Label | |
cdef inline size_t cellidx(short start, short end, short lensent, | |
Label nonterminals): |
"""Advent of Code 2017. http://adventofcode.com/2017 """ | |
import sys | |
import array | |
from collections import Counter, defaultdict | |
from operator import xor | |
from functools import reduce | |
from itertools import count | |
from binascii import hexlify | |
import numpy as np |