These are XPath macros used in our DSH paper on physical descriptions of appearance.
- uiterlijkN = looksN
- uiterlijkA = looksA
- persoon = person
- kleding = clothing
| """A baseline Bag-of-Words text classification. | |
| Usage: python3 classify.py <train.txt> <test.txt> [--svm] [--tfidf] [--bigrams] | |
| train.txt and test.txt should contain one "document" per line, | |
| first token should be the label. | |
| The default is to use regularized Logistic Regression and relative frequencies. | |
| Pass --svm to use Linear SVM instead. | |
| Pass --tfidf to use tf-idf instead of relative frequencies. | |
| Pass --bigrams to use bigrams instead of unigrams. | |
| """ |
| """Prepare https://benjaminvdb.github.io/110kDBRD/ for use with fastText. | |
| Divide train set into 90% train and 10% dev, balance positive and negative | |
| reviews, and shuffle. Write result in fastText format.""" | |
| import os | |
| import re | |
| import random | |
| import glob | |
| from syntok.tokenizer import Tokenizer |
| """Convert XML output of Stanford CoreNLP to CoNLL 2012 format. | |
| $ ./corenlp.sh -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref \ | |
| -output.printSingletonEntities true \ | |
| -file /tmp/example.txt | |
| $ python3 corenlpxmltoconll2012.py example.txt.xml > example.conll | |
| """ | |
| import re | |
| import sys | |
| from lxml import etree |
| <?xml version='1.0' encoding='UTF-8'?> | |
| <volume id="W13"> | |
| <paper id="5700"> | |
| <title>Proceedings of The 13th International Conference on Parsing Technologies (IWPT 2013)</title> | |
| <editor><first>Harry</first><last>Bunt</last></editor> | |
| <editor><first>Khalil</first><last>Sima'an</last></editor> | |
| <editor><first>Liang</first><last>Huang</last></editor> | |
| <month>November</month> | |
| <year>2013</year> | |
| <address>Nara, Japan</address> |
| """Compute complexity metrics from Universal Dependencies. | |
| Usage: python3 udstyle.py [OPTIONS] FILE... | |
| --parse=LANG parse texts with Stanza; provide 2 letter language code | |
| --output=FILENAME write result to a tab-separated file. | |
| --persentence report per sentence results, not mean per document | |
| Reported metrics: | |
| - LEN: mean sentence length in words (excluding punctuation). | |
| - MDD: mean dependency distance (Gibson, 1998). | |
| - NDD: normalized dependency distance (Lei & Jockers, 2018). |
| """Rename numeric entity labels in .xmi file to text of first mention. | |
| Usage: python3 xmientityrename.py <FILE>... | |
| Original file is modified in-place. | |
| Only non-empty entities with numeric names are changed. | |
| See https://github.com/nilsreiter/CorefAnnotator/issues/173""" | |
| import os | |
| import sys | |
| from lxml import etree |
| """Preprocess movie review polarity dataset v2.0. | |
| http://www.cs.cornell.edu/people/pabo/movie-review-data/ | |
| """ | |
| import os | |
| import re | |
| import glob | |
| import random | |
| from syntok.tokenizer import Tokenizer | |
| def process(path, pattern, out): |
| """Run with python -c 'import pyximport; pyximport.install(); import cellbench; cellbench.main()' | |
| """ | |
| from libc.stdint cimport uint32_t | |
| from libc.math cimport sqrt, modf | |
| from libc.math cimport round as c_round | |
| ctypedef uint32_t Label | |
| cdef inline size_t cellidx(short start, short end, short lensent, | |
| Label nonterminals): |
| """Advent of Code 2017. http://adventofcode.com/2017 """ | |
| import sys | |
| import array | |
| from collections import Counter, defaultdict | |
| from operator import xor | |
| from functools import reduce | |
| from itertools import count | |
| from binascii import hexlify | |
| import numpy as np |