Skip to content

Instantly share code, notes, and snippets.

@dustalov
dustalov / bradley_terry.py
Last active August 27, 2023 16:53
Answer Aggregation with Bradley-Terry
#!/usr/bin/env python3
"""
An implementation of the Bradley-Terry ranking aggregation algorithm from the paper
MM algorithms for generalized Bradley-Terry models
<https://doi.org/10.1214/aos/1079120141>.
"""
__author__ = 'Dmitry Ustalov'
__copyright__ = 'Copyright 2021 Dmitry Ustalov'
@dustalov
dustalov / dirbackup
Last active September 12, 2022 20:48
Miscellaneous scripts for nearly everyday use
#!/bin/sh
# dirbackup: archive the current directory into ../<dirname>.tar.xz.
#
# The archive is created next to the current directory (in its parent) and
# contains a single top-level entry named after the directory. Files matching
# '*~' are excluded.

# Set the options here rather than on the shebang line, so that they are still
# in effect when the script is invoked as `sh dirbackup`.
set -eu

CWD=$(basename "$PWD")

# Export explicitly: POSIX leaves it unspecified whether a variable assignment
# that prefixes the special builtin `exec` is exported to the new process.
# "-T 0" is passed to xz through XZ_OPT.
XZ_OPT="-T 0"
export XZ_OPT

# `--` keeps a directory name that starts with '-' from being parsed as an
# option by tar.
exec tar --exclude '*~' -C ../ -cJvf "../$CWD.tar.xz" -- "$CWD"
@dustalov
dustalov / Makefile
Last active August 31, 2019 21:31
Chinese Whispers and Telephone Game Performance Evaluation
WATSET ?= ../watset-java/target/watset.jar
LCC ?= ../lcc
export LANG:=en_US.UTF-8
export LC_COLLATE:=C
export CLASSPATH := $(WATSET)
nodes:
cut -f1,2 $(LCC)/eng_news_2016_10K/eng_news_2016_10K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
cut -f1,2 $(LCC)/eng_news_2016_30K/eng_news_2016_30K-co_s.txt | sed -re 's/\t/\n/g' | sort -u | wc -l
@dustalov
dustalov / sigf.py
Last active February 22, 2023 16:50
An implementation of the sigf toolkit for randomization tests in Python 3
#!/usr/bin/env python3
__author__ = 'Dmitry Ustalov'
__credits__ = 'Sebastian Padó'
__license__ = 'MIT'
# This is an MIT-licensed implementation of the sigf toolkit for randomization tests:
# https://nlpado.de/~sebastian/software/sigf.shtml
import random
@dustalov
dustalov / collocation.groovy
Last active June 23, 2019 21:44
Watset (Java) Performance Measurement
#!/usr/bin/env groovy
import org.apache.commons.math3.stat.descriptive.moment.Mean
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation
import org.jgrapht.graph.SimpleWeightedGraph
import org.jgrapht.util.SupplierUtil
import org.nlpub.watset.graph.ChineseWhispers
import org.nlpub.watset.graph.NodeWeighting
import org.nlpub.watset.graph.MaxMax
import org.nlpub.watset.eval.Measurer
import org.nlpub.watset.graph.Watset
@dustalov
dustalov / Makefile
Last active January 11, 2018 12:50
Extracting and cross-validating the WCL dataset of the 1.0 version
LC_COLLATE = C
SEED = 1337
WCL_WRAPPER = /srv/definitions/wcl-extract
measure:
./measure.py
kfold: wiki_really_all.txt
./kfold.py --seed=$(SEED) $<
@dustalov
dustalov / nmpu.py
Last active January 2, 2018 15:54
Normalized Modified Purity in Python.
#!/usr/bin/env python
# This script computes the normalized modified purity and inverse purity
# as described in this paper: https://aclweb.org/anthology/P14-1097.
# In fact, this program is currently quite a rough translation of
# the evaluation-verb-classes.perl script provided by Daisuke Kawahara.
import argparse
import re
import sys
@dustalov
dustalov / ztest.awk
Last active October 9, 2022 21:25
Pairwise statistical significance test in AWK using Z-test.
#!/usr/bin/awk -f
BEGIN {
# significance level
if (length(ALPHA) == 0) ALPHA = 0.05;
# standard error estimation method: "basic" or "pooled"
if (length(SE) == 0) SE = "basic";
# one-tailed or two-tailed?
if (TAILS != 2) TAILS = 1;
@dustalov
dustalov / ExtractRelations.java
Last active March 21, 2021 19:33
Extract semantic relations from Wiktionary using JWKTL.
import de.tudarmstadt.ukp.jwktl.JWKTL;
import de.tudarmstadt.ukp.jwktl.api.filter.WiktionaryEntryFilter;
import de.tudarmstadt.ukp.jwktl.api.util.Language;
import java.io.File;
import java.util.Locale;
public class ExtractRelations {
public static void main(String[] args) {
if (args.length != 1) {
System.err.println("Usage: java ExtractRelations.java database [filter]");
@dustalov
dustalov / decoder.sh
Created September 13, 2016 20:25
A brute force decoder of Cyrillic strings with unknown charset combination.
#!/bin/bash -e
S=$(head -1)
CHARSETS=(utf8 cp1251 cp1252 koi8r koi8u iso-8859-5 maccyrillic)
for c1 in ${CHARSETS[*]}; do
for c2 in ${CHARSETS[*]}; do
for c3 in ${CHARSETS[*]}; do
for c4 in ${CHARSETS[*]}; do
echo -ne "$c1\t$c2\t$c3\t$c4\t"
<<<$S iconv -f=$c1 -t=$c2 -c | iconv -f=$c3 -t=$c4 -c
done