Last active
August 17, 2022 10:41
-
-
Save hankcs/46b9137016c769e4b6137104daf43a92 to your computer and use it in GitHub Desktop.
This script downloads and compiles the OntoNotes 2012 data into CoNLL format. Modified from https://github.com/allenai/allennlp/blob/c4c532d25e012dbe6ab1ac14bca75e53e0acc621/scripts/compile_coref_data.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# This script downloads and compiles the OntoNotes 2012 data in a helpful format
# for co-reference resolution. It generates 3 files:
# {train, dev, test}.<language>.v4_gold_conll (e.g. dev.english.v4_gold_conll),
# as well as a directory 'conll-2012' which contains the raw extracted data.
# The script downloads and runs some python scripts which require python 2.X.
#
# Usage: ./scripts/compile_coref_data.sh /path/to/ontonotes english
#   $1 - path to the OntoNotes release; its data/files/data subdirectory is
#        passed to the conversion script
#   $2 - language, used to select the annotation directory and to name the
#        output files
ONTONOTES_PATH=$1
LANGUAGE=$2
# Both arguments are required: an empty language would silently produce paths
# such as 'data//annotations' and output files named 'dev..v4_gold_conll'.
if [[ -z "${ONTONOTES_PATH}" || -z "${LANGUAGE}" ]]; then
  echo "USAGE: ./scripts/compile_coref_data.sh /path/to/ontonotes english" >&2
  exit 1
fi
# Download a gzipped tarball and unpack it into the current directory.
# Arguments:
#   $1 - base URL (without trailing slash)
#   $2 - archive file name; appended to the base URL and used as the local name
# Returns:
#   0 on success; non-zero if the download or the extraction fails.
# The downloaded archive is removed whether or not extraction succeeds.
function download_and_extract() {
  local base_url=$1
  local archive=$2
  local status=0

  # Stop early on a failed download instead of running tar/rm on a file that
  # does not exist.
  wget "${base_url}/${archive}" || return 1

  # Remember tar's status so the cleanup below cannot mask an extraction error.
  tar -xvzf "${archive}" || status=$?
  rm -f -- "${archive}"
  return "${status}"
}
# Concatenate every annotation file of one data partition into a single file
# in the current directory.
# Arguments:
#   $1 - partition directory name under conll-2012/<version>/data
#   $2 - prefix of the output file name
#   $3 - data version, e.g. v4
#   $4 - file suffix following the version, e.g. _gold_conll
#   $5 - language directory name
# Outputs:
#   Writes <prefix>.<language>.<version><suffix>, replacing any previous file.
# Returns:
#   0 on success; non-zero if no input file matches (the output file is then
#   left empty) or if concatenation fails.
function compile_partition() {
  local partition=$1
  local prefix=$2
  local version=$3
  local suffix=$4
  local language=$5
  local output="${prefix}.${language}.${version}${suffix}"
  local source_dir="conll-2012/${version}/data/${partition}/data/${language}/annotations"
  local -a inputs

  # Start from a fresh, empty output file.
  rm -f -- "${output}"
  : > "${output}"

  # Only the wildcards are left unquoted, so argument values containing spaces
  # or glob characters are taken literally.
  inputs=( "${source_dir}"/*/*/*/*."${version}${suffix}" )

  # An unmatched glob stays as the literal pattern; detect that explicitly
  # rather than passing a non-existent path to cat.
  if [[ ! -e "${inputs[0]:-}" ]]; then
    printf 'compile_partition: no *.%s files found under %s\n' \
      "${version}${suffix}" "${source_dir}" >&2
    return 1
  fi

  cat -- "${inputs[@]}" >> "${output}"
}
# Build the dev, train and test v4 gold CoNLL files for one language.
# Arguments:
#   $1 - language directory name, passed through to compile_partition
# Returns:
#   0 if all three partitions compiled; 1 if any of them failed. All three
#   partitions are attempted even when an earlier one fails.
function compile_language() {
  local language=$1
  local status=0

  compile_partition development dev v4 _gold_conll "${language}" || status=1
  compile_partition train train v4 _gold_conll "${language}" || status=1
  compile_partition test test v4 _gold_conll "${language}" || status=1

  return "${status}"
}
# Base URL of the CoNLL-2012 shared task downloads.
conll_url=https://conll.cemantix.org/2012/download

# Fetch and unpack the train/dev/test skeletons, the conversion scripts and
# the scorer. Abort on the first failed download rather than continuing with
# an incomplete conll-2012 tree.
download_and_extract "${conll_url}" conll-2012-train.v4.tar.gz || exit 1
download_and_extract "${conll_url}" conll-2012-development.v4.tar.gz || exit 1
download_and_extract "${conll_url}/test" conll-2012-test-key.tar.gz || exit 1
download_and_extract "${conll_url}/test" conll-2012-test-official.v9.tar.gz || exit 1
download_and_extract "${conll_url}" conll-2012-scripts.v3.tar.gz || exit 1
download_and_extract https://conll.cemantix.org/download reference-coreference-scorers.v8.01.tar.gz || exit 1

# -r is required here: after a previous run conll-2012/scorer is a directory,
# which plain 'rm -f' cannot remove; mv would then nest the freshly extracted
# scorer inside the old one instead of replacing it.
rm -rf conll-2012/scorer
mv reference-coreference-scorers conll-2012/scorer || exit 1

# Convert the ontonotes data into the CONLL format.
# -O pins the local file name; without it wget saves 'skeleton2conll.sh.1'
# when a stale copy exists and the stale script would be the one executed.
wget -O skeleton2conll.sh https://gist.githubusercontent.com/hankcs/924c54179a884f46530ad1ab0e5522a9/raw/312a26118ad078df6c6b40676c1588993d5ee03f/skeleton2conll.sh || exit 1
# The conversion script's exit status is only reported, not treated as fatal;
# compile_language below fails if no converted files were produced.
bash skeleton2conll.sh -D "${ONTONOTES_PATH}/data/files/data" conll-2012 "${LANGUAGE}" \
  || echo "WARNING: skeleton2conll.sh exited with a non-zero status" >&2
rm -f skeleton2conll.sh

compile_language "${LANGUAGE}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment