Skip to content

Instantly share code, notes, and snippets.

@kowey
Created March 24, 2011 16:03
Show Gist options
  • Select an option

  • Save kowey/885309 to your computer and use it in GitHub Desktop.

Select an option

Save kowey/885309 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Build rules for a redo-driven pipeline (the rules below call redo-ifchange
# and redo-stamp).  $1 names the target to build; rules that produce a single
# file write it to the path given in $3.
set -e -o pipefail

# Absolute path of the directory holding this script.  The lookup runs in a
# command substitution (a subshell), so the caller's working directory is left
# alone, and "$0" is quoted so a script path containing whitespace resolves.
SCRIPT_DIR=$(cd "$(dirname "$0")" > /dev/null && pwd)

# --- inputs ---------------------------------------------------------------
ORIGINALS_DIR=data # treat as read-only!
REFERENCE_DIR=${ORIGINALS_DIR}/hand-extracted
WORK_DIR_SRC=${ORIGINALS_DIR}/document-pairs

# --- tools ----------------------------------------------------------------
BIN_DIR=cabal-dev/bin             # project executables invoked by the rules
CALCULATORS_DIR=../calculators    # perl scoring scripts (bleu-scores, mteval)
RASP_HOME=/opt/rasp/              # scripts/rasp.sh is run from here

# --- generated directories ------------------------------------------------
PREPROCESSED_DIR=preprocessed
SENTENCE_SELECTED_DIR=sentence-selected
FRAGMENTS_DIR=fragments-extracted
PEERS_DIR=peers
SCORES_DIR=scores
DOCUMENT_LLR_1=doc-llr-1
DOCUMENT_LLR_2=doc-llr-2
SENTENCE_LLR=sent-llr
WORK_DIR=workspace
FRAGMENTS_DIR_BASELINE=fragments-baseline
FRAGMENTS_DIR_REFERENCE=fragments-reference
# ======================================================================
# HELPERS
# ======================================================================
# Create directory $1 (with any missing parents) unless something already
# exists at that path.  An existing non-directory is left untouched and is not
# treated as an error.
# Arguments: $1 - path of the directory to create
mkdir_if_missing () {
  if [ ! -e "$1" ]; then
    mkdir -p -- "$1"
  fi
}
# Copy the original corpus file behind a workspace target to an output path.
# The source is ${WORK_DIR_SRC}/<stem>/<file>, where <file> is the base name of
# the target and <stem> is <file> with its last extension removed.
# Globals:   WORK_DIR (created if missing), WORK_DIR_SRC (read)
# Arguments: $1 - target name; only its base name is used
#            $2 - path the copy is written to
copy_from_original () {
  local target=$1
  local dest=$2
  local file_name src_file
  mkdir_if_missing "${WORK_DIR}"
  file_name=$(basename "$target")
  src_file=${WORK_DIR_SRC}/${file_name%.*}/${file_name}
  cp -- "$src_file" "$dest"
}
# Print the immediate subdirectories of $1, one path per line, in the order
# find reports them.  Regular files and deeper levels are not listed.
# Arguments: $1 - directory to inspect
directories_in () {
  find "$1" -mindepth 1 -maxdepth 1 -type d
}
# Produce an HTML rendering of every subdirectory of a source directory.
# For each immediate subdirectory of $1, from-handwritten is run with -h -t,
# taking that subdirectory as input and the same-named subdirectory of $2 as
# output; fragment.css (from the current directory) is then copied into it.
# The list of generated *.html paths (not their contents) is fed to redo-stamp.
# Globals:   BIN_DIR (read)
# Arguments: $1 - source directory; also declared as a redo dependency
#            $2 - target directory; created if missing
prettify() {
  local src=$1
  local tgt=$2
  local d td
  # Split the directory listing on newlines only, so that directory names
  # containing spaces are kept in one piece.
  local IFS=$'\n'
  redo-ifchange "$src"
  mkdir_if_missing "$tgt"
  for d in $(directories_in "$src"); do
    td=$tgt/$(basename "$d")
    "${BIN_DIR}/from-handwritten" -h -t "$d" "$td"
    cp fragment.css "$td"
  done
  find "$tgt" -name '*.html' | redo-stamp
}
# ======================================================================
# MAIN
# ======================================================================
case "$1" in
# Shorthand targets: each one simply requests the real target(s) it stands for.
llr-1) redo-ifchange "${DOCUMENT_LLR_1}" ;;
llr-2) redo-ifchange "${DOCUMENT_LLR_2}" ;;
sent-selection) redo-ifchange "${SENTENCE_SELECTED_DIR}" ;;
fragments)
  redo-ifchange "${FRAGMENTS_DIR}" "${FRAGMENTS_DIR}.ideal"
  ;;
pretty)
  mkdir_if_missing pretty
  redo-ifchange \
    "pretty/${FRAGMENTS_DIR_REFERENCE}" \
    "pretty/${FRAGMENTS_DIR}" \
    "pretty/${FRAGMENTS_DIR}.ideal"
  ;;
# Remove the workspace, the pre-processed corpus and the sentence-selection
# outputs (including its .score/.reference/.subset siblings).
clean)
  rm -rf "${WORK_DIR}" "${PREPROCESSED_DIR}" "${SENTENCE_SELECTED_DIR}" \
    "${SENTENCE_SELECTED_DIR}".{score,reference,subset}
  ;;
# ----------------------------------------------------------------------
# I. setup (after doc-alignment)
# ----------------------------------------------------------------------
${WORK_DIR})
  mkdir_if_missing "${WORK_DIR}"
  # Annoyingly the corpus has different names for Hills DB-derived documents
  # and for Wikipedia documents (because we have to align them). So to make
  # life a bit simpler, let's assume document alignment is correct and
  # give them all the same names
  #
  # The -name pattern is quoted so that find, not the shell, expands it; left
  # bare, a .wiki file in the current directory would replace the pattern.
  # NOTE: the loop still word-splits find's output, so paths under
  # ${WORK_DIR_SRC} must not contain whitespace.
  for i in $(find "${WORK_DIR_SRC}/" -name '*.wiki'); do
    redo-ifchange "$i"
    # desired ("good") base name: the directory that holds the .wiki file
    GBN=$(basename "$(dirname "$i")")
    redo-ifchange "${WORK_DIR}/$GBN.wiki.chosen" "${WORK_DIR}/$GBN.json" "${WORK_DIR}/$GBN.geo-json"
  done
  cat "${WORK_DIR}"/*.{json,geo-json,wiki.chosen} | redo-stamp
  ;;
${WORK_DIR}/*.json | ${WORK_DIR}/*.geo-json)
  # same-named file from the matching document-pair directory
  copy_from_original "$1" "$3"
  ;;
${WORK_DIR}/*.wiki.chosen)
  mkdir_if_missing "${WORK_DIR}"
  # workspace/NAME.wiki.chosen -> NAME
  BN=$(basename "$1" .wiki.chosen)
  # The document-pair directory is expected to hold exactly one .wiki file,
  # whatever its name; anything else is reported instead of being handed to cp.
  SRC_FILES=("${WORK_DIR_SRC}/$BN"/*.wiki)
  if [ "${#SRC_FILES[@]}" -ne 1 ] || [ ! -e "${SRC_FILES[0]}" ]; then
    echo "expected exactly one .wiki file in ${WORK_DIR_SRC}/$BN" >&2
    exit 1
  fi
  cp "${SRC_FILES[0]}" "$3"
  ;;
# ----------------------------------------------------------------------
# II. pre-processing
# ----------------------------------------------------------------------
*.nlg-json)
  N=${1%.*}
  # NOTE(review): $N.geo-json is declared as a dependency but not passed on the
  # command line; presumably create-nlg-input locates it itself -- confirm.
  redo-ifchange "$N.json" "$N.geo-json"
  "${BIN_DIR}/create-nlg-input" "$N.json" -o "$3"
  ;;
${WORK_DIR}/*.en)
  N=${1%.*}
  redo-ifchange "$N.wiki.chosen" "$N.nlg-json"
  # Intermediate results are kept next to the target as $N.wiki.<stage>.
  # 1. sentence segmentation
  "${BIN_DIR}/take-sentences" "$N.wiki.chosen" -o "$N.wiki.segmented"
  # 2. tokenisation
  # -
  # hills db doesn't use accents in its recorded names for hills, so we translit them out
  # while we're at it we also add spaces between punctuation
  iconv -f utf-8 -t 'ascii//TRANSLIT' "$N.wiki.segmented" > "$N.wiki.segmented-ascii"
  "${BIN_DIR}/tokenise-sentences" "$N.wiki.segmented-ascii" -o "$N.wiki.tokenised"
  # 3. lemmatisation
  # every non-empty line gets a "^ " prefix before being handed to rasp.sh
  sed -e 's/^\(..*\)$/^ \1/' < "$N.wiki.tokenised" > "$N.wiki.pre-rasp"
  # rasp_parse / rasp_sentence are passed to rasp.sh through its environment
  # (presumably replacing those two stages with a pass-through -- confirm)
  rasp_parse=cat rasp_sentence=cat "${RASP_HOME}/scripts/rasp.sh" \
    < "$N.wiki.pre-rasp" > "$N.wiki.rasp"
  # Post-process the RASP output: drop each "+..." run up to the next "_",
  # strip the leading "^_^ " marker, wrap every line in [ ], and lower-case.
  # The tr sets are quoted so the shell cannot glob-expand them.
  sed -e 's/+[^_]*_/_/g' -e 's/^\^_\^ //' -e 's/^/[/' -e 's/$/]/' < "$N.wiki.rasp" \
    | tr 'A-Z' 'a-z' > "$3"
  ;;
${WORK_DIR}/*.data)
  N=${1%.*}
  redo-ifchange "$N.nlg-json"
  "${BIN_DIR}/flatten-data" "$N.nlg-json" "$3"
  ;;
${PREPROCESSED_DIR})
  redo-ifchange "${WORK_DIR}"
  mkdir_if_missing "${PREPROCESSED_DIR}"
  # one .data/.en pair per workspace/*.json document
  for i in "${WORK_DIR}"/*.json; do
    N=${i%.*}
    redo-ifchange "$N.data" "$N.en"
    cp "$N.data" "$N.en" "${PREPROCESSED_DIR}"
  done
  cat "${PREPROCESSED_DIR}"/*.data "${PREPROCESSED_DIR}"/*.en | redo-stamp
  ;;
# ----------------------------------------------------------------------
# III. candidate sentence selection
# IV. token filtering
# ----------------------------------------------------------------------
${DOCUMENT_LLR_1})
  # lexicon built from the pre-processed corpus
  redo-ifchange "${PREPROCESSED_DIR}"
  "${BIN_DIR}/build-llr-lexicon" "${PREPROCESSED_DIR}" "${DOCUMENT_LLR_1}"
  redo-stamp < "${DOCUMENT_LLR_1}/lexicon"
  ;;
${SENTENCE_SELECTED_DIR})
  mkdir_if_missing "$1"
  # plain `redo`, not redo-ifchange: the three variants are rebuilt
  # unconditionally whenever this target is built
  redo "${SENTENCE_SELECTED_DIR}/basic" "${SENTENCE_SELECTED_DIR}/BASELINE-s1" "${SENTENCE_SELECTED_DIR}/BASELINE-s2"
  ;;
${SENTENCE_SELECTED_DIR}/BASELINE-s1)
  # ${DOCUMENT_LLR_1}/pos-probs is read below, so the lexicon directory is
  # declared as a dependency rather than relying on `basic` being built first.
  redo-ifchange "${PREPROCESSED_DIR}" "${DOCUMENT_LLR_1}"
  "${BIN_DIR}/create-baseline" -s --cutoff 1 "${PREPROCESSED_DIR}" "$1"
  # NOTE(review): the last argument is ${WORK_DIR}, as in the `basic` rule;
  # confirm the filtered tokens are meant to land there and not under $1.
  "${BIN_DIR}/filter-data-tokens" "$1" "${DOCUMENT_LLR_1}/pos-probs" "${WORK_DIR}"
  ;;
${SENTENCE_SELECTED_DIR}/BASELINE-s2)
  redo-ifchange "${PREPROCESSED_DIR}"
  # NOTE(review): same --cutoff 1 as BASELINE-s1; confirm this is intended.
  "${BIN_DIR}/create-baseline" -s --cutoff 1 "${PREPROCESSED_DIR}" "$1"
  ;;
${SENTENCE_SELECTED_DIR}/basic)
  redo-ifchange "${WORK_DIR}" "${PREPROCESSED_DIR}"
  redo-ifchange "${DOCUMENT_LLR_1}"
  SDIR=$1
  # both filters write *-selected files into the workspace ...
  "${BIN_DIR}/filter-sentences" "${PREPROCESSED_DIR}" "${DOCUMENT_LLR_1}/pos-probs" "${WORK_DIR}"
  "${BIN_DIR}/filter-data-tokens" "${WORK_DIR}" "${DOCUMENT_LLR_1}/pos-probs" "${WORK_DIR}"
  mkdir_if_missing "$SDIR"
  # ... which are then copied out under their plain .en / .data names
  for i in "${WORK_DIR}"/*.en-selected; do
    BN=$(basename "${i%.*}")
    cp "${WORK_DIR}/$BN.en-selected" "$SDIR/$BN.en"
    cp "${WORK_DIR}/$BN.data-selected" "$SDIR/$BN.data"
  done
  cat "$SDIR"/*.en "$SDIR"/*.data | redo-stamp
  # NOTE: as a sanity check, the sentence-selected dir should have
  # the exact same structure as the preprocessed dir
  ;;
${DOCUMENT_LLR_2})
  # second lexicon, built from the `basic` sentence selection
  redo-ifchange "${SENTENCE_SELECTED_DIR}"
  "${BIN_DIR}/build-llr-lexicon" "${SENTENCE_SELECTED_DIR}/basic" "${DOCUMENT_LLR_2}"
  redo-stamp < "${DOCUMENT_LLR_2}/lexicon"
  ;;
# ----------------------------------------------------------------------
# V. fragment extraction
# ----------------------------------------------------------------------
${FRAGMENTS_DIR})
  redo-ifchange "${SENTENCE_SELECTED_DIR}"
  redo-ifchange "${DOCUMENT_LLR_2}"
  mkdir_if_missing "${FRAGMENTS_DIR}"
  "${BIN_DIR}/detect-fragments" "${SENTENCE_SELECTED_DIR}" "${DOCUMENT_LLR_2}/lexicon" "${FRAGMENTS_DIR}"
  # The two -name tests are grouped: -exec binds more tightly than -o, so
  # without the parentheses only the *.data files would be fed to the stamp.
  find "${FRAGMENTS_DIR}" \( -name '*.en' -o -name '*.data' \) -exec cat {} + | redo-stamp
  ;;
${FRAGMENTS_DIR}.ideal)
  # same extraction, run on the reference ("ideal") sentence selection
  redo-ifchange "${SENTENCE_SELECTED_DIR}.reference"
  redo-ifchange "${DOCUMENT_LLR_2}"
  mkdir_if_missing "${FRAGMENTS_DIR}.ideal"
  "${BIN_DIR}/detect-fragments" "${SENTENCE_SELECTED_DIR}.reference" "${DOCUMENT_LLR_2}/lexicon" "${FRAGMENTS_DIR}.ideal"
  # grouped for the same reason as in the rule above
  find "${FRAGMENTS_DIR}.ideal" \( -name '*.en' -o -name '*.data' \) -exec cat {} + | redo-stamp
  ;;
# ----------------------------------------------------------------------
# Visualisation
# ----------------------------------------------------------------------
pretty/${FRAGMENTS_DIR_REFERENCE})
  # HTML view of the hand-extracted reference, styled with fragment.css
  redo-ifchange "${FRAGMENTS_DIR_REFERENCE}"
  "${BIN_DIR}/from-handwritten" -h "${REFERENCE_DIR}" "$1"
  cp fragment.css "$1"
  cat "$1"/*.html "$1"/*.css | redo-stamp
  ;;
pretty/${FRAGMENTS_DIR} | pretty/${FRAGMENTS_DIR}.ideal)
  # pretty/<dir> is rendered from the same-named <dir> at the top level
  prettify "${1##*/}" "$1"
  ;;
# ----------------------------------------------------------------------
# Scoring
# ----------------------------------------------------------------------
${SENTENCE_SELECTED_DIR}.subset)
  # For every sentence-selection variant, keep only the documents that have a
  # hand-extracted reference (${REFERENCE_DIR}/*.frag).
  X_DIR=${1%.*}
  redo-ifchange "${X_DIR}"
  mkdir_if_missing "${X_DIR}.subset"
  for sd in $(directories_in "${X_DIR}"); do
    sd2=${X_DIR}.subset/$(basename "$sd")
    mkdir_if_missing "$sd2"
    for i in "${REFERENCE_DIR}"/*.frag; do
      BN=$(basename "${i%.*}")
      cp "$sd/$BN.en" "$sd/$BN.data" "$sd2"
    done
  done
  # The copies live one level down, in the per-variant subdirectories, so the
  # stamp has to glob through that level to find any files.
  cat "${X_DIR}.subset"/*/*.{en,data} | redo-stamp
  ;;
${SENTENCE_SELECTED_DIR}.reference)
  redo-ifchange "${PREPROCESSED_DIR}"
  "${BIN_DIR}/from-handwritten" "${REFERENCE_DIR}" "${SENTENCE_SELECTED_DIR}.reference" -c --sourcetext "${PREPROCESSED_DIR}"
  # We also assume the reference data subset
  redo-ifchange "${FRAGMENTS_DIR_REFERENCE}"
  # NAME.data-subset from the reference fragments becomes NAME.data here
  for i in "${FRAGMENTS_DIR_REFERENCE}"/*.data-subset; do
    BN=$(basename "${i%.*}")
    cp "$i" "${SENTENCE_SELECTED_DIR}.reference/$BN.data"
  done
  cat "${SENTENCE_SELECTED_DIR}.reference"/*.en "${SENTENCE_SELECTED_DIR}.reference"/*.data | redo-stamp
  ;;
${FRAGMENTS_DIR_BASELINE})
  redo-ifchange "${PREPROCESSED_DIR}" "${SENTENCE_SELECTED_DIR}"
  out_dir=$1
  mkdir_if_missing "$out_dir"
  # Three baselines (no cutoff, cutoff 1, cutoff 2) over the pre-processed
  # corpus, then the same three over the sentence-selected directory with an
  # "ideal-" prefix.  Each entry is <input dir>:<output prefix>.
  for run in "${PREPROCESSED_DIR}:" "${SENTENCE_SELECTED_DIR}:ideal-"; do
    in_dir=${run%%:*}
    prefix=${run#*:}
    "${BIN_DIR}/create-baseline" "$in_dir" "$out_dir/${prefix}BASELINE_A"
    "${BIN_DIR}/create-baseline" --cutoff 1 "$in_dir" "$out_dir/${prefix}BASELINE_f1"
    "${BIN_DIR}/create-baseline" --cutoff 2 "$in_dir" "$out_dir/${prefix}BASELINE_f2"
  done
  find "$out_dir" \( -name '*fragments' -o -name '*.data-subset' \) -print \
    | xargs cat \
    | redo-stamp
  ;;
${FRAGMENTS_DIR_REFERENCE})
  # reference fragments derived from the hand-extracted files
  redo-ifchange "${PREPROCESSED_DIR}"
  mkdir_if_missing "${FRAGMENTS_DIR_REFERENCE}"
  "${BIN_DIR}/from-handwritten" "${REFERENCE_DIR}" "${FRAGMENTS_DIR_REFERENCE}" --sourcetext "${PREPROCESSED_DIR}"
  cat "${FRAGMENTS_DIR_REFERENCE}"/*.{en-fragments,data-fragments,data-subset} \
    | redo-stamp
  ;;
${SCORES_DIR}/*.sentence-selection)
  redo-ifchange "${SENTENCE_SELECTED_DIR}.reference"
  redo-ifchange "${SENTENCE_SELECTED_DIR}.subset"
  # scores/VARIANT.sentence-selection scores the VARIANT subdirectory of the
  # subset against the reference selection
  variant=$(basename "${1%.*}")
  "${BIN_DIR}/score-sentence-selection" -e .en \
    "${SENTENCE_SELECTED_DIR}.reference" \
    "${SENTENCE_SELECTED_DIR}.subset/${variant}" > "$3"
  ;;
${PEERS_DIR}.full)
  # Collect the *-fragments files of every extracted and baseline variant,
  # for all documents (not only those with a hand-extracted reference).
  TARGET=${PEERS_DIR}.full
  redo-ifchange "${FRAGMENTS_DIR}" "${FRAGMENTS_DIR}.ideal" "${FRAGMENTS_DIR_BASELINE}"
  mkdir_if_missing "${TARGET}"
  # fragment extraction
  for d in $(directories_in "${FRAGMENTS_DIR}") $(directories_in "${FRAGMENTS_DIR_BASELINE}"); do
    DBN=$(basename "$d")
    # -p: the target directory is not cleared between builds, so the
    # per-variant directory may already exist from an earlier run
    mkdir -p "${TARGET}/$DBN"
    cp -R "$d"/*.{en,data}-fragments "${TARGET}/$DBN"
  done
  find "${TARGET}" -name '*fragments' | xargs cat | redo-stamp
  ;;
${PEERS_DIR})
  redo-ifchange "${FRAGMENTS_DIR}" "${FRAGMENTS_DIR}.ideal" "${FRAGMENTS_DIR_REFERENCE}" "${FRAGMENTS_DIR_BASELINE}"
  # base names (extension stripped) of the hand-extracted *.frag files; only
  # these documents are copied into the peer directories
  FRAGLIST=$(
    for frag in ${REFERENCE_DIR}/*.frag; do
      basename ${frag%.*}
    done
  )
  # always start from an empty peers directory
  rm -rf "${PEERS_DIR}"
  # reference
  ref_peer=${PEERS_DIR}/REFERENCE
  mkdir -p "$ref_peer"
  for stem in $FRAGLIST; do
    cp ${FRAGMENTS_DIR_REFERENCE}/$stem.* "$ref_peer"
  done
  # fragment extraction
  for src in $(directories_in ${FRAGMENTS_DIR}) $(directories_in ${FRAGMENTS_DIR_BASELINE}); do
    peer=${PEERS_DIR}/$(basename $src)
    mkdir "$peer"
    for stem in $FRAGLIST; do
      cp $src/$stem.* "$peer"
    done
  done
  # fragment extraction from ideal sentence selection
  for src in $(directories_in ${FRAGMENTS_DIR}.ideal); do
    peer=${PEERS_DIR}/ideal-$(basename $src)
    mkdir "$peer"
    for stem in $FRAGLIST; do
      cp $src/$stem.* "$peer"
    done
  done
  find "${PEERS_DIR}" \( -name '*fragments' -o -name '*subset' \) -print \
    | xargs cat \
    | redo-stamp
  ;;
${SCORES_DIR})
  mkdir_if_missing "${SCORES_DIR}"
  redo-ifchange "${PEERS_DIR}"
  mkdir_if_missing "${SCORES_DIR}/sentence-selection"
  # one scores/VARIANT.sentence-selection target per sentence-selection variant
  sds=$(for sd in $(directories_in "${SENTENCE_SELECTED_DIR}"); do
    echo "${SCORES_DIR}/$(basename "$sd").sentence-selection"
  done)
  # deliberately unquoted: $sds is a whitespace-separated list of targets
  redo-ifchange $sds
  "${BIN_DIR}/dice-scores" "${PEERS_DIR}/REFERENCE" "${PEERS_DIR}" "${SCORES_DIR}"
  chmod u+x "${CALCULATORS_DIR}"/{bleu-scores,mteval-v*}.pl
  perl -I "${CALCULATORS_DIR}" "${CALCULATORS_DIR}/bleu-scores.pl" "${PEERS_DIR}/REFERENCE" "${PEERS_DIR}" "${SCORES_DIR}"
  "${BIN_DIR}/summarise-scores" "${SCORES_DIR}" > "${SCORES_DIR}/summary"
  # The sentence-selection score files carry the full .sentence-selection
  # extension (see the target names built above), so the stamp globs for that.
  cat "${SCORES_DIR}"/{*.sentence-selection,*.dice,*.bleu*} | redo-stamp
  ;;
# ----------------------------------------------------------------------
# etc
# ----------------------------------------------------------------------
# Anything not matched above is not a known target: report it and fail.
*)
  printf "no rule to build '%s'\n" "$1" >&2
  exit 1
  ;;
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment