Created
March 24, 2011 16:03
-
-
Save kowey/885309 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# redo build script for the fragment-extraction pipeline.
# Invoked by redo as: script <target> <basename> <tempfile>
#   $1 - target to build, $3 - temp output redo renames to $1 on success.
set -e -o pipefail

# Absolute directory containing this script, resolved without leaving the
# caller's working directory changed.  $(...) and quoting replace the
# original unquoted backticks, which broke on paths containing spaces.
pushd "$(dirname "$0")" > /dev/null
SCRIPT_DIR=$(pwd)
popd > /dev/null

# Input locations.
ORIGINALS_DIR=data # treat as read-only!
REFERENCE_DIR=${ORIGINALS_DIR}/hand-extracted
WORK_DIR_SRC=${ORIGINALS_DIR}/document-pairs

# Tooling locations.
BIN_DIR=cabal-dev/bin
CALCULATORS_DIR=../calculators
RASP_HOME=/opt/rasp/

# Derived (re-buildable) stage directories.
PREPROCESSED_DIR=preprocessed
SENTENCE_SELECTED_DIR=sentence-selected
FRAGMENTS_DIR=fragments-extracted
PEERS_DIR=peers
SCORES_DIR=scores
DOCUMENT_LLR_1=doc-llr-1
DOCUMENT_LLR_2=doc-llr-2
SENTENCE_LLR=sent-llr
WORK_DIR=workspace
FRAGMENTS_DIR_BASELINE=fragments-baseline
FRAGMENTS_DIR_REFERENCE=fragments-reference
| # ====================================================================== | |
| # HELPERS | |
| # ====================================================================== | |
# Create directory $1 (and any missing parents) unless a filesystem entry
# with that name already exists.  $1 is quoted so paths containing spaces
# or glob characters work; the original unquoted test broke on them.
mkdir_if_missing () {
  if [ ! -e "$1" ]; then
    mkdir -p -- "$1"
  fi
}
# Copy one document file from the read-only originals tree into place.
#   $1 - redo target path; only its basename is used
#   $2 - output path to write (redo's $3 temp file)
# The source lives at ${WORK_DIR_SRC}/<stem>/<basename>, where <stem> is
# the basename with its final extension stripped.  All expansions are
# quoted (the originals were not) so odd filenames cannot word-split.
copy_from_original () {
  local redo_target=$1
  local out=$2
  local bn
  mkdir_if_missing "${WORK_DIR}"
  bn=$(basename "$redo_target")
  cp "${WORK_DIR_SRC}/${bn%.*}/$bn" "$out"
}
# Print the immediate subdirectories of $1, one absolute-or-relative path
# per line (depth exactly 1, directories only).  $1 is quoted so a root
# path containing spaces does not word-split.
directories_in () {
  find "$1" -mindepth 1 -maxdepth 1 -type d
}
# Render each per-document subdirectory of fragment output $1 to HTML
# under $2 with the shared stylesheet copied alongside, then pipe the
# generated HTML through redo-stamp so redo tracks output content.
#   $1 - source fragments directory (declared as a redo dependency)
#   $2 - target "pretty" directory (created if missing)
# Expansions are quoted (the originals were not); the for-loop over
# $(directories_in ...) still word-splits, which is safe only while
# document directory names contain no whitespace — as elsewhere here.
prettify() {
  local src=$1
  local tgt=$2
  local d td
  redo-ifchange "$src"
  mkdir_if_missing "$tgt"
  for d in $(directories_in "$src"); do
    td=$tgt/$(basename "$d")
    "${BIN_DIR}"/from-handwritten -h -t "$d" "$td"
    cp fragment.css "$td"
  done
  find "$tgt" -name '*.html' | redo-stamp
}
# ======================================================================
# MAIN
# ======================================================================
# redo dispatch table.  redo invokes this script with:
#   $1 - the target being (re)built
#   $2 - the target basename (unused here)
#   $3 - a temporary output file that redo atomically renames to $1
# Directory-producing arms write into the directory directly and pipe its
# contents through redo-stamp so redo tracks content rather than mtimes.
case $1 in
# --- phony convenience targets delegating to real stage directories ---
llr-1)
redo-ifchange ${DOCUMENT_LLR_1}
;;
llr-2)
redo-ifchange ${DOCUMENT_LLR_2}
;;
sent-selection)
redo-ifchange ${SENTENCE_SELECTED_DIR}
;;
fragments)
redo-ifchange ${FRAGMENTS_DIR} ${FRAGMENTS_DIR}.ideal
;;
pretty)
mkdir_if_missing pretty
redo-ifchange pretty/${FRAGMENTS_DIR_REFERENCE} pretty/${FRAGMENTS_DIR} pretty/${FRAGMENTS_DIR}.ideal
;;
clean)
# Remove derived data only; ${ORIGINALS_DIR} is never touched.
rm -rf ${WORK_DIR} ${PREPROCESSED_DIR} ${SENTENCE_SELECTED_DIR}
rm -rf ${SENTENCE_SELECTED_DIR}.{score,reference,subset}
;;
# ----------------------------------------------------------------------
# I. setup (after doc-alignment)
# ----------------------------------------------------------------------
${WORK_DIR})
mkdir_if_missing ${WORK_DIR}
# Annoyingly the corpus has different names for Hills DB-derived documents
# and for Wikipedia documents (because we have to align them). So to make
# life a bit simpler, let's assume document alignment is correct and
# give them all the same names
# NOTE(review): *.wiki is unquoted, so the shell expands it first if any
# *.wiki file exists in the current directory — confirm this is intended
# (safer would be -name '*.wiki').
for i in `find $WORK_DIR_SRC/ -name *.wiki`; do
redo-ifchange $i
DI=`dirname $i`
# desired ("good") base name
GBN=$(basename $DI)
redo-ifchange $WORK_DIR/$GBN.wiki.chosen $WORK_DIR/$GBN.json $WORK_DIR/$GBN.geo-json
done
cat $WORK_DIR/*.{json,geo-json,wiki.chosen} | redo-stamp
;;
# Per-document inputs copied verbatim from the originals tree.
${WORK_DIR}/*.json)
copy_from_original $1 $3
;;
${WORK_DIR}/*.geo-json)
copy_from_original $1 $3
;;
${WORK_DIR}/*.wiki.chosen)
mkdir_if_missing ${WORK_DIR}
# Strip ".chosen" then ".wiki" to recover the document-pair dir name.
PBN=$(basename ${1%.*})
BN=${PBN%.*}
SRC_FILE=${WORK_DIR_SRC}/$BN/*.wiki # this only works if there is only 1
cp $SRC_FILE $3
;;
# ----------------------------------------------------------------------
# II. pre-processing
# ----------------------------------------------------------------------
# Merge the plain and geo JSON into the NLG tool's input format.
*.nlg-json)
N=${1%.*}
redo-ifchange $N.json $N.geo-json
${BIN_DIR}/create-nlg-input $N.json -o $3
;;
# Full English-side preprocessing: segment, transliterate, tokenise,
# lemmatise via RASP.
${WORK_DIR}/*.en)
N=${1%.*}
redo-ifchange $N.wiki.chosen $N.nlg-json
# 1. sentence segmentation
${BIN_DIR}/take-sentences $N.wiki.chosen -o $N.wiki.segmented
# 2. tokenisation
# -
# hills db doesn't use accents in its recorded names for hills, so we translit them out
# while we're at it we also add spaces between punctuation
iconv -f utf-8 -t 'ascii//TRANSLIT' $N.wiki.segmented > $N.wiki.segmented-ascii
${BIN_DIR}/tokenise-sentences $N.wiki.segmented-ascii -o $N.wiki.tokenised
# 3. lemmatisation
# Prefix each non-empty line with "^ " as a start-of-sentence marker
# for RASP.
sed -e 's/^\(..*\)$/^ \1/' < $N.wiki.tokenised > $N.wiki.pre-rasp
# Run RASP with its parser and sentence-splitter stubbed out by `cat`,
# so only the morphological analysis step is applied.
cat $N.wiki.pre-rasp |\
rasp_parse=cat rasp_sentence=cat ${RASP_HOME}/scripts/rasp.sh > $N.wiki.rasp
# Strip RASP affix annotations, drop the "^_^" marker, wrap each line
# in brackets, and lowercase everything.
cat $N.wiki.rasp |\
sed -e 's/+[^_]*_/_/g' -e 's/^\^_\^ //' -e 's/^/[/' -e 's/$/]/' |\
tr [A-Z] [a-z] > $3
;;
# Flatten the NLG JSON into the line-oriented .data representation.
${WORK_DIR}/*.data)
N=${1%.*}
redo-ifchange $N.nlg-json
${BIN_DIR}/flatten-data $N.nlg-json $3
;;
# Collect the .data/.en pair for every document into one directory.
${PREPROCESSED_DIR})
redo-ifchange $WORK_DIR
mkdir_if_missing $PREPROCESSED_DIR
for i in $WORK_DIR/*.json; do
N=${i%.*}
BN=${N##*/}
redo-ifchange $N.data $N.en
cp $N.data $N.en $PREPROCESSED_DIR
done
cat $PREPROCESSED_DIR/*.data $PREPROCESSED_DIR/*.en | redo-stamp
;;
# ----------------------------------------------------------------------
# III. candidate sentence selection
# IV. token filtering
# ----------------------------------------------------------------------
# Log-likelihood-ratio lexicon over the full preprocessed corpus.
${DOCUMENT_LLR_1})
redo-ifchange ${PREPROCESSED_DIR}
${BIN_DIR}/build-llr-lexicon ${PREPROCESSED_DIR} ${DOCUMENT_LLR_1}
cat ${DOCUMENT_LLR_1}/lexicon | redo-stamp
;;
${SENTENCE_SELECTED_DIR})
mkdir_if_missing $1
# NOTE(review): `redo` (not redo-ifchange) forces the sub-targets to
# rebuild unconditionally every time — confirm that is intended.
redo ${SENTENCE_SELECTED_DIR}/basic ${SENTENCE_SELECTED_DIR}/BASELINE-s1 ${SENTENCE_SELECTED_DIR}/BASELINE-s2
;;
# Baseline s1: sentence baseline with token filtering applied.
${SENTENCE_SELECTED_DIR}/BASELINE-s1)
redo-ifchange ${PREPROCESSED_DIR}
${BIN_DIR}/create-baseline -s --cutoff 1 ${PREPROCESSED_DIR} $1
${BIN_DIR}/filter-data-tokens $1 ${DOCUMENT_LLR_1}/pos-probs ${WORK_DIR}
;;
# Baseline s2: same baseline, without the token-filtering pass.
${SENTENCE_SELECTED_DIR}/BASELINE-s2)
redo-ifchange ${PREPROCESSED_DIR}
${BIN_DIR}/create-baseline -s --cutoff 1 ${PREPROCESSED_DIR} $1
;;
# The "basic" system: LLR-driven sentence filtering plus token filtering,
# with the selected .en/.data pairs copied into the stage directory.
${SENTENCE_SELECTED_DIR}/basic)
redo-ifchange ${WORK_DIR} ${PREPROCESSED_DIR}
redo-ifchange ${DOCUMENT_LLR_1}
SDIR=$1
${BIN_DIR}/filter-sentences ${PREPROCESSED_DIR} ${DOCUMENT_LLR_1}/pos-probs ${WORK_DIR}
${BIN_DIR}/filter-data-tokens ${WORK_DIR} ${DOCUMENT_LLR_1}/pos-probs ${WORK_DIR}
mkdir_if_missing $SDIR
for i in $WORK_DIR/*.en-selected; do
BN=$(basename ${i%.*})
cp ${WORK_DIR}/$BN.en-selected $SDIR/$BN.en
cp ${WORK_DIR}/$BN.data-selected $SDIR/$BN.data
done
cat $SDIR/*.en $SDIR/*.data | redo-stamp
# NOTE: as a sanity check, the sentence-selected dir should have
# the exact same structure as the preprocessed dir
;;
# Second LLR lexicon, rebuilt over only the selected sentences.
${DOCUMENT_LLR_2})
redo-ifchange ${SENTENCE_SELECTED_DIR}
${BIN_DIR}/build-llr-lexicon ${SENTENCE_SELECTED_DIR}/basic ${DOCUMENT_LLR_2}
cat ${DOCUMENT_LLR_2}/lexicon | redo-stamp
;;
# ----------------------------------------------------------------------
# V. fragment extraction
# ----------------------------------------------------------------------
${FRAGMENTS_DIR})
redo-ifchange ${SENTENCE_SELECTED_DIR}
redo-ifchange ${DOCUMENT_LLR_2}
mkdir_if_missing ${FRAGMENTS_DIR}
${BIN_DIR}/detect-fragments ${SENTENCE_SELECTED_DIR} ${DOCUMENT_LLR_2}/lexicon ${FRAGMENTS_DIR}
find ${FRAGMENTS_DIR} -name '*.en' -o -name '*.data' -exec cat {} \; | redo-stamp
;;
# Same extraction, but from the hand-made ("ideal") sentence selection.
${FRAGMENTS_DIR}.ideal)
redo-ifchange ${SENTENCE_SELECTED_DIR}.reference
redo-ifchange ${DOCUMENT_LLR_2}
mkdir_if_missing ${FRAGMENTS_DIR}.ideal
${BIN_DIR}/detect-fragments ${SENTENCE_SELECTED_DIR}.reference ${DOCUMENT_LLR_2}/lexicon ${FRAGMENTS_DIR}.ideal
find ${FRAGMENTS_DIR}.ideal -name '*.en' -o -name '*.data' -exec cat {} \; | redo-stamp
;;
# ----------------------------------------------------------------------
# Visualisation
# ----------------------------------------------------------------------
pretty/${FRAGMENTS_DIR_REFERENCE})
redo-ifchange ${FRAGMENTS_DIR_REFERENCE}
${BIN_DIR}/from-handwritten -h ${REFERENCE_DIR} $1
cp fragment.css $1
cat $1/*.html $1/*.css | redo-stamp
;;
# For the extracted-fragment dirs, the source is just $1 minus "pretty/".
pretty/${FRAGMENTS_DIR})
prettify $(basename $1) $1
;;
pretty/${FRAGMENTS_DIR}.ideal)
prettify $(basename $1) $1
;;
# ----------------------------------------------------------------------
# Scoring
# ----------------------------------------------------------------------
# Restrict each sentence-selection system to the documents that have a
# hand-extracted reference (.frag) counterpart.
${SENTENCE_SELECTED_DIR}.subset)
X_DIR=${1%.*}
redo-ifchange ${X_DIR}
mkdir_if_missing ${X_DIR}.subset
for sd in $(directories_in ${X_DIR}); do
sd2=${X_DIR}.subset/$(basename $sd)
mkdir_if_missing $sd2
for i in ${REFERENCE_DIR}/*.frag; do
BN=$(basename ${i%.*})
cp $sd/$BN.en $sd/$BN.data $sd2
done
done
cat ${X_DIR}.subset/*.{en,data} | redo-stamp
;;
# Gold-standard sentence selection, reconstructed from the hand-extracted
# fragments against the preprocessed source text.
${SENTENCE_SELECTED_DIR}.reference)
redo-ifchange ${PREPROCESSED_DIR}
${BIN_DIR}/from-handwritten ${REFERENCE_DIR} ${SENTENCE_SELECTED_DIR}.reference -c --sourcetext ${PREPROCESSED_DIR}
# We also assume the reference data subset
redo-ifchange ${FRAGMENTS_DIR_REFERENCE}
for i in ${FRAGMENTS_DIR_REFERENCE}/*.data-subset; do
BN=$(basename ${i%.*})
cp $i ${SENTENCE_SELECTED_DIR}.reference/$BN.data
done
cat ${SENTENCE_SELECTED_DIR}.reference/*.en ${SENTENCE_SELECTED_DIR}.reference/*.data | redo-stamp
;;
# Trivial baselines over both the raw and the selected corpora.
${FRAGMENTS_DIR_BASELINE})
redo-ifchange ${PREPROCESSED_DIR} ${SENTENCE_SELECTED_DIR}
TARGET=${1}
mkdir_if_missing ${TARGET}
${BIN_DIR}/create-baseline ${PREPROCESSED_DIR} ${TARGET}/BASELINE_A
${BIN_DIR}/create-baseline --cutoff 1 ${PREPROCESSED_DIR} ${TARGET}/BASELINE_f1
${BIN_DIR}/create-baseline --cutoff 2 ${PREPROCESSED_DIR} ${TARGET}/BASELINE_f2
${BIN_DIR}/create-baseline ${SENTENCE_SELECTED_DIR} ${TARGET}/ideal-BASELINE_A
${BIN_DIR}/create-baseline --cutoff 1 ${SENTENCE_SELECTED_DIR} ${TARGET}/ideal-BASELINE_f1
${BIN_DIR}/create-baseline --cutoff 2 ${SENTENCE_SELECTED_DIR} ${TARGET}/ideal-BASELINE_f2
find ${TARGET} -name '*fragments' -o -name '*.data-subset' | xargs cat | redo-stamp
;;
# Reference fragments converted from the hand-extracted originals.
${FRAGMENTS_DIR_REFERENCE})
redo-ifchange ${PREPROCESSED_DIR}
mkdir_if_missing ${FRAGMENTS_DIR_REFERENCE}
${BIN_DIR}/from-handwritten ${REFERENCE_DIR} ${FRAGMENTS_DIR_REFERENCE} --sourcetext ${PREPROCESSED_DIR}
cat ${FRAGMENTS_DIR_REFERENCE}/*.{en-fragments,data-fragments,data-subset} | redo-stamp
;;
# Per-system sentence-selection score, written to redo's temp file $3.
${SCORES_DIR}/*.sentence-selection)
redo-ifchange ${SENTENCE_SELECTED_DIR}.reference
redo-ifchange ${SENTENCE_SELECTED_DIR}.subset
${BIN_DIR}/score-sentence-selection -e .en ${SENTENCE_SELECTED_DIR}.reference ${SENTENCE_SELECTED_DIR}.subset/$(basename ${1%.*}) > $3
;;
# All peer outputs, over every document (no reference-subset filtering).
${PEERS_DIR}.full)
TARGET=${PEERS_DIR}.full
redo-ifchange ${FRAGMENTS_DIR} ${FRAGMENTS_DIR}.ideal ${FRAGMENTS_DIR_BASELINE}
mkdir_if_missing ${TARGET}
# fragment extraction
for d in $(directories_in ${FRAGMENTS_DIR}) $(directories_in ${FRAGMENTS_DIR_BASELINE}); do
DBN=$(basename $d)
mkdir ${TARGET}/$DBN
cp -R $d/*.{en,data}-fragments ${TARGET}/$DBN
done
find ${TARGET} -name '*fragments' | xargs cat | redo-stamp
;;
# Peer outputs restricted to documents with a hand-extracted reference,
# laid out one directory per system for the scorers.
${PEERS_DIR})
redo-ifchange ${FRAGMENTS_DIR} ${FRAGMENTS_DIR}.ideal ${FRAGMENTS_DIR_REFERENCE} ${FRAGMENTS_DIR_BASELINE}
FRAGLIST=$(for i in ${REFERENCE_DIR}/*.frag; do basename ${i%.*}; done)
rm -rf ${PEERS_DIR}
# reference
mkdir -p ${PEERS_DIR}/REFERENCE
for BN in $FRAGLIST; do
cp ${FRAGMENTS_DIR_REFERENCE}/$BN.* ${PEERS_DIR}/REFERENCE
done
# fragment extraction
for d in $(directories_in ${FRAGMENTS_DIR}) $(directories_in ${FRAGMENTS_DIR_BASELINE}); do
DBN=$(basename $d)
mkdir ${PEERS_DIR}/$DBN
for BN in $FRAGLIST; do
cp $d/$BN.* ${PEERS_DIR}/$DBN
done
done
# fragment extraction from ideal sentence selection
for d in $(directories_in ${FRAGMENTS_DIR}.ideal); do
DBN=ideal-$(basename $d)
mkdir ${PEERS_DIR}/$DBN
for BN in $FRAGLIST; do
cp $d/$BN.* ${PEERS_DIR}/$DBN
done
done
find ${PEERS_DIR} -name '*fragments' -o -name '*subset' | xargs cat | redo-stamp
;;
# Run all scorers (sentence selection, Dice, BLEU) and summarise.
${SCORES_DIR})
mkdir_if_missing ${SCORES_DIR}
redo-ifchange ${PEERS_DIR}
mkdir_if_missing ${SCORES_DIR}/sentence-selection
sds=$(for sd in $(directories_in ${SENTENCE_SELECTED_DIR});\
do echo ${SCORES_DIR}/$(basename $sd).sentence-selection; done)
redo-ifchange $sds
${BIN_DIR}/dice-scores ${PEERS_DIR}/REFERENCE ${PEERS_DIR} ${SCORES_DIR}
chmod u+x ${CALCULATORS_DIR}/{bleu-scores,mteval-v*}.pl
perl -I ${CALCULATORS_DIR} ${CALCULATORS_DIR}/bleu-scores.pl ${PEERS_DIR}/REFERENCE ${PEERS_DIR} ${SCORES_DIR}
${BIN_DIR}/summarise-scores ${SCORES_DIR} > ${SCORES_DIR}/summary
# NOTE(review): this stamps *.sent-selection, but the score targets above
# are named *.sentence-selection — confirm which suffix is correct.
cat ${SCORES_DIR}/{*.sent-selection,*.dice,*.bleu*} | redo-stamp
;;
# ----------------------------------------------------------------------
# etc
# ----------------------------------------------------------------------
*) echo "no rule to build '$1'" >&2; exit 1 ;;
esac
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment