Created
April 9, 2014 07:16
-
-
Save AdolfVonKleist/10234978 to your computer and use it in GitHub Desktop.
phonetisaurus script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Train a Phonetisaurus G2P model from a lexicon and run two quick test decodes.
#
# Usage: <this script> <lexicon> <order>
#   <lexicon>  whitespace-separated lexicon: first token is the word, the
#              remaining tokens are joined with single spaces as the
#              pronunciation
#   <order>    n-gram order handed to estimate-ngram (-o)
#
# Files written:
#   <lexicon>.triaged.lex            tab-separated lexicon with '_' -> ','
#   <lexicon>.triaged.corpus         output of phonetisaurus-align
#   <lexicon>.triaged.<order>g.arpa  ARPA LM written by estimate-ngram
#   <lexicon>.triaged.<order>g.fst   output of phonetisaurus-arpa2wfst-omega
#   short.wlist                      two-word test list (current directory)
#
# NOTE(review): 'set -e' is intentionally not enabled; the exit codes of the
# external phonetisaurus/estimate-ngram tools were not verified -- confirm
# before making tool failures fatal.

if [[ $# -ne 2 ]]; then
  # Usage problems are errors: report on stderr and exit non-zero.
  echo "USAGE: $0 <lexicon> <order>" >&2
  echo " Recommended: order=6~9 for English" >&2
  exit 1
fi

lexicon=$1
order=$2
prefix="${lexicon}.triaged"
model="${prefix}.${order}g"

if [[ ! -r "${lexicon}" ]]; then
  echo "$0: cannot read lexicon: ${lexicon}" >&2
  exit 1
fi

# Undo the triage substitution on decoder output: in the third tab-separated
# field of every stdin line, turn ',' back into '_'.
# (presumably that field is the phoneme sequence -- verify against the
# decoder's output format)
restore_phonemes() {
  perl -e'while(<>){
    chomp;
    @_ = split(/\t/);
    $_[2] =~ s/,/_/g;
    print join("\t",@_)."\n";
  }'
}

#Triage the lexicon a bit to overcome my poor design
# Rewrites each line as "<word>\t<phonemes>" and replaces every '_' in the
# phoneme string with ','.
perl -e'while(<>){
  chomp;
  @_ = split(/\s+/);
  $w = shift(@_);
  $p = join(" ",@_);
  $p =~ s/_/,/g;
  print $w."\t".$p."\n";
}' \
  < "${lexicon}" \
  > "${prefix}.lex"

#Run the aligner with some reasonable params (for English)
phonetisaurus-align --input="${prefix}.lex" --ofile="${prefix}.corpus" --seq1_del=false
echo ""

#Train an n-gram model
# Note: You can use *any* tool that outputs a valid ARPA-format LM
# Recommended: Interpolate a Kneser-Ney model with a MaxEnt model (latest SRILM),
# rescore with an RNNLM (if you want to get fancy)
# If possible results can be further improved by pruning pronunciations with
# a further forced alignment step with your recognizer
estimate-ngram -o "${order}" -t "${prefix}.corpus" -s FixModKN -wl "${model}.arpa"
echo ""

#Convert the model
phonetisaurus-arpa2wfst-omega --lm="${model}.arpa" --ofile="${model}.fst"
echo ""

#Test an input word, and fix the phoneme we triaged
phonetisaurus-g2p-omega --model="${model}.fst" --input=A-bomb \
  | restore_phonemes
echo ""

#Test a list of input words, get the 5-best, and fix the phoneme we triaged
printf '%s\n' A-frame A-line > short.wlist
phonetisaurus-g2p-omega --model="${model}.fst" --input=short.wlist --isfile=true \
  --nbest=5 --decoder_type=fst_phi \
  | restore_phonemes
echo ""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Try running this with something like the following, using just a couple thousand lines of the lexicon at first:
$ ./little-test.sh sampa.spaced.lex 3
If it works, turn up the order to 7 or so.
Assuming the '_' is the only character-related issue, I think this should suffice.
Also note that you only need to run the aligner step one time.
You can use the corpus after that directly, and just play around with LM training/tuning, which is typically really fast.