Holzhaus · October 10, 2014 18:35
diff --git a/julius.py b/julius.py
 # You need to download the VoxForge Dict from:
 # http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Lexicon/VoxForge.tgz
 # and put it to this location:
 # /home/jan/Downloads/VoxForge.tgz

 import os
 import re
 import tempfile
 import subprocess
 import shutil
 import tarfile
 from contextlib import contextmanager
 from vocabcompiler import AbstractVocabulary, get_all_phrases
 from stt import AbstractSTTEngine


 class JuliusSTT(AbstractSTTEngine):
    """
    Speech-to-text engine that runs the `julius` command line decoder.

    The decoder is started once per call to `transcribe`, using the dfa and
    dict files of the given vocabulary and the configured acoustic model.
    """

    SLUG = 'julius-stt'

    def __init__(self, vocabulary=None,
                 hmmdefs=("/usr/share/voxforge/julius/"
                          "acoustic_model_files/hmmdefs"),
                 tiedlist=("/usr/share/voxforge/julius/"
                           "acoustic_model_files/tiedlist")):
        """
        Arguments:
            vocabulary -- object providing `dfa_file` and `dict_file`
                          attributes (both are read by `transcribe`)
            hmmdefs -- path passed to julius via `-h`
            tiedlist -- path passed to julius via `-hlist`
        """
        self._vocabulary = vocabulary
        self._hmmdefs = hmmdefs
        self._tiedlist = tiedlist
        # The recognized words are printed between two '<s>' markers on the
        # 'sentence1:' line of the julius output.
        self._pattern = re.compile(r'sentence1: <s> (.+) <s>')

    def transcribe(self, fp, mode=None):
        """
        Runs julius with `fp` connected to its stdin and extracts the
        recognized sentence from its stdout.

        Arguments:
            fp -- file object that is handed to the julius process as stdin
            mode -- accepted for interface compatibility, not used here

        Returns:
            The text captured from the 'sentence1:' line as string, or an
            empty string if the output contains no such line
        """
        cmd = [str(arg) for arg in (
            'julius',
            '-quiet',
            '-nolog',
            '-input', 'stdin',
            '-dfa', self._vocabulary.dfa_file,
            '-v', self._vocabulary.dict_file,
            '-h', self._hmmdefs,
            '-hlist', self._tiedlist,
            '-forcedict')]
        with tempfile.SpooledTemporaryFile() as out_f:
            # stderr is captured only to keep it off the console; its
            # content is discarded.
            with tempfile.SpooledTemporaryFile() as err_f:
                subprocess.call(cmd, stdin=fp, stdout=out_f, stderr=err_f)
            out_f.seek(0)
            output = out_f.read()
        # The temporary file is opened in binary mode. Where that yields a
        # bytes object that is not `str`, decode it so that the text pattern
        # can be applied to it.
        if not isinstance(output, str):
            output = output.decode('utf-8', 'replace')
        matchobj = self._pattern.search(output)
        return matchobj.group(1) if matchobj else ""


 class JuliusVocabulary(AbstractVocabulary):
    """
    Vocabulary that is compiled into the dfa/dict file pair read by julius.
    """

    PATH_PREFIX = 'julius-vocabulary'

    # Lexicon that is handed to JuliusG2P to look up the phonemes of each
    # word. Override this attribute to use a lexicon stored somewhere else.
    LEXICON_FILE = '/home/jan/Downloads/VoxForge.tgz'

    @property
    def dfa_file(self):
        """
        Returns:
            The path of the julius dfa file as string
        """
        return os.path.join(self.path, 'dfa')

    @property
    def dict_file(self):
        """
        Returns:
            The path of the julius dict file as string
        """
        return os.path.join(self.path, 'dict')

    @property
    def is_compiled(self):
        """
        Returns:
            A true value only if the parent class reports the vocabulary as
            compiled and both the dfa and the dict file are readable
        """
        # The class is named explicitly: super(self.__class__, self) would
        # recurse endlessly as soon as this class gets subclassed.
        return (super(JuliusVocabulary, self).is_compiled and
                os.access(self.dfa_file, os.R_OK) and
                os.access(self.dict_file, os.R_OK))

    def _get_grammar(self, phrases):
        """
        Returns:
            A dict that maps each grammar symbol to a list of alternatives,
            each alternative being a list of symbols. `phrases` is not
            used, the grammar is always a loop over single words.
        """
        return {'S': [['NS_B', 'WORD_LOOP', 'NS_E']],
                'WORD_LOOP': [['WORD_LOOP', 'WORD'], ['WORD']]}

    def _get_word_defs(self, phrases):
        """
        Splits the phrases into words and looks up their phonemes.

        Arguments:
            phrases -- iterable of strings, words separated by a space

        Returns:
            A dict that maps each word category to a list of
            (word, phoneme) tuples. Words for which the lexicon has no
            entry do not appear in the result.
        """
        word_defs = {'NS_B': [('<s>', 'sil')],
                     'NS_E': [('<s>', 'sil')],
                     'WORD': []}

        # Collect every word once, in order of first appearance, so that a
        # word shared by several phrases is not defined multiple times.
        words = []
        seen = set()
        for phrase in phrases:
            for word in phrase.split(' '):
                if word not in seen:
                    seen.add(word)
                    words.append(word)

        g2p = JuliusG2P(self.LEXICON_FILE)
        for word in words:
            for phoneme in g2p.translate(word):
                word_defs['WORD'].append((word, phoneme))
        return word_defs

    def _compile_vocabulary(self, phrases):
        """
        Writes a grammar and a voca file for the phrases, runs `mkdfa.pl`
        on them and moves the resulting files to `dfa_file` and `dict_file`.

        Arguments:
            phrases -- iterable of strings, passed on to `_get_grammar` and
                       `_get_word_defs`
        """
        prefix = 'jasper'
        tmpdir = tempfile.mkdtemp()
        try:
            # Create grammar file
            tmp_grammar_file = os.path.join(
                tmpdir, os.extsep.join([prefix, 'grammar']))
            grammar = self._get_grammar(phrases)
            with open(tmp_grammar_file, 'w') as f:
                # The rules for 'S' are written before all other rules.
                for definition in grammar.pop('S'):
                    f.write("%s: %s\n" % ('S', ' '.join(definition)))
                for name, definitions in grammar.items():
                    for definition in definitions:
                        f.write("%s: %s\n" % (name, ' '.join(definition)))

            # Create voca file
            tmp_voca_file = os.path.join(
                tmpdir, os.extsep.join([prefix, 'voca']))
            with open(tmp_voca_file, 'w') as f:
                for category, words in self._get_word_defs(phrases).items():
                    f.write("%% %s\n" % category)
                    for word, phoneme in words:
                        f.write("%s\t\t\t%s\n" % (word, phoneme))

            # mkdfa.pl
            # The tool is run inside tmpdir via `cwd` instead of os.chdir,
            # so the working directory of this process is never changed and
            # cannot be left pointing at tmpdir if the call fails.
            cmd = ['mkdfa.pl', str(prefix)]
            with tempfile.SpooledTemporaryFile() as out_f:
                with tempfile.SpooledTemporaryFile() as err_f:
                    subprocess.call(cmd, cwd=tmpdir, stdout=out_f,
                                    stderr=err_f)
                    err_f.seek(0)
                    self._logger.debug(err_f.read().strip())
                out_f.seek(0)
                self._logger.debug(out_f.read().strip())

            tmp_dfa_file = os.path.join(
                tmpdir, os.extsep.join([prefix, 'dfa']))
            tmp_dict_file = os.path.join(
                tmpdir, os.extsep.join([prefix, 'dict']))
            shutil.move(tmp_dfa_file, self.dfa_file)
            shutil.move(tmp_dict_file, self.dict_file)
        finally:
            # Remove the temporary directory even if a step above failed.
            shutil.rmtree(tmpdir)


 class JuliusG2P(object):
    """
    Looks up the phonemes of words in a lexicon file.

    Only lines that contain a bracketed word are used. The text inside the
    first pair of square brackets is the word and the text after the
    closing bracket is its phoneme string.
    """

    def __init__(self, lexicon_file):
        """
        Arguments:
            lexicon_file -- path of the lexicon, either a plain text file
                            or a tar archive that contains the member
                            'VoxForge/VoxForgeDict'
        """
        self._lexicon_file = lexicon_file
        self._lexicon_data = self._parse_lexicon(self._lexicon_file)

    @contextmanager
    def _open_lexicon(self, fname):
        """
        Context manager that yields a file object for the lexicon `fname`.

        If `fname` is a tar archive, the member 'VoxForge/VoxForgeDict' is
        yielded, otherwise the file itself is opened for reading.
        """
        if tarfile.is_tarfile(fname):
            with tarfile.open(fname) as tf:
                f = tf.extractfile(tf.getmember('VoxForge/VoxForgeDict'))
                try:
                    yield f
                finally:
                    # Close the member even if the consumer raised.
                    f.close()
        else:
            # Open the file that was asked for, not the one given to the
            # constructor.
            with open(fname, 'r') as f:
                yield f

    def _parse_lexicon(self, fname):
        """
        Arguments:
            fname -- path of the lexicon file

        Returns:
            A dict that maps each word to the list of its phoneme strings,
            in the order in which they appear in the file
        """
        data = {}
        with self._open_lexicon(fname) as f:
            for line in f:
                # Archive members may yield bytes instead of str.
                if not isinstance(line, str):
                    line = line.decode('utf-8', 'replace')
                _, opening, remainder = line.partition('[')
                word, closing, phoneme = remainder.partition(']')
                if not opening or not closing:
                    # No '[word]' part on this line, nothing to extract.
                    continue
                phoneme = phoneme.strip()
                phoneme = phoneme.replace('+', ' ').replace('-', ' ')
                data.setdefault(word.strip(), []).append(phoneme)
        return data

    def translate(self, word):
        """
        Arguments:
            word -- the word to look up

        Returns:
            A list of phoneme strings for the word, empty if the lexicon
            has no matching entry. If the word itself is not listed, it is
            looked up again with its hyphens removed and then with its
            hyphens replaced by spaces.
        """
        candidates = (word, word.replace('-', ''), word.replace('-', ' '))
        for candidate in candidates:
            if candidate in self._lexicon_data:
                # Hand out a copy so that callers cannot alter the lexicon.
                return list(self._lexicon_data[candidate])
        return []

 if __name__ == '__main__':
    # Manual smoke test: compile a vocabulary for all phrases in a fresh
    # temporary directory and transcribe the bundled 'time.wav' sample.
    import jasperpath

    def _print_status(vocabulary, expected_phrases):
        # Print the current state of the vocabulary.
        print("Vocabulary in:     %s" % vocabulary.path)
        print("Revision file:     %s" % vocabulary.revision_file)
        print("Compiled revision: %s" % vocabulary.compiled_revision)
        print("Is compiled:       %r" % vocabulary.is_compiled)
        print("Matches phrases:   %r"
              % vocabulary.matches_phrases(expected_phrases))

    phrases = get_all_phrases()
    vocab = JuliusVocabulary(path=tempfile.mkdtemp())
    _print_status(vocab, phrases)
    if not vocab.is_compiled or not vocab.matches_phrases(phrases):
        print("Compiling...")
        vocab.compile(phrases)
        print("")
        _print_status(vocab, phrases)
        print("")
        sttinst = JuliusSTT(vocabulary=vocab)
        with open(jasperpath.data('audio', 'time.wav'), mode="rb") as f:
            # Call form of print, like every other print in this block;
            # with a single argument it behaves the same as the statement.
            print(sttinst.transcribe(f))
	# You need to download the VoxForge Dict from:
	# http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Lexicon/VoxForge.tgz
	# and put it to this location:
	# /home/jan/Downloads/VoxForge.tgz

	import os
	import re
	import tempfile
	import subprocess
	import shutil
	import tarfile
	from contextlib import contextmanager
	from vocabcompiler import AbstractVocabulary, get_all_phrases
	from stt import AbstractSTTEngine


	class JuliusSTT(AbstractSTTEngine):
	    """
	    Speech-to-text engine that runs the `julius` command line decoder.

	    The decoder is started once per call to `transcribe`, using the dfa and
	    dict files of the given vocabulary and the configured acoustic model.
	    """

	    SLUG = 'julius-stt'

	    def __init__(self, vocabulary=None,
	                 hmmdefs=("/usr/share/voxforge/julius/"
	                          "acoustic_model_files/hmmdefs"),
	                 tiedlist=("/usr/share/voxforge/julius/"
	                           "acoustic_model_files/tiedlist")):
	        """
	        Arguments:
	            vocabulary -- object providing `dfa_file` and `dict_file`
	                          attributes (both are read by `transcribe`)
	            hmmdefs -- path passed to julius via `-h`
	            tiedlist -- path passed to julius via `-hlist`
	        """
	        self._vocabulary = vocabulary
	        self._hmmdefs = hmmdefs
	        self._tiedlist = tiedlist
	        # The recognized words are printed between two '<s>' markers on the
	        # 'sentence1:' line of the julius output.
	        self._pattern = re.compile(r'sentence1: <s> (.+) <s>')

	    def transcribe(self, fp, mode=None):
	        """
	        Runs julius with `fp` connected to its stdin and extracts the
	        recognized sentence from its stdout.

	        Arguments:
	            fp -- file object that is handed to the julius process as stdin
	            mode -- accepted for interface compatibility, not used here

	        Returns:
	            The text captured from the 'sentence1:' line as string, or an
	            empty string if the output contains no such line
	        """
	        cmd = [str(arg) for arg in (
	            'julius',
	            '-quiet',
	            '-nolog',
	            '-input', 'stdin',
	            '-dfa', self._vocabulary.dfa_file,
	            '-v', self._vocabulary.dict_file,
	            '-h', self._hmmdefs,
	            '-hlist', self._tiedlist,
	            '-forcedict')]
	        with tempfile.SpooledTemporaryFile() as out_f:
	            # stderr is captured only to keep it off the console; its
	            # content is discarded.
	            with tempfile.SpooledTemporaryFile() as err_f:
	                subprocess.call(cmd, stdin=fp, stdout=out_f, stderr=err_f)
	            out_f.seek(0)
	            output = out_f.read()
	        # The temporary file is opened in binary mode. Where that yields a
	        # bytes object that is not `str`, decode it so that the text pattern
	        # can be applied to it.
	        if not isinstance(output, str):
	            output = output.decode('utf-8', 'replace')
	        matchobj = self._pattern.search(output)
	        return matchobj.group(1) if matchobj else ""


	class JuliusVocabulary(AbstractVocabulary):
	    """
	    Vocabulary that is compiled into the dfa/dict file pair read by julius.
	    """

	    PATH_PREFIX = 'julius-vocabulary'

	    # Lexicon that is handed to JuliusG2P to look up the phonemes of each
	    # word. Override this attribute to use a lexicon stored somewhere else.
	    LEXICON_FILE = '/home/jan/Downloads/VoxForge.tgz'

	    @property
	    def dfa_file(self):
	        """
	        Returns:
	            The path of the julius dfa file as string
	        """
	        return os.path.join(self.path, 'dfa')

	    @property
	    def dict_file(self):
	        """
	        Returns:
	            The path of the julius dict file as string
	        """
	        return os.path.join(self.path, 'dict')

	    @property
	    def is_compiled(self):
	        """
	        Returns:
	            A true value only if the parent class reports the vocabulary as
	            compiled and both the dfa and the dict file are readable
	        """
	        # The class is named explicitly: super(self.__class__, self) would
	        # recurse endlessly as soon as this class gets subclassed.
	        return (super(JuliusVocabulary, self).is_compiled and
	                os.access(self.dfa_file, os.R_OK) and
	                os.access(self.dict_file, os.R_OK))

	    def _get_grammar(self, phrases):
	        """
	        Returns:
	            A dict that maps each grammar symbol to a list of alternatives,
	            each alternative being a list of symbols. `phrases` is not
	            used, the grammar is always a loop over single words.
	        """
	        return {'S': [['NS_B', 'WORD_LOOP', 'NS_E']],
	                'WORD_LOOP': [['WORD_LOOP', 'WORD'], ['WORD']]}

	    def _get_word_defs(self, phrases):
	        """
	        Splits the phrases into words and looks up their phonemes.

	        Arguments:
	            phrases -- iterable of strings, words separated by a space

	        Returns:
	            A dict that maps each word category to a list of
	            (word, phoneme) tuples. Words for which the lexicon has no
	            entry do not appear in the result.
	        """
	        word_defs = {'NS_B': [('<s>', 'sil')],
	                     'NS_E': [('<s>', 'sil')],
	                     'WORD': []}

	        # Collect every word once, in order of first appearance, so that a
	        # word shared by several phrases is not defined multiple times.
	        words = []
	        seen = set()
	        for phrase in phrases:
	            for word in phrase.split(' '):
	                if word not in seen:
	                    seen.add(word)
	                    words.append(word)

	        g2p = JuliusG2P(self.LEXICON_FILE)
	        for word in words:
	            for phoneme in g2p.translate(word):
	                word_defs['WORD'].append((word, phoneme))
	        return word_defs

	    def _compile_vocabulary(self, phrases):
	        """
	        Writes a grammar and a voca file for the phrases, runs `mkdfa.pl`
	        on them and moves the resulting files to `dfa_file` and `dict_file`.

	        Arguments:
	            phrases -- iterable of strings, passed on to `_get_grammar` and
	                       `_get_word_defs`
	        """
	        prefix = 'jasper'
	        tmpdir = tempfile.mkdtemp()
	        try:
	            # Create grammar file
	            tmp_grammar_file = os.path.join(
	                tmpdir, os.extsep.join([prefix, 'grammar']))
	            grammar = self._get_grammar(phrases)
	            with open(tmp_grammar_file, 'w') as f:
	                # The rules for 'S' are written before all other rules.
	                for definition in grammar.pop('S'):
	                    f.write("%s: %s\n" % ('S', ' '.join(definition)))
	                for name, definitions in grammar.items():
	                    for definition in definitions:
	                        f.write("%s: %s\n" % (name, ' '.join(definition)))

	            # Create voca file
	            tmp_voca_file = os.path.join(
	                tmpdir, os.extsep.join([prefix, 'voca']))
	            with open(tmp_voca_file, 'w') as f:
	                for category, words in self._get_word_defs(phrases).items():
	                    f.write("%% %s\n" % category)
	                    for word, phoneme in words:
	                        f.write("%s\t\t\t%s\n" % (word, phoneme))

	            # mkdfa.pl
	            # The tool is run inside tmpdir via `cwd` instead of os.chdir,
	            # so the working directory of this process is never changed and
	            # cannot be left pointing at tmpdir if the call fails.
	            cmd = ['mkdfa.pl', str(prefix)]
	            with tempfile.SpooledTemporaryFile() as out_f:
	                with tempfile.SpooledTemporaryFile() as err_f:
	                    subprocess.call(cmd, cwd=tmpdir, stdout=out_f,
	                                    stderr=err_f)
	                    err_f.seek(0)
	                    self._logger.debug(err_f.read().strip())
	                out_f.seek(0)
	                self._logger.debug(out_f.read().strip())

	            tmp_dfa_file = os.path.join(
	                tmpdir, os.extsep.join([prefix, 'dfa']))
	            tmp_dict_file = os.path.join(
	                tmpdir, os.extsep.join([prefix, 'dict']))
	            shutil.move(tmp_dfa_file, self.dfa_file)
	            shutil.move(tmp_dict_file, self.dict_file)
	        finally:
	            # Remove the temporary directory even if a step above failed.
	            shutil.rmtree(tmpdir)


	class JuliusG2P(object):
	    """
	    Looks up the phonemes of words in a lexicon file.

	    Only lines that contain a bracketed word are used. The text inside the
	    first pair of square brackets is the word and the text after the
	    closing bracket is its phoneme string.
	    """

	    def __init__(self, lexicon_file):
	        """
	        Arguments:
	            lexicon_file -- path of the lexicon, either a plain text file
	                            or a tar archive that contains the member
	                            'VoxForge/VoxForgeDict'
	        """
	        self._lexicon_file = lexicon_file
	        self._lexicon_data = self._parse_lexicon(self._lexicon_file)

	    @contextmanager
	    def _open_lexicon(self, fname):
	        """
	        Context manager that yields a file object for the lexicon `fname`.

	        If `fname` is a tar archive, the member 'VoxForge/VoxForgeDict' is
	        yielded, otherwise the file itself is opened for reading.
	        """
	        if tarfile.is_tarfile(fname):
	            with tarfile.open(fname) as tf:
	                f = tf.extractfile(tf.getmember('VoxForge/VoxForgeDict'))
	                try:
	                    yield f
	                finally:
	                    # Close the member even if the consumer raised.
	                    f.close()
	        else:
	            # Open the file that was asked for, not the one given to the
	            # constructor.
	            with open(fname, 'r') as f:
	                yield f

	    def _parse_lexicon(self, fname):
	        """
	        Arguments:
	            fname -- path of the lexicon file

	        Returns:
	            A dict that maps each word to the list of its phoneme strings,
	            in the order in which they appear in the file
	        """
	        data = {}
	        with self._open_lexicon(fname) as f:
	            for line in f:
	                # Archive members may yield bytes instead of str.
	                if not isinstance(line, str):
	                    line = line.decode('utf-8', 'replace')
	                _, opening, remainder = line.partition('[')
	                word, closing, phoneme = remainder.partition(']')
	                if not opening or not closing:
	                    # No '[word]' part on this line, nothing to extract.
	                    continue
	                phoneme = phoneme.strip()
	                phoneme = phoneme.replace('+', ' ').replace('-', ' ')
	                data.setdefault(word.strip(), []).append(phoneme)
	        return data

	    def translate(self, word):
	        """
	        Arguments:
	            word -- the word to look up

	        Returns:
	            A list of phoneme strings for the word, empty if the lexicon
	            has no matching entry. If the word itself is not listed, it is
	            looked up again with its hyphens removed and then with its
	            hyphens replaced by spaces.
	        """
	        candidates = (word, word.replace('-', ''), word.replace('-', ' '))
	        for candidate in candidates:
	            if candidate in self._lexicon_data:
	                # Hand out a copy so that callers cannot alter the lexicon.
	                return list(self._lexicon_data[candidate])
	        return []

	if __name__ == '__main__':
	    # Manual smoke test: compile a vocabulary for all phrases in a fresh
	    # temporary directory and transcribe the bundled 'time.wav' sample.
	    import jasperpath

	    def _print_status(vocabulary, expected_phrases):
	        # Print the current state of the vocabulary.
	        print("Vocabulary in: %s" % vocabulary.path)
	        print("Revision file: %s" % vocabulary.revision_file)
	        print("Compiled revision: %s" % vocabulary.compiled_revision)
	        print("Is compiled: %r" % vocabulary.is_compiled)
	        print("Matches phrases: %r"
	              % vocabulary.matches_phrases(expected_phrases))

	    phrases = get_all_phrases()
	    vocab = JuliusVocabulary(path=tempfile.mkdtemp())
	    _print_status(vocab, phrases)
	    if not vocab.is_compiled or not vocab.matches_phrases(phrases):
	        print("Compiling...")
	        vocab.compile(phrases)
	        print("")
	        _print_status(vocab, phrases)
	        print("")
	        sttinst = JuliusSTT(vocabulary=vocab)
	        with open(jasperpath.data('audio', 'time.wav'), mode="rb") as f:
	            # Call form of print, like every other print in this block;
	            # with a single argument it behaves the same as the statement.
	            print(sttinst.transcribe(f))