Created
June 11, 2016 14:59
-
-
Save RicherMans/8df9f02064905e158051fa96c600d20e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# | |
import argparse | |
import errno | |
import itertools | |
import logging | |
import multiprocessing as mp | |
import os | |
import re | |
import shutil | |
import subprocess | |
import HTKTools | |
import vad | |
# HTKTOOLSPATH = '/slfs1/users/yl710/htk/HTKTools/'
# ##
# Attention:
# All static variables which end with _DIR will be created by this script.
# If you want to implement your personal dirs, etc. keep in mind that everything
# ending with _DIR will be overwritten!
# ##
# The mask used to determine which features are grouped when performing
# normalization; this mask selects one normalization per utterance.
MASK = '%%%%%%%%%*'
# GridEngine submission binary used by runbatch()
QSUB = 'qsub'
# Root dir is the directory containing this script; if the script is called
# from outside, the calling folder is kept clean by not writing the output
# folders and files there.
ROOT_DIR = os.path.join(os.path.dirname(__file__), '.')
CURDIR = os.getcwd()
VAD_PATH = os.path.join(ROOT_DIR, 'vad')
CONFIG_DIR = os.path.join(CURDIR, 'cfgs')
# Commented-out HTK binary paths kept for reference (now provided by HTKTools)
# HCOMPV = os.path.join(HTKTOOLSPATH, 'HCompV')
# HCOPY = os.path.join(HTKTOOLSPATH, 'HCopy')
# HEREST = os.path.join(HTKTOOLSPATH, 'HERest')
# HHED = os.path.join(HTKTOOLSPATH, 'HHEd')
# HPARSE = os.path.join(HTKTOOLSPATH, 'HParse')
# HRESULTS = os.path.join(HTKTOOLSPATH, 'HResults')
# HVITE = os.path.join(HTKTOOLSPATH, 'HVite')
# VAD = os.path.join(VAD_PATH, 'vad')
# VAD_GMM_CFG = os.path.join(VAD_PATH, 'gmm.cfg')
# VAD_GMM_MMF = os.path.join(VAD_PATH, 'MMF')
# Output directories for the different feature stages
FEATURES_DIR = os.path.join(CURDIR, 'features')
STATIC_DIR = os.path.join(FEATURES_DIR, 'static')     # static features (HCopy)
DYNAMIC_DIR = os.path.join(FEATURES_DIR, 'concat')    # VAD-concatenated speech
CMVN_DIR = os.path.join(FEATURES_DIR, 'cmvn')         # normalized features
LOG_DIR = os.path.join(CURDIR, 'log')
TMP_DIR = os.path.join(CURDIR, 'tmp')
CMEANDIR = os.path.join(TMP_DIR, 'cmn')               # cepstral mean estimates
VARSCALEDIR = os.path.join(TMP_DIR, 'cvn')            # variance estimates
FLIST_DIR = os.path.join(CONFIG_DIR, 'flists')
EDFILES_DIR = os.path.join(CONFIG_DIR, 'edfiles')
# Header that every HTK master label file must start with
MLF_HEADER = '#!MLF!#'
# Number of parallel jobs / scp splits used by the local runner
NUMBER_JOBS = 4
def runbatch(argsbatch, cwd=os.getcwd(), logfiles=None):
    '''
    Runs a batch of commands in parallel on the GridEngine via qsub.

    @param argsbatch: list of argument lists; each inner list is one command.
    @param cwd: working directory for the spawned qsub processes.
    @param logfiles: optional list of per-job log paths, same length as
                     argsbatch; None discards the job output.
    Returns: None; blocks until all submitted jobs have finished.
    '''
    import tempfile
    # Bug fix: the original zipped argsbatch against None when no logfiles
    # were given, which raised a TypeError. Use a matching list of Nones.
    if logfiles is None:
        logfiles = [None] * len(argsbatch)
    assert len(logfiles) == len(argsbatch)
    processes = []
    scriptfiles = []
    for args, logfile in zip(argsbatch, logfiles):
        qsubcmd = [QSUB]
        qsubcmd.extend('-P cpu.p'.split())
        qsubcmd.append('-cwd')
        qsubcmd.extend('-N extractfeats'.split())
        # merge stderr into stdout
        qsubcmd.extend('-j y'.split())
        # We want to run the command with bash
        qsubcmd.extend('-S /bin/bash'.split())
        # -sync y makes qsub block until the job finishes, so the wait()
        # below waits for the cluster job itself, not just the submission
        qsubcmd.extend('-sync y'.split())
        if logfile:
            qsubcmd.extend('-o {}'.format(logfile).split())
        # mode='w' so the joined command string (text) can be written on
        # both Python 2 and Python 3 (the default binary mode needs bytes)
        scriptfile = tempfile.NamedTemporaryFile(mode='w')
        scriptfile.write(" ".join(args))
        scriptfile.flush()
        qsubcmd.append(scriptfile.name)
        with open(os.devnull, 'w') as FNULL:
            processes.append(
                subprocess.Popen(qsubcmd, cwd=cwd, stdout=FNULL,
                                 stderr=subprocess.STDOUT))
        scriptfiles.append(scriptfile)
    # Close the temp scripts only after the jobs finished, since closing a
    # NamedTemporaryFile deletes it from disk.
    for process, scriptfile in zip(processes, scriptfiles):
        process.wait()
        scriptfile.close()
def runlocal(argsbatch, cwd=os.getcwd(), logfile=None):
    '''
    Runs the given command batches in parallel on the local machine using a
    multiprocessing pool (pool size defaults to the CPU count).

    @param argsbatch: list of argument lists, one per command.
    @param cwd: working directory for the commands.
    @param logfile: list of log paths handed through to execute().
    '''
    pool = mp.Pool()
    try:
        for arg in argsbatch:
            logging.debug(" ".join(arg))
        pool.map_async(universal_worker, pool_args(
            execute, argsbatch, cwd, logfile)).wait(9999)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        # Bug fix: terminate the workers on interrupt and use a bare raise
        # so the original traceback is preserved (re-raising a fresh
        # KeyboardInterrupt discarded it).
        pool.terminate()
        raise
def execute(args, cwd, log):
    '''
    Runs a single command, optionally redirecting stdout and stderr into a
    log file.

    @param args: argument list for subprocess.call.
    @param cwd: working directory for the command (None = inherit).
    @param log: path of the log file, or a falsy value for no redirection.
    '''
    try:
        if log:
            with open(log, 'w') as logp:
                subprocess.call(args, stdout=logp, stderr=logp, cwd=cwd)
        else:
            subprocess.call(args, cwd=cwd)
    except KeyboardInterrupt:
        # Bug fix: bare raise keeps the original traceback intact
        # (the original raised a brand-new KeyboardInterrupt).
        raise
def mkdir_p(path):
    '''
    Creates *path* like ``mkdir -p``: intermediate directories are created
    as needed and an already existing directory is not an error.
    '''
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only swallow "already exists" for a real directory; anything else
        # (permission denied, a plain file in the way, ...) is re-raised.
        if not (exc.errno == errno.EEXIST and os.path.isdir(path)):
            raise
def create_dirs():
    '''
    Creates every directory referenced by a module-level constant whose
    name contains "DIR" (see the "Attention" note at the top of the file).

    Bug fix: uses an explicit loop instead of map(); map() is lazy on
    Python 3, so the original map(mkdir_p, dirstomake) never actually
    created any directory there.
    '''
    dirstomake = {v for (k, v) in globals().items() if re.search('.+DIR', k)}
    for directory in dirstomake:
        mkdir_p(directory)
def cleanup():
    '''
    Removes the intermediate directories produced during extraction
    (configs, temp files, static and concatenated features).

    Robustness fix: ignore_errors=True so a partially created or already
    removed tree does not abort the best-effort final cleanup.
    '''
    for tree in (CONFIG_DIR, TMP_DIR, STATIC_DIR, DYNAMIC_DIR):
        shutil.rmtree(tree, ignore_errors=True)
def readDir(input_dir):
    '''
    Recursively walks *input_dir* and returns the absolute paths of all
    files found below it.
    Be careful since there is no check that a file is of a specific type!
    '''
    collected = []
    for dirpath, _, filenames in os.walk(input_dir):
        for name in filenames:
            fullpath = os.path.join(dirpath, name)
            if os.path.isfile(fullpath):
                collected.append(os.path.abspath(fullpath))
    return collected
def generate_HCopy_script(files, output_features, output_script, featuretype):
    """
    Writes an HCopy script that maps each audio file to its feature file.

    files: list of all wav files
    output_features: output directory for the feature files
    output_script: path of the HCopy script to write
    featuretype: extension appended to every feature file
    Returns the list of feature file paths, in input order.
    """
    featurefiles = []
    with open(output_script, mode='w') as script:
        for wav in files:
            stem = os.path.basename(os.path.splitext(wav)[0])
            # feature path = <output_features>/<stem>.<featuretype>
            target = '{}.{}'.format(
                os.path.join(output_features, stem), featuretype)
            featurefiles.append(target)
            script.write('{} {}'.format(wav, target))
            script.write(os.linesep)
    return featurefiles
def splitintochunks(l, num):
    '''
    Splits list *l* into *num* consecutive chunks of (nearly) equal size
    and returns them as a list of lists; trailing chunks may be empty.
    '''
    size, remainder = divmod(len(l), num)
    # round the chunk size up so all elements fit into num chunks
    if remainder:
        size += 1
    return [l[i * size:(i + 1) * size] for i in range(num)]
def splitScp(scpfile, chunksize=None):
    '''
    Splits the given scp file into several smaller ones next to the
    original and returns the list of the new paths.

    @param scpfile: path of the scp file to split.
    @param chunksize: number of chunks to produce; defaults to NUMBER_JOBS.
                      (Despite the name, this is a chunk *count* — it is
                      passed straight to splitintochunks.)
    Returns: list of paths "<base><i><ext>" of the written split files.
    '''
    # Bug fix: the original leaked the file handle of the input scp file;
    # read it inside a context manager instead.
    with open(scpfile, 'r') as scppointer:
        scplines = scppointer.read().splitlines()
    chunks = splitintochunks(scplines, chunksize if chunksize else NUMBER_JOBS)
    tardir = os.path.abspath(os.path.dirname(scpfile))
    basenamescp, ext = os.path.splitext(os.path.basename(scpfile))
    newfilepaths = []
    for i, chunk in enumerate(chunks):
        newfullfilepath = os.path.join(
            tardir, "%s%i%s" % (basenamescp, i, ext))
        newfilepaths.append(newfullfilepath)
        with open(newfullfilepath, 'w') as newfilep:
            for line in chunk:
                newfilep.write(line)
                newfilep.write(os.linesep)
    return newfilepaths
def parallelHCopy(scppaths, configpath, runner):
    '''
    Builds one HCopy invocation per scp split and hands the whole batch to
    *runner*. This helper exists because qsub is used with -sync y.

    scppaths: list of scp split files to process
    configpath: HTK config passed to every HCopy call
    runner: batch runner (runlocal or runbatch)
    '''
    argsbatch = []
    logfiles = []
    for scppath in scppaths:
        # one log file per split, named after the split's basename
        stage = os.path.basename(os.path.splitext(scppath)[0])
        argsbatch.append(HTKTools.HCopy(configpath, scppath))
        logfiles.append(os.path.join(LOG_DIR, 'hcopy_{}.log'.format(stage)))
    runner(argsbatch, os.getcwd(), logfiles)
def writeOutSplits(splits, outputdir, outputname):
    '''
    Writes every split into its own file "<outputname>_<i>" below
    *outputdir* and returns the list of created file paths.
    '''
    created = []
    for index, split in enumerate(splits):
        outfile = os.path.join(outputdir, "%s_%i" % (outputname, index))
        created.append(outfile)
        with open(outfile, 'w') as outpointer:
            outpointer.writelines(split)
    return created
def HCopy(config, hcopy_scp):
    '''
    Assembles the HCopy command line for the given config and script file
    and returns it as an argument list.
    '''
    args = [HTKTools.HCOPY]  # HCopy binary
    # -T 1: tracing; -C: config file; -S: script (scp) file.
    # Each option string is split on whitespace, like the original did.
    for option in ('-T 1',
                   '-C {}'.format(os.path.join(config)),
                   '-S {}'.format(hcopy_scp)):
        args.extend(option.split())
    return args
def generate_HCompV_script(input_dir, output_script, featuretype):
    """
    Writes an HCompV script listing one feature path per file found
    (non-recursively) in *input_dir*.

    input_dir: directory containing the audio files
    output_script: path of the HCompV script to write
    featuretype: extension appended to every feature path
    """
    entries = sorted(os.listdir(input_dir))
    with open(output_script, mode='w') as output:
        for entry in entries:
            if not os.path.isfile(os.path.join(input_dir, entry)):
                continue
            # feature path is FEATURES_DIR/<name-without-extension>
            feature = os.path.join(FEATURES_DIR, entry.split('.')[0])
            output.write('{}.{}'.format(feature, featuretype))
            output.write(os.linesep)
def concatenateSpeech(vadmlf, featuretype):
    '''
    Concatenates the speech segments of every utterance listed in a VAD
    .mlf file, dropping the silence segments, by invoking HCopy with HTK's
    "file[a,b] + file[c,d]" segment-concatenation syntax.

    vadmlf : the VAD .mlf which was generated using the VAD command
    featuretype : extension of the static feature files (e.g. 'plp')

    This is a generator: it yields the absolute output path of each
    concatenated feature file; the HCopy call for an utterance runs as a
    side effect after the consumer resumes the generator.

    NOTE(review): Python-2-only code — lines.next(), long() and print
    statements; a Python 3 port needs next(lines), int() and print(...).
    Also assumes the .mlf is well-formed (every label block terminated),
    otherwise lines.next() can raise StopIteration — TODO confirm.
    '''
    commands = []
    logfiles = []
    with open(vadmlf, 'r') as lines:
        for line in lines:
            # remove newline
            line = line.rstrip(os.linesep)
            # features and labels are indicated as "*/XXXXXXXXXX.lab"
            if line.startswith("\""):
                # We know that the first 3 characters are "*/, so we remove
                # these and the last trailing "
                label = os.path.basename(line)
                # The label is now in the form of XXXXXXXXXXXX.lab
                curlabel, _ = label.split(".")
                # Remember which current label we got
                # Find all speech segments for the current label
                newline = lines.next()
                speechsegments = []
                # segment lines look like "<begin> <end> <speech|silence>";
                # times are in HTK 100ns units, divided down to frames
                while re.match("^[0-9]{1,}", newline):
                    begin, end, segmentflag = newline.split()
                    begin = long(begin) / 100000
                    end = long(end) / 100000
                    if segmentflag == 'speech':
                        speechsegments.append((begin, end))
                    newline = lines.next()
                # If we have non-silent speech
                if speechsegments:
                    log = os.path.join(LOG_DIR, 'concat.log')
                    cmd = []
                    cmd.append(HTKTools.HCOPY)
                    cmd.extend('-T 1'.split())
                    for i in range(len(speechsegments)):
                        begin, end = speechsegments[i]
                        # "name.ext[b,e]" selects a frame range; the range is
                        # shrunk by one frame at each side
                        cmd.append(
                            "{}.{}[{},{}]".format(curlabel, featuretype, begin + 1, end - 1))
                        # join consecutive segments with "+" (extend of a
                        # one-char string appends exactly '+')
                        if i + 1 < len(speechsegments):
                            cmd.extend("+")
                    featurename = curlabel + '.' + featuretype
                    newpath = os.path.abspath(
                        os.path.join(DYNAMIC_DIR, featurename))
                    # Return the newpaths to determine which files were
                    # processed
                    yield newpath
                    cmd.append(newpath)
                    commands.append(cmd)
                    logfiles.append(log)
                    with open(log, 'a') as logp:
                        logp.write("Executing : %s" % (" ".join(cmd)))
                        try:
                            # run relative to STATIC_DIR so the bare
                            # "<label>.<ext>" inputs resolve
                            subprocess.check_call(
                                cmd, cwd=STATIC_DIR, stdout=logp, stderr=logp)
                        except OSError as e:
                            print e.errno
                            print e.filename
                            print e.strerror
                            raise e
def writeSimpleScp(files, outputpath):
    '''
    Writes one entry of *files* per line into *outputpath* (a plain scp
    file list).
    '''
    with open(outputpath, 'w') as scppointer:
        for entry in files:
            scppointer.write(entry)
            scppointer.write(os.linesep)
def parallelVad(scpfile, outputmlf, runmode):
    '''
    Runs the GMM VAD over the given scp file in parallel and merges the
    partial .mlf results into one file.

    @param scpfile: scp list of wav files; split into NUMBER_JOBS chunks.
    @param outputmlf: path of the merged .mlf written at the end.
    @param runmode: batch runner (runlocal or runbatch).
    '''
    # Generate one "base" .scp file which will then be split up
    scpsplits = splitScp(scpfile)
    outputmlfs = []
    argsbatch = []
    logfiles = []
    cwdpath = None
    for i in range(len(scpsplits)):
        vadcommand = vad.GMMVAD()
        outputMLF = os.path.abspath(
            os.path.join(TMP_DIR, 'vad_%i.mlf' % (i + 1)))
        logfile = os.path.abspath(
            os.path.join(LOG_DIR, 'vad_%i.log' % (i + 1)))
        outputmlfs.append(outputMLF)
        # Get the current arguments for the VAD
        args = vadcommand(scpsplits[i], outputMLF)
        argsbatch.append(args)
        logfiles.append(logfile)
        # the VAD tool has to be executed from its own directory
        cwdpath = vadcommand.cwdPath
    runmode(argsbatch, cwdpath, logfiles)
    # Collect all the results; every partial file starts with its own
    # #!MLF!# header which must be dropped before merging.
    collectmlf = []
    for mlf in outputmlfs:
        # Bug fix: read the partial mlf inside a context manager instead of
        # leaking one file handle per chunk.
        with open(mlf, 'r') as mlfpointer:
            # remove header
            collectmlf.extend(mlfpointer.read().splitlines()[1:])
    with open(outputmlf, 'w') as outputmlfpointer:
        outputmlfpointer.write(MLF_HEADER)
        outputmlfpointer.write(os.linesep)
        outputmlfpointer.writelines(os.linesep.join(collectmlf))
def universal_worker(input_pair):
    '''
    Unpacks a (function, argument-tuple) pair and applies the function.
    Needed because a multiprocessing pool maps over single arguments only.
    '''
    function, arguments = input_pair
    return function(*arguments)
def pool_args(function, *args):
    '''
    Pairs *function* with (command, cwd, log) tuples for universal_worker:
    one tuple per command, with the working directory repeated for all.
    '''
    batch, cwd, logs = args
    # zip the per-command logs against the shared cwd and each command
    workitems = zip(batch, itertools.repeat(cwd), logs)
    return zip(itertools.repeat(function), workitems)
# def vad(scpfile, outputmlf): | |
# args = [VAD] | |
# vad_config_file=os.path.join(VAD_PATH,'cfg') | |
# args.extend('-C {}'.format(vad_config_file).split()) | |
# args.extend('-scp {}'.format(scpfile).split()) | |
# args.extend('-mlf {}'.format(outputmlf).split()) | |
# return args | |
# | |
# def vad(scpfile, outputmlf): | |
# global runner | |
# ''' | |
# scpfile: The scp file which contains the utterances to do VAD | |
# outputmlf : Path to the file which will be generated after VAD | |
# returns the arguments which will be executed in the form of (args,cwd,logfile) | |
# ''' | |
# Generate the .scp file | |
# scpfile = os.path.abspath(os.path.join(TMP_DIR, 'vad.scp')) | |
# writeSimpleScp(wavfiles, scpfile) | |
# args = [] | |
# outputmlf = os.path.abspath(outputmlf) | |
# WE need to use everywhere an absolute path , since we will run the .vad in | |
# it's directory, meaning that every realtive link will fail | |
# args.append(VAD) | |
# args.append(scpfile) | |
# args.append(VAD_GMM_CFG) | |
# args.append(VAD_GMM_MMF) | |
# args.append(os.path.abspath(outputmlf)) | |
# For the VAD tool we need to run it in the given folder otherwise ./HList | |
# will not be found, so we use cwd = "" to do so | |
# return args | |
# runner(args, cwd=VAD_PATH, logfile=os.path.abspath( | |
# os.path.join(LOG_DIR, 'vad.log'))) | |
# with open(os.path.join(LOG_DIR, 'vad.log'), 'w') as log: | |
# subprocess.Popen( | |
# args, stdout=log, stderr=subprocess.STDOUT, cwd=VAD_PATH).wait() | |
def readFeatureConfig(config):
    '''
    Parses an HTK feature config (iterable of lines) into a dict mapping
    PARAMETER -> value (both strings). A leading HPARM qualifier is
    stripped from each line.
    '''
    conf = {}
    for line in config:
        tokens = re.findall(r"[\w\.']+", line)
        if not tokens:
            continue
        # Remove the leading HPARM qualifier, e.g. "HPARM NUMCEPS = 12"
        if tokens[0] == 'HPARM':
            tokens = tokens[1:]
        param, value = tokens
        conf[param] = value
    return conf
def calculate_target_dimension(config, targetkind):
    '''
    Calculates the output feature dimension for *targetkind* given the
    parsed HTK config.

    The static size comes from NUMCEPS (cepstral) or NUMCHANS (filterbank),
    plus one for each of the "0" (C0) and "E" (energy) qualifiers; each of
    the dynamic qualifiers "A", "D", "T" adds another static-sized block.

    @param config: dict as returned by readFeatureConfig.
    @param targetkind: target feature string, e.g. 'PLP_0_D_A_Z'.
    Raises ValueError if neither NUMCEPS nor NUMCHANS is configured.
    '''
    staticdimension = 0
    # Get the config for the size of cepstral components
    if 'NUMCEPS' in config:
        staticdimension += int(config['NUMCEPS'])
    elif 'NUMCHANS' in config:
        staticdimension += int(config['NUMCHANS'])
    else:
        raise ValueError("Cant find ceps or chans in config!")
    # Bug fix: the original upper-cased via map(); on Python 3 the map
    # iterator is consumed by the first "in" test, so the later qualifier
    # checks silently failed. Build a real list instead.
    feature = [part.upper() for part in re.split('[_ -]', targetkind)]
    # C0 energy ("0") and log energy ("E") each enlarge the static vector
    if "0" in feature:
        staticdimension += 1
    if "E" in feature:
        staticdimension += 1
    targetdimension = staticdimension
    # every dynamic qualifier appends one static-sized sub-vector
    for qualifier in ("A", "D", "T"):
        if qualifier in feature:
            targetdimension += staticdimension
    return targetdimension
def checkvalidfeatures(featureconfig, targetkind):
    '''
    Validates that the static TARGETKIND from the config and the requested
    dynamic targetkind share the same base feature type (e.g. PLP == PLP).
    No conversion between base feature types is possible, so only the type
    prefix before the first qualifier is compared (case-insensitively).

    Raises ValueError on an empty or mismatching base type.
    '''
    statickind = featureconfig['TARGETKIND']
    statictype = re.split('[_ -]', statickind)[0]
    targettype = re.split('[_ -]', targetkind)[0]
    mismatch = (not statictype or not targettype
                or statictype.lower() != targettype.lower())
    if mismatch:
        raise ValueError('The specified Targetkind for the static features (%s) is not equal to the one for the dynamic features (%s)' % (
            statictype, targettype))
def generate_cut_cmn_cvn_config(config, targetkind, targetdimension, featuredir):
    '''
    Writes the HTK config files needed for cepstral mean/variance
    normalization into CONFIG_DIR and returns their paths.

    config: the parsed feature config (dict from readFeatureConfig)
    targetkind: the final feature kind, e.g. 'PLP_0_D_A_Z'
    targetdimension: dimension of the final feature vector (for VARSCALE)
    featuredir: directory whose files the CMEAN/VARSCALE masks refer to

    Returns the tuple (cutconfig_path, cmnconfig_path, cvnconfig_path):
    cmn  - estimates cepstral means (static TARGETKIND from the config)
    cvn  - estimates variances (full targetkind, reads the cmn means)
    cut  - the final HCopy config applying both mean and variance scaling
    '''
    globvar = os.path.join(CONFIG_DIR, 'globvar')
    cmnconfig = os.path.join(CONFIG_DIR, 'cmn.cfg')
    cvnconfig = os.path.join(CONFIG_DIR, 'cvn.cfg')
    cutconfig = os.path.join(CONFIG_DIR, 'cut.cfg')
    # the cmn stage works on the static kind declared in the input config
    cmntargetkind = config['TARGETKIND']
    # First process cmn.cfg
    with open(cmnconfig, 'w') as cmnpointer:
        cmnpointer.write("TARGETKIND = {}".format(cmntargetkind))
        cmnpointer.write(os.linesep)
        cmnpointer.write("TRACE = {}".format(1))
        cmnpointer.write(os.linesep)
        cmnpointer.write("MAXTRYOPEN = {}".format(1))
    # cvn.cfg: variance estimation, reading the means from CMEANDIR
    with open(cvnconfig, 'w') as cvnpointer:
        cvnpointer.write("TARGETKIND = {}".format(targetkind))
        cvnpointer.write(os.linesep)
        cvnpointer.write("TRACE = {}".format(1))
        cvnpointer.write(os.linesep)
        cvnpointer.write("MAXTRYOPEN = {}".format(1))
        cvnpointer.write(os.linesep)
        cvnpointer.write(
            "CMEANMASK = {}/{}".format(featuredir, MASK))
        cvnpointer.write(os.linesep)
        cvnpointer.write("CMEANDIR = {}".format(CMEANDIR))
    # cut.cfg: the final HCopy pass applying mean and variance scaling
    with open(cutconfig, 'w') as cutpointer:
        cutpointer.write("TARGETKIND = {}".format(targetkind))
        cutpointer.write(os.linesep)
        cutpointer.write("TRACE = {}".format(1))
        cutpointer.write(os.linesep)
        cutpointer.write("MAXTRYOPEN = {}".format(1))
        cutpointer.write(os.linesep)
        cutpointer.write(
            "CMEANMASK = {}/{}".format(featuredir, MASK))
        cutpointer.write(os.linesep)
        cutpointer.write("CMEANDIR = {}".format(CMEANDIR))
        cutpointer.write(os.linesep)
        cutpointer.write(
            "VARSCALEMASK = {}/{}".format(featuredir, MASK))
        cutpointer.write(os.linesep)
        cutpointer.write("VARSCALEDIR = {}".format(VARSCALEDIR))
        cutpointer.write(os.linesep)
        cutpointer.write("VARSCALEFN = {}".format(globvar))
        cutpointer.write(os.linesep)
        # Forward the SAVEWITHCRC setting (removes the _K qualifier) if the
        # general config specified it
        if 'SAVEWITHCRC' in config:
            cutpointer.write('SAVEWITHCRC = {}'.format(config['SAVEWITHCRC']))
            cutpointer.write(os.linesep)
    # global variance-scale vector: all ones, one per target dimension
    with open(globvar, 'w') as globpointer:
        globpointer.write("<VARSCALE> {}".format(targetdimension))
        globpointer.write(os.linesep)
        for i in range(targetdimension):
            globpointer.write("%.1f " % (1))
    return (cutconfig, cmnconfig, cvnconfig)
def cmvn(cutcfg, cmncfg, cvncfg, concattedspeechfiles, featuredir, featuretype='plp', runner=runlocal):
    '''
    Function: cmvn
    Summary: Runs cepstral mean variance normalization
    Examples: cmvn('cut.cfg','cmn.cfg','cvn.cfg',['input.plp'],'features/static')
    Attributes:
        @param (cutcfg): Path to the cut config file
        @param (cmncfg): Path to the cmn config file
        @param (cvncfg): Path to the cvn config file
        @param (concattedspeechfiles): An iterator/list of the features after
            VAD, or of the static features when VAD was skipped
        @param (featuredir): The dir in which the features can be found. In
            case of VAD pass the VAD dir, otherwise the static dir
        @param (featuretype) default='plp': The returned feature type
        @param (runner) default=runlocal: batch runner (local or cluster)
    Returns: the list of normalized output feature paths
    '''
    norm_script = os.path.abspath(
        os.path.join(TMP_DIR, '{}.scp'.format('norm')))
    data_scp = os.path.join(TMP_DIR, '{}.scp'.format('data'))
    # Materialize the input since we iterate over it twice below (it may be
    # a generator, e.g. concatenateSpeech)
    concat_speechfiles = list(concattedspeechfiles)
    # Write out the general data file
    writeSimpleScp(concat_speechfiles, data_scp)
    # And the HCopy file
    cmvnfeats = generate_HCopy_script(
        concat_speechfiles, CMVN_DIR, norm_script, featuretype)
    # The mask needs to be an ABSPATH, otherwise it would not match since
    # the generated .scp files consist of absolute paths
    FEATUREMASK = os.path.join(featuredir, MASK)
    # Split the data list so mean/variance estimation runs in parallel
    scpsplits = splitScp(data_scp)
    mean_normalization_args = []
    variance_normalization_args = []
    logfiles_mean = []
    logfiles_var = []
    for scpsplit in scpsplits:
        # HCompV with -c/-k style clustering: 'm' estimates means into
        # CMEANDIR, 'v' estimates variances into VARSCALEDIR
        mean_args = HTKTools.HCompV(scpsplit, cmncfg, mask=FEATUREMASK,
                                    clusterrequest='m', clusterdir=CMEANDIR)
        variance_args = HTKTools.HCompV(scpsplit, cvncfg, mask=FEATUREMASK,
                                        clusterrequest='v', clusterdir=VARSCALEDIR)
        mean_normalization_args.append(mean_args)
        variance_normalization_args.append(variance_args)
        trainname = os.path.splitext(os.path.basename(scpsplit))[0]
        log_mean = os.path.abspath(os.path.join(
            LOG_DIR, 'hcompv_cmn_{}.log'.format(trainname)))
        log_var = os.path.abspath(
            os.path.join(LOG_DIR, 'hcompv_cvn_{}.log'.format(trainname)))
        logfiles_mean.append(log_mean)
        logfiles_var.append(log_var)
    # first do mean normalization; variance estimation needs the means
    runner(mean_normalization_args, os.getcwd(), logfiles_mean)
    # variance normalization
    runner(variance_normalization_args, os.getcwd(), logfiles_var)
    # Finally apply both via HCopy with the cut config
    normsplits = splitScp(norm_script)
    parallelHCopy(normsplits, cutcfg, runner)
    return cmvnfeats
def extractstaticFeatures(wavfiles, configpath, featuretype, runner):
    '''
    Extracts static features from the given wav files using HTK's HCopy.

    wavfiles: list of the wav files which will be processed
    configpath: HTK config file for the extraction
    featuretype: the target feature extension (e.g. 'plp')
    runner: the batch runner (either local or on a cluster)
    Returns the list of produced static feature file paths.
    '''
    hcopy_scp = os.path.join(TMP_DIR, '{}.scp'.format('static'))
    # Writing the HCopy script also yields the output feature paths
    outputpaths = generate_HCopy_script(
        wavfiles, STATIC_DIR, hcopy_scp, featuretype)
    # Split the script and run the HCopy calls in parallel
    parallelHCopy(splitScp(hcopy_scp), configpath, runner)
    return outputpaths
def audio_scp_type(value):
    '''
    argparse "type" callable for the sourceAudio argument.

    A value ending in '.scp' is read as a list of already extracted files
    (one path per line); anything else is treated as a directory that is
    scanned recursively for audio files.

    @param value: command line string (scp path or directory).
    Returns: list of file paths.
    '''
    if value.endswith('.scp'):
        # Bug fix: close the scp file instead of leaking the handle
        with open(value, 'r') as scppointer:
            return scppointer.read().splitlines()
    return readDir(value)
def extractFeatures(sourceAudio, targetkind, configpath, novad, vadmlf=None, runmode=runlocal):
    '''
    End-to-end feature extraction pipeline: static extraction, optional
    VAD + speech concatenation, then cepstral mean/variance normalization.

    sourceAudio: list of wav files (or already extracted feature paths)
    targetkind: final HTK feature kind, e.g. 'PLP_0_D_A_Z'
    configpath: HTK config for the static extraction
    novad: despite the name, a TRUE value runs VAD (the argparse flag
        -nv/--novad stores False to disable it) — confirm before renaming
    vadmlf: optional precomputed VAD .mlf; skips running the VAD
    runmode: batch runner (runlocal or runbatch)
    Returns: list of normalized output feature paths (from cmvn)
    '''
    ### Step 0 - Setup the Working Directories ###
    logging.info('Setup the Working Directories')
    create_dirs()
    # config is an open file
    logging.info('Reading in Config ')
    with open(configpath, 'r') as configp:
        config = readFeatureConfig(configp)
    # Targetkind validation
    # Convert targetkind to upper case, to make it conform with the HTK format
    targetkind = "".join(map(lambda x: x.upper(), targetkind))
    # replace all wrong possible delimiters
    targetkind = re.sub("[ -]", "_", targetkind)
    # get the feature ending for the static and dynamic features
    featuretype = targetkind.split('_')[0].lower()
    checkvalidfeatures(config, targetkind)
    targetdimension = calculate_target_dimension(config, targetkind)
    ### Step 1 - Extract the Static Features ###
    # Get the absolute paths to the wave files
    # NOTE(review): on Python 3 this map() is a one-shot iterator but it is
    # consumed twice below (static extraction and the VAD scp) — Python 2
    # semantics (a list) are assumed here.
    wavfiles = map(os.path.abspath, sourceAudio)
    logging.info("Extracting static Feats ")
    # Store the output static features as concatenatedspeech in case we do
    # not do VAD
    concatenatedspeech = extractstaticFeatures(
        wavfiles, configpath, featuretype, runmode)
    ### Step 2 - VAD ###
    if novad:
        logging.info('Running VAD')
        if not vadmlf:
            # no precomputed .mlf given: run the VAD ourselves
            vadscpfile = os.path.join(TMP_DIR, 'vad.scp')
            writeSimpleScp(wavfiles, vadscpfile)
            vadmlf = os.path.join(TMP_DIR, 'vad.mlf')
            parallelVad(vadscpfile, vadmlf, runmode)
        logging.info('Concatinating Frames')
        # generator yielding the silence-stripped feature files
        concatenatedspeech = concatenateSpeech(vadmlf, featuretype)
    ### Step 3 - CMVN ####
    # normalization masks refer to where the input features actually live
    featuredir = os.path.abspath(STATIC_DIR)
    if novad:
        featuredir = os.path.abspath(DYNAMIC_DIR)
    cut, cmn, cvn = generate_cut_cmn_cvn_config(
        config, targetkind, targetdimension, featuredir)
    logging.info('Running CMVN')
    ret = cmvn(cut, cmn, cvn, concatenatedspeech,
               featuredir, featuretype, runmode)
    logging.info('Done !')
    return ret
class RunMode(object):
    '''
    Maps a mode name to the matching batch runner callable:
    'cluster' -> runbatch (qsub), 'local' -> runlocal (multiprocessing).
    '''
    # available runners keyed by mode name
    modes = {
        'cluster': runbatch,
        'local': runlocal
    }

    @property
    def mode(self):
        # returns the runner callable for the currently selected mode
        return self.modes[self._mode]

    @mode.setter
    def setMode(self, mode):
        '''
        Mode is either 'local' or 'cluster'.

        NOTE(review): the setter is deliberately bound under the name
        'setMode' (the main block assigns runner.setMode = args.run), which
        leaves 'mode' itself a read-only property — confirm before renaming
        this to a conventional @mode.setter named 'mode'.
        '''
        if mode not in self.modes:
            raise ValueError("Mode needs to be either 'local' or 'cluster' ")
        self._mode = mode
# NOTE(review): appears unused in this file — possibly a leftover for a
# planned "start pipeline at stage X" feature; confirm before removing.
startwith = {'concat', 'cmvn'}
if __name__ == "__main__":
    """
    Feature extraction script: parses the command line, configures logging
    and runs extractFeatures with the selected runner.
    """
    parser = argparse.ArgumentParser(description='Feature extraction using multiple processes on the machine'
                                     )
    parser.add_argument('sourceAudio',
                        type=audio_scp_type, help='The root directory of all .wav files or an already feature extracted .scp file')
    parser.add_argument(
        '-c', '--config', dest='config', type=str, help='HTK Config file for the feature exraction', required=True
    )
    parser.add_argument(
        '-t', '--targetkind', help='The Feature which will be created at least, e.g. PLP_0_D_A_Z', required=True, type=str)
    parser.add_argument('-d', '--debug', type=int,
                        help='Enable the progression messages. Default is %(default)s, smaller values represent a bigger output. E.g. Debug is 10', default=logging.INFO
                        )
    parser.add_argument('--clean', action='store_true',
                        help='Cleanup the generated files after processing'
                        )
    # store_false with default=True: passing -nv DISABLES the VAD step
    parser.add_argument('-nv', '--novad', default=True, action='store_false',
                        help="If this argument is given, we do not do any VAD")
    parser.add_argument(
        '--run', help='runs on either Cluster or locally the extraction job', default='local')
    parser.add_argument(
        '--vadmlf', type=str, help='If Vad was already done, mlf file can be provided, so that vad will not be run')
    args = parser.parse_args()
    logging.basicConfig(
        level=args.debug, format='%(asctime)s %(levelname)s %(message)s', datefmt='%d/%m %H:%M:%S')
    logging.debug("Found %i files in the given directory/file ",
                  len(args.sourceAudio))
    # Check the number of files/filenames. If we have overlapping filenames we
    # need to split it into different directories
    fnames = set(map(os.path.basename, args.sourceAudio))
    logging.debug("Overall we have %i unique filenames", len(fnames))
    if len(fnames) != len(args.sourceAudio):
        logging.warn("Problem detected. Number of unique frames (%i) is not equal to the utterance number (%i). Not all the features will be extracted. Please use sub sets of the given files!" % (
            len(fnames), len(args.sourceAudio)))
    # select the runner ('local' or 'cluster'); assignment goes through the
    # setMode property (see RunMode)
    runner = RunMode()
    runner.setMode = args.run
    extractFeatures(args.sourceAudio, args.targetkind,
                    args.config, args.novad, args.vadmlf, runner.mode)
    if args.clean:
        cleanup()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment