Skip to content

Instantly share code, notes, and snippets.

@RicherMans
Created June 11, 2016 14:59
Show Gist options
  • Save RicherMans/8df9f02064905e158051fa96c600d20e to your computer and use it in GitHub Desktop.
Save RicherMans/8df9f02064905e158051fa96c600d20e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
import argparse
import errno
import itertools
import logging
import multiprocessing as mp
import os
import re
import shutil
import subprocess
import HTKTools
import vad
# HTKTOOLSPATH = '/slfs1/users/yl710/htk/HTKTools/'
# ##
# Attention:
# All static variables which end with _DIR will be created by this script.
# If you want to implement your personal dirs, etc. keep in mind that everything
# ending with _DIR will be overwritten!
# ##
# The mask used by HCompV (CMEANMASK/VARSCALEMASK) to determine which feature
# files are grouped together for normalization: one group per utterance
# (nine wildcard characters plus '*' matched against the file basename).
MASK = '%%%%%%%%%*'
# GridEngine submission binary used by runbatch()
QSUB = 'qsub'
# Root dir is the current calling dir, but in any case if this script is called from outside,
# keep the calling folder clean, by not outputting there the folders and files
ROOT_DIR = os.path.join(os.path.dirname(__file__), '.')
# Invocation directory; all generated folders/files are created below it
CURDIR = os.getcwd()
# Directory of the bundled VAD tool (next to this script)
VAD_PATH = os.path.join(ROOT_DIR, 'vad')
CONFIG_DIR = os.path.join(CURDIR, 'cfgs')
# HCOMPV = os.path.join(HTKTOOLSPATH, 'HCompV')
# HCOPY = os.path.join(HTKTOOLSPATH, 'HCopy')
# HEREST = os.path.join(HTKTOOLSPATH, 'HERest')
# HHED = os.path.join(HTKTOOLSPATH, 'HHEd')
# HPARSE = os.path.join(HTKTOOLSPATH, 'HParse')
# HRESULTS = os.path.join(HTKTOOLSPATH, 'HResults')
# HVITE = os.path.join(HTKTOOLSPATH, 'HVite')
# VAD = os.path.join(VAD_PATH, 'vad')
# VAD_GMM_CFG = os.path.join(VAD_PATH, 'gmm.cfg')
# VAD_GMM_MMF = os.path.join(VAD_PATH, 'MMF')
FEATURES_DIR = os.path.join(CURDIR, 'features')
# Raw per-file features straight out of HCopy
STATIC_DIR = os.path.join(FEATURES_DIR, 'static')
# Speech-only features after VAD + concatenation
DYNAMIC_DIR = os.path.join(FEATURES_DIR, 'concat')
# Final normalized features
CMVN_DIR = os.path.join(FEATURES_DIR, 'cmvn')
LOG_DIR = os.path.join(CURDIR, 'log')
TMP_DIR = os.path.join(CURDIR, 'tmp')
# Cluster dirs for the cepstral means / variance scales produced by HCompV
CMEANDIR = os.path.join(TMP_DIR, 'cmn')
VARSCALEDIR = os.path.join(TMP_DIR, 'cvn')
FLIST_DIR = os.path.join(CONFIG_DIR, 'flists')
EDFILES_DIR = os.path.join(CONFIG_DIR, 'edfiles')
# Header line every HTK master label file must start with
MLF_HEADER = '#!MLF!#'
# Number of parallel jobs; also the default number of scp splits
NUMBER_JOBS = 4
def runbatch(argsbatch, cwd=os.getcwd(), logfiles=None):
    '''
    Function: runbatch
    Summary: Submits one GridEngine job per argument list via qsub and
        blocks (qsub -sync y) until all of them have finished.
    Examples: runbatch([['HCopy', '-T', '1']], '/work', ['hcopy.log'])
    Attributes:
        @param (argsbatch): list of command argument lists, one per job
        @param (cwd) default: working directory for the qsub processes
            (note: the default is captured once at import time)
        @param (logfiles) default=None: per-job log paths (same length as
            argsbatch), or None for no job logs
    Returns: None; returns only after every submitted job terminated
    '''
    import tempfile
    # A None logfiles previously crashed zip(); treat it as "no logs".
    if logfiles is None:
        logfiles = [None] * len(argsbatch)
    assert(len(logfiles) == len(argsbatch))
    processes = []
    scriptfiles = []
    for args, logfile in zip(argsbatch, logfiles):
        # Build the command as an explicit list so paths containing
        # whitespace are not mangled by str.split().
        qsubcmd = [QSUB,
                   '-P', 'cpu.p',         # project queue
                   '-cwd',                # run job in the current directory
                   '-N', 'extractfeats',  # job name
                   '-j', 'y',             # merge stderr into stdout
                   '-S', '/bin/bash',     # run the job script with bash
                   '-sync', 'y']          # make qsub wait for job completion
        if logfile:
            qsubcmd.extend(['-o', logfile])
        # Text mode so writing str works on Python 3 too; the tempfile is
        # closed (and thereby deleted) only after its job has finished.
        scriptfile = tempfile.NamedTemporaryFile(mode='w')
        scriptfile.write(" ".join(args))
        scriptfile.flush()
        qsubcmd.append(scriptfile.name)
        with open(os.devnull, 'w') as FNULL:
            processes.append(
                subprocess.Popen(qsubcmd, cwd=cwd, stdout=FNULL,
                                 stderr=subprocess.STDOUT))
        scriptfiles.append(scriptfile)
    # Wait for all jobs; only then release the temporary job scripts.
    for process, scriptfile in zip(processes, scriptfiles):
        process.wait()
        scriptfile.close()
def runlocal(argsbatch, cwd=os.getcwd(), logfile=None):
    '''
    Runs the given argument batches in parallel on the local machine with a
    pool of NUMBER_JOBS worker processes.

    @param argsbatch: list of command argument lists, one per job
    @param cwd: working directory for the commands (default captured at
        import time)
    @param logfile: list of per-job log paths (zipped against argsbatch by
        pool_args); despite the singular name, callers pass a list
    '''
    # Pool sized to NUMBER_JOBS, as the docstring always promised
    # (mp.Pool() previously defaulted to the machine's cpu count).
    pool = mp.Pool(NUMBER_JOBS)
    try:
        for arg in argsbatch:
            logging.debug(" ".join(arg))
        pool.map_async(universal_worker, pool_args(
            execute, argsbatch, cwd, logfile)).wait(9999)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        # Kill the workers, then re-raise the ORIGINAL exception (a bare
        # raise keeps the traceback; raising a new instance discarded it).
        pool.terminate()
        raise
def execute(args, cwd, log):
    '''
    Runs a single command, optionally redirecting its stdout/stderr into a
    log file.

    @param args: command argument list for subprocess.call
    @param cwd: working directory the command runs in
    @param log: path of the log file, or a falsy value for no logging
    Returns: the command's exit status (callers may ignore it)
    '''
    # The previous `except KeyboardInterrupt: raise KeyboardInterrupt`
    # was a no-op that replaced the in-flight exception with a fresh one,
    # losing the original traceback; plain propagation is correct.
    if log:
        with open(log, 'w') as logp:
            return subprocess.call(args, stdout=logp, stderr=logp, cwd=cwd)
    return subprocess.call(args, cwd=cwd)
def mkdir_p(path):
    """Create *path* like ``mkdir -p``: parents are created as needed and an
    already-existing directory is not an error."""
    try:
        os.makedirs(path)
    except OSError as exc:
        # Swallow only the benign "directory already exists" case;
        # everything else propagates.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def create_dirs():
    """Create every directory referenced by a module-level constant whose
    name ends in 'DIR' (see the note at the top of the file)."""
    # Anchored with '$' so only names ENDING in DIR match, as documented
    # (the old unanchored pattern also matched DIR anywhere in the name).
    dirstomake = {v for (k, v) in globals().items() if re.search('.+DIR$', k)}
    # An explicit loop instead of map(): under Python 3 map() is lazy, so
    # the old map(mkdir_p, ...) never actually created anything.
    for directory in dirstomake:
        mkdir_p(directory)
def cleanup():
    """Remove all directory trees generated during feature extraction,
    keeping only the final CMVN features and the logs."""
    for directory in (CONFIG_DIR, TMP_DIR, STATIC_DIR, DYNAMIC_DIR):
        shutil.rmtree(directory)
def readDir(input_dir):
    '''
    Recursively collects every file below *input_dir* and returns their
    absolute paths. Be careful: no check on the file type is performed.
    '''
    found = []
    for root, _dirs, filenames in os.walk(input_dir):
        candidates = (os.path.join(root, name) for name in filenames)
        found.extend(os.path.abspath(path)
                     for path in candidates if os.path.isfile(path))
    return found
def generate_HCopy_script(files, output_features, output_script, featuretype):
    """
    Writes an HCopy script file mapping each source file to its feature file.

    files: list of source (audio) files
    output_features: directory the feature files will be written into
    output_script: path of the HCopy script to generate
    featuretype: extension appended to each feature file (e.g. 'plp')
    Returns the list of feature file paths, in input order.
    """
    featurefiles = []
    with open(output_script, mode='w') as script:
        for sourcepath in files:
            stem = os.path.basename(os.path.splitext(sourcepath)[0])
            target = '{}.{}'.format(
                os.path.join(output_features, stem), featuretype)
            featurefiles.append(target)
            # One "source target" pair per line, as HCopy expects.
            script.write('{} {}{}'.format(sourcepath, target, os.linesep))
    return featurefiles
def splitintochunks(l, num):
    """Split list *l* into exactly *num* consecutive slices.

    The slice size is len(l)/num rounded up, so when the division is uneven
    the final chunks may be shorter (or even empty)."""
    size, remainder = divmod(len(l), num)
    if remainder:
        size += 1
    return [l[i * size:(i + 1) * size] for i in range(num)]
def splitScp(scpfile, chunksize=None):
    '''
    Splits the given scp file into several smaller ones next to it
    ("<name><i><ext>") and returns the list of the new paths.

    @param scpfile: path of the scp file to split
    @param chunksize: number of chunks to produce; defaults to NUMBER_JOBS.
        NOTE(review): despite the name this is the chunk COUNT, not a size —
        kept for interface compatibility.
    '''
    # Read with an explicit close; the original open(...).read() leaked
    # the file handle.
    with open(scpfile, 'r') as scppointer:
        scplines = scppointer.read().splitlines()
    chunks = splitintochunks(scplines, chunksize if chunksize else NUMBER_JOBS)
    tardir = os.path.abspath(os.path.dirname(scpfile))
    basenamescp, ext = os.path.splitext(os.path.basename(scpfile))
    newfilepaths = []
    for i, chunk in enumerate(chunks):
        newfilename = "%s%i%s" % (basenamescp, i, ext)
        newfullfilepath = os.path.join(tardir, newfilename)
        newfilepaths.append(newfullfilepath)
        with open(newfullfilepath, 'w') as newfilep:
            for line in chunk:
                newfilep.write(line)
                newfilep.write(os.linesep)
    return newfilepaths
def parallelHCopy(scppaths, configpath, runner):
    '''
    Helper that runs one HCopy job per scp split through *runner*; needed
    because the cluster backend submits with qsub -sync y and therefore
    wants the whole batch at once.
    '''
    argsbatch = []
    logfiles = []
    for scppath in scppaths:
        stage = os.path.basename(os.path.splitext(scppath)[0])
        argsbatch.append(HTKTools.HCopy(configpath, scppath))
        logfiles.append(os.path.join(LOG_DIR, 'hcopy_{}.log'.format(stage)))
    runner(argsbatch, os.getcwd(), logfiles)
def writeOutSplits(splits, outputdir, outputname):
    '''
    Writes each split into its own file ("<outputname>_<i>") inside
    *outputdir* and returns the list of the created file paths.
    '''
    written = []
    for index, split in enumerate(splits):
        outfile = os.path.join(outputdir, "%s_%i" % (outputname, index))
        with open(outfile, 'w') as outpointer:
            outpointer.writelines(split)
        written.append(outfile)
    return written
def HCopy(config, hcopy_scp):
    """Builds the HCopy command line for the given config and script file.

    Arguments are appended as separate list items instead of via
    '...'.format(path).split(), which mangled any path containing
    whitespace; the pointless single-argument os.path.join is gone too.
    """
    return [
        HTKTools.HCOPY,   # HCopy binary
        '-T', '1',        # trace output level
        '-C', config,     # configuration file
        '-S', hcopy_scp,  # script (scp) file with source->target pairs
    ]
def generate_HCompV_script(input_dir, output_script, featuretype):
    """
    Generate a script file for HCompV (one feature file per line).

    input_dir: directory containing the audio files.
    output_script: path of the output HCompV script.
    featuretype: extension of the feature files, e.g. 'plp'.
    """
    with open(output_script, mode='w') as output:
        files = (f for f in sorted(os.listdir(input_dir))
                 if os.path.isfile(os.path.join(input_dir, f)))
        for f in files:
            # splitext instead of split('.')[0]: file names containing dots
            # must yield the same stem that generate_HCopy_script used,
            # otherwise the listed feature files would not exist.
            feature = os.path.join(FEATURES_DIR, os.path.splitext(f)[0])
            output.write('{}.{}'.format(feature, featuretype))
            output.write(os.linesep)
def concatenateSpeech(vadmlf, featuretype):
    '''
    Generator that, for every utterance in the VAD mlf, concatenates its
    speech segments into a single feature file (dropping the silence) by
    running HCopy.

    vadmlf: the VAD .mlf generated by the VAD step; segment times are in
        HTK 100ns units and divided by 100000 to obtain frame indices.
    featuretype: feature file extension, e.g. 'plp'.
    Yields: the absolute output path of each concatenated feature file
        (yielded BEFORE HCopy runs, so the caller sees every planned file).
    '''
    with open(vadmlf, 'r') as lines:
        for line in lines:
            # strip trailing newline characters
            line = line.rstrip(os.linesep)
            # utterance headers look like "*/XXXXXXXXXX.lab" (quoted)
            if not line.startswith("\""):
                continue
            label = os.path.basename(line)
            # splitext instead of label.split('.'): stems containing dots no
            # longer crash, and the stem matches generate_HCopy_script's.
            curlabel = os.path.splitext(label)[0]
            # Collect all speech segments belonging to this utterance.
            speechsegments = []
            # next(lines, '') lets EOF end the segment loop; the original
            # lines.next() raised StopIteration at EOF, which silently
            # dropped the LAST utterance (and is a RuntimeError on Py3).
            newline = next(lines, '')
            while re.match("^[0-9]{1,}", newline):
                begin, end, segmentflag = newline.split()
                # floor division keeps frame indices integral on Python 3
                begin = int(begin) // 100000
                end = int(end) // 100000
                if segmentflag == 'speech':
                    speechsegments.append((begin, end))
                newline = next(lines, '')
            # Utterances that are pure silence produce no output.
            if not speechsegments:
                continue
            log = os.path.join(LOG_DIR, 'concat.log')
            # HCopy syntax: a.plp[b,e] + a.plp[b２,e2] ... concatenates the
            # listed frame ranges into the target file.
            cmd = [HTKTools.HCOPY, '-T', '1']
            for i, (begin, end) in enumerate(speechsegments):
                cmd.append("{}.{}[{},{}]".format(
                    curlabel, featuretype, begin + 1, end - 1))
                if i + 1 < len(speechsegments):
                    cmd.append("+")
            featurename = curlabel + '.' + featuretype
            newpath = os.path.abspath(
                os.path.join(DYNAMIC_DIR, featurename))
            # Yield first so the caller learns which files get produced.
            yield newpath
            cmd.append(newpath)
            with open(log, 'a') as logp:
                logp.write("Executing : %s" % (" ".join(cmd)))
                try:
                    # Relative feature names in cmd resolve in STATIC_DIR.
                    subprocess.check_call(
                        cmd, cwd=STATIC_DIR, stdout=logp, stderr=logp)
                except OSError as e:
                    # print() form works on both Python 2 and 3
                    print(e.errno)
                    print(e.filename)
                    print(e.strerror)
                    raise
def writeSimpleScp(files, outputpath):
    """Writes one entry of *files* per line into the scp file *outputpath*."""
    with open(outputpath, 'w') as scppointer:
        scppointer.writelines(
            '{}{}'.format(entry, os.linesep) for entry in files)
def parallelVad(scpfile, outputmlf, runmode):
    '''
    Splits *scpfile* into NUMBER_JOBS chunks, runs one GMM VAD job per
    chunk through *runmode*, and merges the partial mlf results into
    *outputmlf* with a single #!MLF!# header.
    '''
    # Generate one "base" .scp file which will then be split up
    scpsplits = splitScp(scpfile)
    outputmlfs = []
    argsbatch = []
    logfiles = []
    cwdpath = None
    for i, split in enumerate(scpsplits):
        vadcommand = vad.GMMVAD()
        outputMLF = os.path.abspath(
            os.path.join(TMP_DIR, 'vad_%i.mlf' % (i + 1)))
        logfile = os.path.abspath(
            os.path.join(LOG_DIR, 'vad_%i.log' % (i + 1)))
        outputmlfs.append(outputMLF)
        # Arguments for this chunk's VAD invocation
        argsbatch.append(vadcommand(split, outputMLF))
        logfiles.append(logfile)
        # The VAD tool must run inside its own directory
        cwdpath = vadcommand.cwdPath
    runmode(argsbatch, cwdpath, logfiles)
    # Merge the partial results; each partial file starts with its own
    # #!MLF!# header which must be dropped.
    collectmlf = []
    for mlf in outputmlfs:
        # with-statement closes the handle (the original leaked it)
        with open(mlf, 'r') as mlfpointer:
            collectmlf.extend(mlfpointer.read().splitlines()[1:])
    with open(outputmlf, 'w') as outputmlfpointer:
        outputmlfpointer.write(MLF_HEADER)
        outputmlfpointer.write(os.linesep)
        outputmlfpointer.writelines(os.linesep.join(collectmlf))
def universal_worker(input_pair):
    """Unpacks a (function, argument-tuple) pair and applies the function.

    Needed because a multiprocessing pool can only hand a single picklable
    object to each worker."""
    function, arguments = input_pair
    return function(*arguments)
def pool_args(function, *args):
    """Builds the (function, (args_i, cwd, log_i)) work items consumed by
    universal_worker.

    *args* must be the triple (argsbatch, cwd, logs): cwd is repeated while
    argsbatch and logs are zipped pairwise.
    Returns a list — on Python 3 a bare zip() is a one-shot iterator, which
    matches neither the Python 2 behavior nor multiprocessing's needs.
    """
    argsbatch, cwd, logs = args
    workitems = zip(argsbatch, itertools.repeat(cwd), logs)
    return list(zip(itertools.repeat(function), workitems))
# def vad(scpfile, outputmlf):
# args = [VAD]
# vad_config_file=os.path.join(VAD_PATH,'cfg')
# args.extend('-C {}'.format(vad_config_file).split())
# args.extend('-scp {}'.format(scpfile).split())
# args.extend('-mlf {}'.format(outputmlf).split())
# return args
#
# def vad(scpfile, outputmlf):
# global runner
# '''
# scpfile: The scp file which contains the utterances to do VAD
# outputmlf : Path to the file which will be generated after VAD
# returns the arguments which will be executed in the form of (args,cwd,logfile)
# '''
# Generate the .scp file
# scpfile = os.path.abspath(os.path.join(TMP_DIR, 'vad.scp'))
# writeSimpleScp(wavfiles, scpfile)
# args = []
# outputmlf = os.path.abspath(outputmlf)
# WE need to use everywhere an absolute path , since we will run the .vad in
# it's directory, meaning that every realtive link will fail
# args.append(VAD)
# args.append(scpfile)
# args.append(VAD_GMM_CFG)
# args.append(VAD_GMM_MMF)
# args.append(os.path.abspath(outputmlf))
# For the VAD tool we need to run it in the given folder otherwise ./HList
# will not be found, so we use cwd = "" to do so
# return args
# runner(args, cwd=VAD_PATH, logfile=os.path.abspath(
# os.path.join(LOG_DIR, 'vad.log')))
# with open(os.path.join(LOG_DIR, 'vad.log'), 'w') as log:
# subprocess.Popen(
# args, stdout=log, stderr=subprocess.STDOUT, cwd=VAD_PATH).wait()
def readFeatureConfig(config):
    """Parses an HTK feature configuration (any iterable of lines) into a
    {PARAM: VALUE} dict.

    Each line is tokenized on word characters (dots and quotes included);
    a leading 'HPARM' qualifier is dropped, and every non-empty line is
    expected to carry exactly one PARAM VALUE pair."""
    conf = {}
    for line in config:
        tokens = re.findall(r"[\w\.']+", line)
        if not tokens:
            continue
        # Drop the optional leading HPARM qualifier
        if tokens[0] == 'HPARM':
            tokens = tokens[1:]
        param, value = tokens
        conf[param] = value
    return conf
def calculate_target_dimension(config, targetkind):
    '''
    Calculates the output feature dimension for *targetkind* given the
    parsed HTK feature *config*.

    The static size comes from NUMCEPS (cepstral features) or NUMCHANS
    (filterbank features); the qualifiers "0" and "E" each add one static
    coefficient, and each of "A", "D", "T" appends another full static
    block.

    Raises ValueError when neither NUMCEPS nor NUMCHANS is configured.
    '''
    if 'NUMCEPS' in config:
        staticdimension = int(config['NUMCEPS'])
    elif 'NUMCHANS' in config:
        staticdimension = int(config['NUMCHANS'])
    else:
        raise ValueError("Cant find ceps or chans in config!")
    # Upper-cased qualifier LIST: the original used a lazy map() object,
    # which on Python 3 is exhausted after the first `in` test, silently
    # skipping the E/A/D/T checks.
    qualifiers = [part.upper() for part in re.split('[_ -]', targetkind)]
    # C0 energy and/or log energy enlarge the static part by one each
    if "0" in qualifiers:
        staticdimension += 1
    if "E" in qualifiers:
        staticdimension += 1
    targetdimension = staticdimension
    # Delta, acceleration and third differentials each append a full block
    for dynamic in ("A", "D", "T"):
        if dynamic in qualifiers:
            targetdimension += staticdimension
    return targetdimension
def checkvalidfeatures(featureconfig, targetkind):
    '''
    Verifies that the static feature kind (featureconfig['TARGETKIND']) and
    the requested dynamic *targetkind* share the same base type (e.g. both
    PLP). HTK cannot convert between base types, so a mismatch (or an empty
    base type) raises a ValueError.
    '''
    statictype = re.split('[_ -]', featureconfig['TARGETKIND'])[0]
    targettype = re.split('[_ -]', targetkind)[0]
    compatible = (statictype and targettype
                  and targettype.lower() == statictype.lower())
    if not compatible:
        raise ValueError('The specified Targetkind for the static features (%s) is not equal to the one for the dynamic features (%s)' % (
            statictype, targettype))
def generate_cut_cmn_cvn_config(config, targetkind, targetdimension, featuredir):
    '''
    Parses the current feature *config* and writes the three HTK config
    files used for mean/variance normalization, returning their paths as
    the tuple (cut, cmn, cvn).

    @param config: parsed static-feature config (dict from readFeatureConfig)
    @param targetkind: the dynamic TARGETKIND written into cvn.cfg/cut.cfg
    @param targetdimension: target feature dimension; sizes the global
        variance floor file 'globvar'
    @param featuredir: directory of the input features; combined with MASK
        to form the CMEANMASK/VARSCALEMASK patterns
    '''
    globvar = os.path.join(CONFIG_DIR, 'globvar')
    cmnconfig = os.path.join(CONFIG_DIR, 'cmn.cfg')
    cvnconfig = os.path.join(CONFIG_DIR, 'cvn.cfg')
    cutconfig = os.path.join(CONFIG_DIR, 'cut.cfg')
    # Mean normalization operates on the STATIC targetkind from the config
    cmntargetkind = config['TARGETKIND']
    # First process cmn.cfg: estimates per-utterance cepstral means
    with open(cmnconfig, 'w') as cmnpointer:
        cmnpointer.write("TARGETKIND = {}".format(cmntargetkind))
        cmnpointer.write(os.linesep)
        cmnpointer.write("TRACE = {}".format(1))
        cmnpointer.write(os.linesep)
        cmnpointer.write("MAXTRYOPEN = {}".format(1))
    # cvn.cfg: variance estimation on the dynamic targetkind; CMEANDIR
    # holds the cepstral means produced by the cmn pass
    with open(cvnconfig, 'w') as cvnpointer:
        cvnpointer.write("TARGETKIND = {}".format(targetkind))
        cvnpointer.write(os.linesep)
        cvnpointer.write("TRACE = {}".format(1))
        cvnpointer.write(os.linesep)
        cvnpointer.write("MAXTRYOPEN = {}".format(1))
        cvnpointer.write(os.linesep)
        cvnpointer.write(
            "CMEANMASK = {}/{}".format(featuredir, MASK))
        cvnpointer.write(os.linesep)
        cvnpointer.write("CMEANDIR = {}".format(CMEANDIR))
    # cut.cfg: applies both mean and variance normalization via HCopy
    with open(cutconfig, 'w') as cutpointer:
        cutpointer.write("TARGETKIND = {}".format(targetkind))
        cutpointer.write(os.linesep)
        cutpointer.write("TRACE = {}".format(1))
        cutpointer.write(os.linesep)
        cutpointer.write("MAXTRYOPEN = {}".format(1))
        cutpointer.write(os.linesep)
        cutpointer.write(
            "CMEANMASK = {}/{}".format(featuredir, MASK))
        cutpointer.write(os.linesep)
        cutpointer.write("CMEANDIR = {}".format(CMEANDIR))
        cutpointer.write(os.linesep)
        cutpointer.write(
            "VARSCALEMASK = {}/{}".format(featuredir, MASK))
        cutpointer.write(os.linesep)
        cutpointer.write("VARSCALEDIR = {}".format(VARSCALEDIR))
        cutpointer.write(os.linesep)
        cutpointer.write("VARSCALEFN = {}".format(globvar))
        cutpointer.write(os.linesep)
        # Propagate SAVEWITHCRC from the general config if present.
        # NOTE(review): the original comment said "Remove the _K if it is
        # given in the general config" — confirm the intended relation
        # between SAVEWITHCRC and the '_K' qualifier against the HTK book.
        if 'SAVEWITHCRC' in config:
            cutpointer.write('SAVEWITHCRC = {}'.format(config['SAVEWITHCRC']))
            cutpointer.write(os.linesep)
    # Global variance floor: '<VARSCALE> N' header plus N scale values of 1.0
    with open(globvar, 'w') as globpointer:
        globpointer.write("<VARSCALE> {}".format(targetdimension))
        globpointer.write(os.linesep)
        for i in range(targetdimension):
            globpointer.write("%.1f " % (1))
    return (cutconfig, cmnconfig, cvnconfig)
def cmvn(cutcfg, cmncfg, cvncfg, concattedspeechfiles, featuredir, featuretype='plp', runner=runlocal):
    '''
    Function: cmvn
    Summary: Runs cepstral mean and variance normalization over the given
        feature files and writes the results into CMVN_DIR.
    Examples: cmvn('cut.cfg','cmn.cfg','cvn.cfg',['input.plp'],'features/static')
    Attributes:
        @param (cutcfg): path to the cut config file
        @param (cmncfg): path to the cmn config file
        @param (cvncfg): path to the cvn config file
        @param (concattedspeechfiles): iterator/list of feature files (the
            VAD-concatenated ones, or the static ones when VAD is skipped)
        @param (featuredir): directory holding those features; combined
            with MASK to group files per utterance
        @param (featuretype) default='plp': extension of the output features
        @param (runner) default=runlocal: execution backend
    Returns: list of the normalized feature file paths
    '''
    norm_script = os.path.abspath(
        os.path.join(TMP_DIR, '{}.scp'.format('norm')))
    data_scp = os.path.join(TMP_DIR, '{}.scp'.format('data'))
    # Materialize once; the iterable is consumed twice below.
    featurefiles = list(concattedspeechfiles)
    # Plain list of all input features ...
    writeSimpleScp(featurefiles, data_scp)
    # ... and the HCopy mapping into CMVN_DIR.
    normalizedfeats = generate_HCopy_script(
        featurefiles, CMVN_DIR, norm_script, featuretype)
    # The mask must be absolute, because the scp entries are absolute paths.
    featuremask = os.path.join(featuredir, MASK)
    meanbatch, varbatch = [], []
    meanlogs, varlogs = [], []
    for split in splitScp(data_scp):
        splitname = os.path.splitext(os.path.basename(split))[0]
        meanbatch.append(HTKTools.HCompV(split, cmncfg, mask=featuremask,
                                         clusterrequest='m', clusterdir=CMEANDIR))
        varbatch.append(HTKTools.HCompV(split, cvncfg, mask=featuremask,
                                        clusterrequest='v', clusterdir=VARSCALEDIR))
        meanlogs.append(os.path.abspath(os.path.join(
            LOG_DIR, 'hcompv_cmn_{}.log'.format(splitname))))
        varlogs.append(os.path.abspath(os.path.join(
            LOG_DIR, 'hcompv_cvn_{}.log'.format(splitname))))
    # Mean normalization first, then variance normalization.
    runner(meanbatch, os.getcwd(), meanlogs)
    runner(varbatch, os.getcwd(), varlogs)
    # Finally apply both normalizations through HCopy with the cut config.
    parallelHCopy(splitScp(norm_script), cutcfg, runner)
    return normalizedfeats
def extractstaticFeatures(wavfiles, configpath, featuretype, runner):
    '''
    Function: extractstaticFeatures
    Summary: Extracts static features with HTK's HCopy.
    Examples: extractstaticFeatures(['out.wav'],'config.cfg','plp',somerunner)
    Attributes:
        @param (wavfiles): list of the wav files to process
        @param (configpath): path of the HTK configuration file
        @param (featuretype): extension of the target features, e.g. 'plp'
        @param (runner): execution backend (local or cluster)
    Returns: list of the static feature output paths inside STATIC_DIR.
    '''
    hcopy_scp = os.path.join(TMP_DIR, '{}.scp'.format('static'))
    # Build the wav -> feature mapping; the returned paths are the outputs.
    staticfeaturepaths = generate_HCopy_script(
        wavfiles, STATIC_DIR, hcopy_scp, featuretype)
    # Run HCopy in parallel over the scp splits.
    parallelHCopy(splitScp(hcopy_scp), configpath, runner)
    return staticfeaturepaths
def audio_scp_type(value):
    '''
    argparse type function for the sourceAudio argument.

    A value ending in '.scp' is treated as an already-prepared script file
    and returned as its list of lines; anything else is treated as a
    directory that is scanned recursively for audio files.
    '''
    if value.endswith('.scp'):
        # with-statement closes the handle (the original leaked it).
        with open(value, 'r') as scppointer:
            return scppointer.read().splitlines()
    return readDir(value)
def extractFeatures(sourceAudio, targetkind, configpath, novad, vadmlf=None, runmode=runlocal):
    '''
    Full feature-extraction pipeline: static features -> (optional) VAD
    with speech concatenation -> CMVN.

    @param sourceAudio: list of audio file paths (or scp lines)
    @param targetkind: requested HTK TARGETKIND, e.g. 'PLP_0_D_A_Z'
    @param configpath: HTK config file used for the static extraction
    @param novad: despite the name, True means VAD IS performed (argparse
        wires -nv with store_false and default True, so passing -nv
        disables the VAD step)
    @param vadmlf: optional path to an existing VAD mlf; skips running VAD
    @param runmode: runlocal or runbatch, executes the job batches
    Returns: the list of normalized feature paths produced by cmvn()
    '''
    ### Step 0 - Setup the Working Directories ###
    logging.info('Setup the Working Directories')
    create_dirs()
    # configpath is opened and parsed into a plain dict
    logging.info('Reading in Config ')
    with open(configpath, 'r') as configp:
        config = readFeatureConfig(configp)
    # Targetkind validation
    # Convert targetkind to upper case, to make it conform with the HTK format
    targetkind = "".join(map(lambda x: x.upper(), targetkind))
    # replace all wrong possible delimiters with underscores
    targetkind = re.sub("[ -]", "_", targetkind)
    # file extension for the static and dynamic features, e.g. 'plp'
    featuretype = targetkind.split('_')[0].lower()
    checkvalidfeatures(config, targetkind)
    targetdimension = calculate_target_dimension(config, targetkind)
    ### Step 1 - Extract the Static Features ###
    # Get the absolute path to the wave files.
    # NOTE(review): under Python 3 map() returns a one-shot iterator and
    # wavfiles is consumed again by writeSimpleScp below — confirm this
    # script only runs under Python 2 or wrap in list().
    wavfiles = map(os.path.abspath, sourceAudio)
    logging.info("Extracting static Feats ")
    # Store the output static features as concatenatedspeech in case we do
    # not do VAD
    concatenatedspeech = extractstaticFeatures(
        wavfiles, configpath, featuretype, runmode)
    ### Step 2 - VAD ###
    if novad:
        logging.info('Running VAD')
        if not vadmlf:
            vadscpfile = os.path.join(TMP_DIR, 'vad.scp')
            writeSimpleScp(wavfiles, vadscpfile)
            vadmlf = os.path.join(TMP_DIR, 'vad.mlf')
            parallelVad(vadscpfile, vadmlf, runmode)
        logging.info('Concatinating Frames')
        # concatenateSpeech is a generator: the concatenation jobs only run
        # when cmvn() iterates it below
        concatenatedspeech = concatenateSpeech(vadmlf, featuretype)
    ### Step 3 - CMVN ####
    # Normalization masks point at the concatenated features when VAD ran,
    # otherwise at the static ones
    featuredir = os.path.abspath(STATIC_DIR)
    if novad:
        featuredir = os.path.abspath(DYNAMIC_DIR)
    cut, cmn, cvn = generate_cut_cmn_cvn_config(
        config, targetkind, targetdimension, featuredir)
    logging.info('Running CMVN')
    ret = cmvn(cut, cmn, cvn, concatenatedspeech,
               featuredir, featuretype, runmode)
    logging.info('Done !')
    return ret
class RunMode(object):
    """Maps the --run command line switch to an execution backend:
    'cluster' submits batches via qsub (runbatch), 'local' uses a
    multiprocessing pool (runlocal)."""
    modes = {
        'cluster': runbatch,
        'local': runlocal
    }

    @property
    def mode(self):
        # The callable implementing the currently selected mode.
        return self.modes[self._mode]

    @mode.setter
    def mode(self, mode):
        '''
        Mode is either 'local' or 'cluster'.
        '''
        # The original decorated a function named setMode here, which left
        # `mode` a getter-only property; naming it `mode` gives the
        # property a real setter.
        if mode not in self.modes:
            raise ValueError("Mode needs to be either 'local' or 'cluster' ")
        self._mode = mode

    # Backwards-compatible alias: existing callers assign runner.setMode.
    setMode = mode
# Pipeline stage names. NOTE(review): appears unused — presumably intended
# for a resume-at-stage feature that was never wired up.
startwith = {'concat', 'cmvn'}
if __name__ == "__main__":
    """
    Feature extraction script
    """
    parser = argparse.ArgumentParser(description='Feature extraction using multiple processes on the machine'
                                     )
    # Either a directory of .wav files or a ready-made .scp list
    parser.add_argument('sourceAudio',
                        type=audio_scp_type, help='The root directory of all .wav files or an already feature extracted .scp file')
    parser.add_argument(
        '-c', '--config', dest='config', type=str, help='HTK Config file for the feature exraction', required=True
    )
    parser.add_argument(
        '-t', '--targetkind', help='The Feature which will be created at least, e.g. PLP_0_D_A_Z', required=True, type=str)
    # Standard logging levels: smaller values mean more verbose output
    parser.add_argument('-d', '--debug', type=int,
                        help='Enable the progression messages. Default is %(default)s, smaller values represent a bigger output. E.g. Debug is 10', default=logging.INFO
                        )
    parser.add_argument('--clean', action='store_true',
                        help='Cleanup the generated files after processing'
                        )
    # store_false: passing -nv sets args.novad to False, which SKIPS VAD
    parser.add_argument('-nv', '--novad', default=True, action='store_false',
                        help="If this argument is given, we do not do any VAD")
    parser.add_argument(
        '--run', help='runs on either Cluster or locally the extraction job', default='local')
    parser.add_argument(
        '--vadmlf', type=str, help='If Vad was already done, mlf file can be provided, so that vad will not be run')
    args = parser.parse_args()
    logging.basicConfig(
        level=args.debug, format='%(asctime)s %(levelname)s %(message)s', datefmt='%d/%m %H:%M:%S')
    logging.debug("Found %i files in the given directory/file ",
                  len(args.sourceAudio))
    # Check the number of files/filenames. If we have overlapping filenames we
    # need to split it into different directories
    fnames = set(map(os.path.basename, args.sourceAudio))
    logging.debug("Overall we have %i unique filenames", len(fnames))
    if len(fnames) != len(args.sourceAudio):
        logging.warn("Problem detected. Number of unique frames (%i) is not equal to the utterance number (%i). Not all the features will be extracted. Please use sub sets of the given files!" % (
            len(fnames), len(args.sourceAudio)))
    # Select the execution backend ('local' or 'cluster')
    runner = RunMode()
    runner.setMode = args.run
    extractFeatures(args.sourceAudio, args.targetkind,
                    args.config, args.novad, args.vadmlf, runner.mode)
    if args.clean:
        cleanup()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment