Task 2 - DNN
#!/usr/bin/env python3

import os, argparse, logging

# using doc2vec model
from gensim.models import Doc2Vec

# generate compressed pickle file
import pickle, numpy, gzip

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s')

def extract_vec(model, sent_cnt, vec_dim, logger=None):
    # ignore the first element, since it contains the test data
    total_sent_cnt = sum(sent_cnt[1:])
    vec_array = numpy.zeros((total_sent_cnt, vec_dim))
    vec_label = numpy.zeros(total_sent_cnt)

    curr_sent_idx = 0
    for emot, index_limit in enumerate(sent_cnt):
        # ignore the first element in the counter list
        if emot == 0:
            continue
        if logger:
            logger.info('... processing emoticon {:d}'.format(emot))
        for i in range(1, index_limit + 1):
            prefix = 'EMOTICON_{:d}_{:d}'.format(emot, i)
            vec_array[curr_sent_idx] = model.docvecs[prefix]
            vec_label[curr_sent_idx] = emot
            # increment the overall counter
            curr_sent_idx += 1

    return (vec_array, vec_label)
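# Example of the tag scheme consumed above: with sent_cnt == [5000, 3, 2]
# (index 0 counts the held-out test sentences), extract_vec() reads the tags
# EMOTICON_1_1 .. EMOTICON_1_3 and EMOTICON_2_1 .. EMOTICON_2_2, returning a
# 5-row vec_array and vec_label == [1., 1., 1., 2., 2.].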
def get_args():
    parser = argparse.ArgumentParser(description='Generate compressed pickle file of doc vectors.')
    parser.add_argument('--outdir', '-o', dest='out_dir',
                        default='/tmp2/b03902036',
                        help='destination directory for the model file')
    parser.add_argument('--verbose', '-v', dest='verbose',
                        action='count', default=0,
                        help='control the display level of output logs')
    parser.add_argument('mod_file', nargs='+',
                        help='model file from doc2vec training')
    return parser.parse_args()
if __name__ == '__main__':
    # parse the command line arguments
    args = get_args()

    # get the logger object
    logger = logging.getLogger()
    # set the log level
    if args.verbose >= 2:
        logger.setLevel(logging.DEBUG)
    elif args.verbose >= 1:
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.WARNING)
    if len(args.mod_file) > 1:
        logger.warning('only the first model file is used; the rest are ignored')
    # unpack unconditionally: nargs='+' always yields a list, even when a
    # single file is given
    args.mod_file = args.mod_file[0]

    logger.info('loading model from "{:s}"'.format(args.mod_file))
    model = Doc2Vec.load(args.mod_file)
    logger.info('loading relevant data about the model')
    mif_base = os.path.splitext(args.mod_file)[0]
    with open(mif_base + '.mif', 'rb') as in_file:
        dat_file = pickle.load(in_file)
        sent_cnt = pickle.load(in_file)
        dim = pickle.load(in_file)
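        # (the .mif companion file is assumed to hold, in order, three
        #  pickled objects: the source data path, the per-emoticon sentence
        #  counts, and the vector dimensionality)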
    logger.info('... model of {:d} features with {:d} emoticons is loaded'.format(dim, len(sent_cnt) - 1))

    logger.info('extracting vectors from the model')
    feature, label = extract_vec(model, sent_cnt, dim, logger=logger)

    # create the .pkl.gz file path
    basename = os.path.basename(args.mod_file)
    new_filename = os.path.splitext(basename)[0] + '.pkl.gz'
    new_filepath = os.path.join(args.out_dir, new_filename)

    # save the extracted vectors and labels
    with gzip.open(new_filepath, 'wb') as out_file:
        pickle.dump((feature, label), out_file, protocol=0)
    logger.info('numpy array saved to {:s}'.format(new_filepath))
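For downstream training, the saved archive can be read back with the same
libraries. A minimal sketch, assuming the script's default output directory
and an illustrative file name (in practice the name matches whatever model
file was passed in):

#!/usr/bin/env python3
import pickle, gzip

# load the (feature, label) pair written by the script above
with gzip.open('/tmp2/b03902036/model.pkl.gz', 'rb') as in_file:
    feature, label = pickle.load(in_file)

print(feature.shape)  # (total_sent_cnt, vec_dim)
print(label[:10])     # emoticon ids, starting from 1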
#!/bin/bash

# two variables you need to set
pdnndir=/data/ASR5/babel/ymiao/tools/pdnn  # pointer to PDNN
device=gpu0  # the device to be used. set it to "cpu" if you don't have GPUs

# export environment variables
export PYTHONPATH=$PYTHONPATH:$pdnndir
export THEANO_FLAGS=mode=FAST_RUN,device=$device,floatX=float32
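# (FAST_RUN turns on Theano's full set of graph optimizations; floatX=float32
#  keeps shared variables single-precision, which Theano's GPU code requires)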
# download the MNIST dataset
wget http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

# split the dataset into training, validation and testing sets;
# you will see train.pickle.gz, valid.pickle.gz, test.pickle.gz
echo "Preparing datasets ..."
python data_prep.py

# train the DNN model
echo "Training the DNN model ..."
python $pdnndir/cmds/run_DNN.py --train-data "train.pickle.gz" \
                                --valid-data "valid.pickle.gz" \
                                --nnet-spec "784:1024:1024:10" --wdir ./ \
                                --l2-reg 0.0001 --lrate "C:0.1:200" --model-save-step 20 \
                                --param-output-file dnn.param --cfg-output-file dnn.cfg >& dnn.training.log
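# (--nnet-spec "784:1024:1024:10" gives 784 inputs -- one per pixel of a
#  28x28 image -- two 1024-unit hidden layers and a 10-way softmax output;
#  --lrate "C:0.1:200" is PDNN's constant schedule: learning rate 0.1 held
#  for 200 epochs)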
# classification on the testing data; -1 means the final layer, that is, the classification softmax layer
echo "Classifying with the DNN model ..."
python $pdnndir/cmds/run_Extract_Feats.py --data "test.pickle.gz" \
                                          --nnet-param dnn.param --nnet-cfg dnn.cfg \
                                          --output-file "dnn.classify.pickle.gz" --layer-index -1 \
                                          --batch-size 100 >& dnn.testing.log

# (a sketch of show_results.py appears after this script)
python show_results.py dnn.classify.pickle.gz
# train the CNN model
echo "Training the CNN model ..."
python $pdnndir/cmds/run_CNN.py --train-data "train.pickle.gz" \
                                --valid-data "valid.pickle.gz" \
                                --conv-nnet-spec "1x28x28:20,5x5,p2x2:50,5x5,p2x2,f" --nnet-spec "512:10" --wdir ./ \
                                --l2-reg 0.0001 --lrate "C:0.1:200" --model-save-step 20 \
                                --param-output-file cnn.param --cfg-output-file cnn.cfg >& cnn.training.log
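# (--conv-nnet-spec reads left to right: a 1x28x28 input image, a conv layer
#  of 20 5x5 filters with 2x2 pooling, a second conv layer of 50 5x5 filters
#  with 2x2 pooling, and "f" to flatten the resulting feature maps into the
#  fully-connected part, here a 512-unit hidden layer and a 10-way softmax)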
echo "Classifying with the CNN model ..." | |
python $pdnndir/cmds/run_Extract_Feats.py --data "test.pickle.gz" \ | |
--nnet-param cnn.param --nnet-cfg cnn.cfg \ | |
--output-file "cnn.classify.pickle.gz" --layer-index -1 \ | |
--batch-size 100 >& cnn.testing.log | |
python show_results.py cnn.classify.pickle.gz |
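show_results.py itself is not included in this gist; below is a minimal
sketch of what it presumably does, assuming run_Extract_Feats.py writes the
test set's softmax activations as one pickled numpy array and that
test.pickle.gz holds a (data, label) pair -- the file layouts here are
assumptions, not PDNN's documented format:

#!/usr/bin/env python3
import sys, pickle, gzip
import numpy

# softmax outputs written by run_Extract_Feats.py (assumed layout)
with gzip.open(sys.argv[1], 'rb') as in_file:
    pred = pickle.load(in_file)

# ground-truth labels of the test split (assumed layout)
with gzip.open('test.pickle.gz', 'rb') as in_file:
    _, label = pickle.load(in_file)

# the predicted class is the argmax over the 10 softmax outputs
correct = numpy.sum(numpy.argmax(pred, axis=1) == label)
print('accuracy: {:.2f}%'.format(100.0 * correct / len(label)))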