Task 2 - DNN
#!/usr/bin/env python3

import os, argparse, logging

# using doc2vec model
from gensim.models import Doc2Vec

# generate compressed pickle file
import pickle, numpy, gzip

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s')

def extract_vec(model, sent_cnt, vec_dim, logger=None):
    # ignore the first element, since it contains the test data
    total_sent_cnt = sum(sent_cnt[1:])
    vec_array = numpy.zeros((total_sent_cnt, vec_dim))
    vec_label = numpy.zeros(total_sent_cnt)

    curr_sent_idx = 0
    for emot, index_limit in enumerate(sent_cnt):
        # ignore the first element in the counter list
        if emot == 0:
            continue
        if logger:
            logger.info('... processing emoticon {:d}'.format(emot))
        for i in range(1, index_limit + 1):
            prefix = 'EMOTICON_{:d}_{:d}'.format(emot, i)
            vec_array[curr_sent_idx] = model.docvecs[prefix]
            vec_label[curr_sent_idx] = emot
            # increment the overall counter
            curr_sent_idx += 1

    return (vec_array, vec_label)
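# Example of the tag scheme consumed above: with sent_cnt == [5000, 3, 2]
# (index 0 counts the held-out test sentences), extract_vec() reads the tags
# EMOTICON_1_1 .. EMOTICON_1_3 and EMOTICON_2_1 .. EMOTICON_2_2, returning a
# 5-row vec_array and vec_label == [1., 1., 1., 2., 2.].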
def get_args():
    parser = argparse.ArgumentParser(description='Generate compressed pickle file of doc vectors.')
    parser.add_argument('--outdir', '-o', dest='out_dir',
                        default='/tmp2/b03902036',
                        help='destination directory for the model file')
    parser.add_argument('--verbose', '-v', dest='verbose',
                        action='count', default=0,
                        help='control the display level of output logs')
    parser.add_argument('mod_file', nargs='+',
                        help='model file from doc2vec training')
    return parser.parse_args()
if __name__ == '__main__':
    # parse the command line arguments
    args = get_args()

    # get the logger object
    logger = logging.getLogger()
    # set the log level
    if args.verbose >= 2:
        logger.setLevel(logging.DEBUG)
    elif args.verbose >= 1:
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.WARNING)
    if len(args.mod_file) > 1:
        logger.warning('only the first model file is used; the rest are ignored')
    # unpack unconditionally: nargs='+' always yields a list, even when a
    # single file is given
    args.mod_file = args.mod_file[0]

    logger.info('loading model from "{:s}"'.format(args.mod_file))
    model = Doc2Vec.load(args.mod_file)
    logger.info('loading relevant data about the model')
    mif_base = os.path.splitext(args.mod_file)[0]
    with open(mif_base + '.mif', 'rb') as in_file:
        dat_file = pickle.load(in_file)
        sent_cnt = pickle.load(in_file)
        dim = pickle.load(in_file)
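        # (the .mif companion file is assumed to hold, in order, three
        #  pickled objects: the source data path, the per-emoticon sentence
        #  counts, and the vector dimensionality)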
    logger.info('... model of {:d} features with {:d} emoticons is loaded'.format(dim, len(sent_cnt) - 1))

    logger.info('extracting vectors from the model')
    feature, label = extract_vec(model, sent_cnt, dim, logger=logger)

    # create the .pkl.gz file path
    basename = os.path.basename(args.mod_file)
    new_filename = os.path.splitext(basename)[0] + '.pkl.gz'
    new_filepath = os.path.join(args.out_dir, new_filename)

    # save the extracted vectors and labels
    with gzip.open(new_filepath, 'wb') as out_file:
        pickle.dump((feature, label), out_file, protocol=0)
    logger.info('numpy array saved to {:s}'.format(new_filepath))
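For downstream training, the saved archive can be read back with the same
libraries. A minimal sketch, assuming the script's default output directory
and an illustrative file name (in practice the name matches whatever model
file was passed in):

#!/usr/bin/env python3
import pickle, gzip

# load the (feature, label) pair written by the script above
with gzip.open('/tmp2/b03902036/model.pkl.gz', 'rb') as in_file:
    feature, label = pickle.load(in_file)

print(feature.shape)  # (total_sent_cnt, vec_dim)
print(label[:10])     # emoticon ids, starting from 1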
#!/bin/bash

# two variables you need to set
pdnndir=/data/ASR5/babel/ymiao/tools/pdnn  # pointer to PDNN
device=gpu0  # the device to be used. set it to "cpu" if you don't have GPUs

# export environment variables
export PYTHONPATH=$PYTHONPATH:$pdnndir
export THEANO_FLAGS=mode=FAST_RUN,device=$device,floatX=float32
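# (FAST_RUN turns on Theano's full set of graph optimizations; floatX=float32
#  keeps shared variables single-precision, which Theano's GPU code requires)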
# download the MNIST dataset
wget http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

# split the dataset into training, validation and testing sets;
# you will see train.pickle.gz, valid.pickle.gz, test.pickle.gz
echo "Preparing datasets ..."
python data_prep.py

# train the DNN model
echo "Training the DNN model ..."
python $pdnndir/cmds/run_DNN.py --train-data "train.pickle.gz" \
                                --valid-data "valid.pickle.gz" \
                                --nnet-spec "784:1024:1024:10" --wdir ./ \
                                --l2-reg 0.0001 --lrate "C:0.1:200" --model-save-step 20 \
                                --param-output-file dnn.param --cfg-output-file dnn.cfg >& dnn.training.log
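# (--nnet-spec "784:1024:1024:10" gives 784 inputs -- one per pixel of a
#  28x28 image -- two 1024-unit hidden layers and a 10-way softmax output;
#  --lrate "C:0.1:200" is PDNN's constant schedule: learning rate 0.1 held
#  for 200 epochs)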
# classification on the testing data; -1 means the final layer, that is, the classification softmax layer
echo "Classifying with the DNN model ..."
python $pdnndir/cmds/run_Extract_Feats.py --data "test.pickle.gz" \
                                          --nnet-param dnn.param --nnet-cfg dnn.cfg \
                                          --output-file "dnn.classify.pickle.gz" --layer-index -1 \
                                          --batch-size 100 >& dnn.testing.log

# (a sketch of show_results.py appears after this script)
python show_results.py dnn.classify.pickle.gz
# train the CNN model
echo "Training the CNN model ..."
python $pdnndir/cmds/run_CNN.py --train-data "train.pickle.gz" \
                                --valid-data "valid.pickle.gz" \
                                --conv-nnet-spec "1x28x28:20,5x5,p2x2:50,5x5,p2x2,f" --nnet-spec "512:10" --wdir ./ \
                                --l2-reg 0.0001 --lrate "C:0.1:200" --model-save-step 20 \
                                --param-output-file cnn.param --cfg-output-file cnn.cfg >& cnn.training.log
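# (--conv-nnet-spec reads left to right: a 1x28x28 input image, a conv layer
#  of 20 5x5 filters with 2x2 pooling, a second conv layer of 50 5x5 filters
#  with 2x2 pooling, and "f" to flatten the resulting feature maps into the
#  fully-connected part, here a 512-unit hidden layer and a 10-way softmax)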
echo "Classifying with the CNN model ..." | |
python $pdnndir/cmds/run_Extract_Feats.py --data "test.pickle.gz" \ | |
--nnet-param cnn.param --nnet-cfg cnn.cfg \ | |
--output-file "cnn.classify.pickle.gz" --layer-index -1 \ | |
--batch-size 100 >& cnn.testing.log | |
python show_results.py cnn.classify.pickle.gz |
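show_results.py itself is not included in this gist; below is a minimal
sketch of what it presumably does, assuming run_Extract_Feats.py writes the
test set's softmax activations as one pickled numpy array and that
test.pickle.gz holds a (data, label) pair -- the file layouts here are
assumptions, not PDNN's documented format:

#!/usr/bin/env python3
import sys, pickle, gzip
import numpy

# softmax outputs written by run_Extract_Feats.py (assumed layout)
with gzip.open(sys.argv[1], 'rb') as in_file:
    pred = pickle.load(in_file)

# ground-truth labels of the test split (assumed layout)
with gzip.open('test.pickle.gz', 'rb') as in_file:
    _, label = pickle.load(in_file)

# the predicted class is the argmax over the 10 softmax outputs
correct = numpy.sum(numpy.argmax(pred, axis=1) == label)
print('accuracy: {:.2f}%'.format(100.0 * correct / len(label)))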