# import libraries
from __future__ import print_function
from __future__ import division

import numpy as np
import pandas as pd
import os
import re

import tensorflow as tf
import tflearn
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression
from tflearn.layers.normalization import local_response_normalization

import random
from timeit import default_timer as timer

import tsahelper as tsa
#---------------------------------------------------------------------------------------
# Constants
#
# INPUT_FOLDER: The folder that contains the source data
#
# PREPROCESSED_DATA_FOLDER: The folder that contains preprocessed .npy files
#
# STAGE1_LABELS: The CSV file containing the labels by subject
#
# THREAT_ZONE: Threat Zone to train on (actual number, not 0 based)
#
# BATCH_SIZE: Number of Subjects per batch
#
# EXAMPLES_PER_SUBJECT: Number of examples generated per subject
#
# FILE_LIST: A list of the preprocessed .npy files to batch
#
# TRAIN_TEST_SPLIT_RATIO: Ratio to split the FILE_LIST between train and test
#
# TRAIN_SET_FILE_LIST: The list of .npy files to be used for training
#
# TEST_SET_FILE_LIST: The list of .npy files to be used for testing
#
# IMAGE_DIM: The height and width of the images in pixels
#
# LEARNING_RATE: Learning rate for the neural network
#
# N_TRAIN_STEPS: The number of train steps (epochs) to run
#
# TRAIN_PATH: Place to store the tensorboard logs
#
# MODEL_PATH: Path where model files are stored
#
# MODEL_NAME: Name of the model files
#
#----------------------------------------------------------------------------------------
INPUT_FOLDER = 'tsa_datasets/stage1/aps'
PREPROCESSED_DATA_FOLDER = 'tsa_datasets/preprocessed/'
STAGE1_LABELS = 'tsa_datasets/stage1_labels.csv'
THREAT_ZONE = 1
BATCH_SIZE = 16
EXAMPLES_PER_SUBJECT = 182

FILE_LIST = []
TRAIN_TEST_SPLIT_RATIO = 0.2
TRAIN_SET_FILE_LIST = []
TEST_SET_FILE_LIST = []

IMAGE_DIM = 250
LEARNING_RATE = 1e-3
N_TRAIN_STEPS = 1
TRAIN_PATH = 'tsa_logs/train/'
MODEL_PATH = 'tsa_logs/model/'
MODEL_NAME = ('tsa-{}-lr-{}-{}-{}-tz-{}'.format('alexnet-v0.1', LEARNING_RATE, IMAGE_DIM,
                                                IMAGE_DIM, THREAT_ZONE))
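
# The output folders above are assumed to already exist in the original workflow.
# As an optional convenience, a minimal sketch to create them when missing; this
# only touches paths defined above and is safe to re-run.
for _folder in [PREPROCESSED_DATA_FOLDER, TRAIN_PATH, MODEL_PATH]:
    if not os.path.exists(_folder):
        os.makedirs(_folder)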
#---------------------------------------------------------------------------------------
# preprocess_tsa_data(): preprocesses the tsa datasets
#
# parameters: none
#
# returns: none
#---------------------------------------------------------------------------------------
def preprocess_tsa_data():

    # OPTION 1: get a list of all subjects for which there are labels
    #df = pd.read_csv(STAGE1_LABELS)
    #df['Subject'], df['Zone'] = df['Id'].str.split('_',1).str
    #SUBJECT_LIST = df['Subject'].unique()

    # OPTION 2: get a list of all subjects for whom there is data
    #SUBJECT_LIST = [os.path.splitext(subject)[0] for subject in os.listdir(INPUT_FOLDER)]

    # OPTION 3: get a list of subjects for small bore test purposes
    SUBJECT_LIST = ['00360f79fd6e02781457eda48f85da90','0043db5e8c819bffc15261b1f1ac5e42',
                    '0050492f92e22eed3474ae3a6fc907fa','006ec59fa59dd80a64c85347eef810c7',
                    '0097503ee9fa0606559c56458b281a08','011516ab0eca7cad7f5257672ddde70e']

    # initialize tracking and saving items
    batch_num = 1
    threat_zone_examples = []
    start_time = timer()
    for subject in SUBJECT_LIST:
        # read in the images
        print('--------------------------------------------------------------')
        print('t+> {:5.3f} |Reading images for subject #: {}'.format(timer()-start_time,
                                                                     subject))
        print('--------------------------------------------------------------')
        images = tsa.read_data(INPUT_FOLDER + '/' + subject + '.aps')

        # transpose so that the slice is the first dimension, shape (16, 620, 512)
        images = images.transpose()

        # for each threat zone, loop through each image, mask off the zone and then crop it
        for tz_num, threat_zone_x_crop_dims in enumerate(zip(tsa.zone_slice_list,
                                                             tsa.zone_crop_list)):
            threat_zone = threat_zone_x_crop_dims[0]
            crop_dims = threat_zone_x_crop_dims[1]

            # get label
            label = np.array(tsa.get_subject_zone_label(tz_num,
                             tsa.get_subject_labels(STAGE1_LABELS, subject)))

            for img_num, img in enumerate(images):

                print('Threat Zone:Image -> {}:{}'.format(tz_num, img_num))
                print('Threat Zone Label -> {}'.format(label))

                if threat_zone[img_num] is not None:

                    # correct the orientation of the image
                    print('-> reorienting base image')
                    base_img = np.flipud(img)
                    print('-> shape {}|mean={}'.format(base_img.shape,
                                                       base_img.mean()))

                    # convert to grayscale
                    print('-> converting to grayscale')
                    rescaled_img = tsa.convert_to_grayscale(base_img)
                    print('-> shape {}|mean={}'.format(rescaled_img.shape,
                                                       rescaled_img.mean()))

                    # spread the spectrum to improve contrast
                    print('-> spreading spectrum')
                    high_contrast_img = tsa.spread_spectrum(rescaled_img)
                    print('-> shape {}|mean={}'.format(high_contrast_img.shape,
                                                       high_contrast_img.mean()))

                    # get the masked image
                    print('-> masking image')
                    masked_img = tsa.roi(high_contrast_img, threat_zone[img_num])
                    print('-> shape {}|mean={}'.format(masked_img.shape,
                                                       masked_img.mean()))

                    # crop the image
                    print('-> cropping image')
                    cropped_img = tsa.crop(masked_img, crop_dims[img_num])
                    print('-> shape {}|mean={}'.format(cropped_img.shape,
                                                       cropped_img.mean()))

                    # normalize the image
                    print('-> normalizing image')
                    normalized_img = tsa.normalize(cropped_img)
                    print('-> shape {}|mean={}'.format(normalized_img.shape,
                                                       normalized_img.mean()))

                    # zero center the image
                    print('-> zero centering')
                    zero_centered_img = tsa.zero_center(normalized_img)
                    print('-> shape {}|mean={}'.format(zero_centered_img.shape,
                                                       zero_centered_img.mean()))

                    # append the features and labels to this threat zone's example array
                    print('-> appending example to threat zone {}'.format(tz_num))
                    threat_zone_examples.append([[tz_num], zero_centered_img, label])
                    print('-> shape {:d}:{:d}:{:d}:{:d}:{:d}:{:d}'.format(
                        len(threat_zone_examples),
                        len(threat_zone_examples[0]),
                        len(threat_zone_examples[0][0]),
                        len(threat_zone_examples[0][1][0]),
                        len(threat_zone_examples[0][1][1]),
                        len(threat_zone_examples[0][2])))
                else:
                    print('-> No view of tz:{} in img:{}. Skipping to next...'.format(
                        tz_num, img_num))
                print('------------------------------------------------')

        # each subject gets EXAMPLES_PER_SUBJECT examples (182 to be exact), so this
        # section just writes out the data once there is a full minibatch complete
        if ((len(threat_zone_examples) % (BATCH_SIZE * EXAMPLES_PER_SUBJECT)) == 0):
            for tz_num, tz in enumerate(tsa.zone_slice_list):

                tz_examples_to_save = []

                # write out the batch and reset
                print(' -> writing: ' + PREPROCESSED_DATA_FOLDER +
                      'preprocessed_TSA_scans-tz{}-{}-{}-b{}.npy'.format(
                          tz_num+1,
                          len(threat_zone_examples[0][1][0]),
                          len(threat_zone_examples[0][1][1]),
                          batch_num))

                # get this tz's examples
                tz_examples = [example for example in threat_zone_examples
                               if example[0] == [tz_num]]

                # drop unused columns
                tz_examples_to_save.append([[features_label[1], features_label[2]]
                                            for features_label in tz_examples])

                # save the batch.  Note that the trainer looks for tz{} in the minibatch
                # file name, where {} is a 1-based tz_num, to select which batches to
                # use for training a given threat zone
                np.save(PREPROCESSED_DATA_FOLDER +
                        'preprocessed_TSA_scans-tz{}-{}-{}-b{}.npy'.format(
                            tz_num+1,
                            len(threat_zone_examples[0][1][0]),
                            len(threat_zone_examples[0][1][1]),
                            batch_num),
                        tz_examples_to_save)
                del tz_examples_to_save

            # reset for next batch
            del threat_zone_examples
            threat_zone_examples = []
            batch_num += 1
    # we may run out of subjects before we finish a batch, so we write out
    # the last batch stub
    if (len(threat_zone_examples) > 0):
        for tz_num, tz in enumerate(tsa.zone_slice_list):

            tz_examples_to_save = []

            # write out the batch and reset
            print(' -> writing: ' + PREPROCESSED_DATA_FOLDER
                  + 'preprocessed_TSA_scans-tz{}-{}-{}-b{}.npy'.format(
                      tz_num+1,
                      len(threat_zone_examples[0][1][0]),
                      len(threat_zone_examples[0][1][1]),
                      batch_num))

            # get this tz's examples
            tz_examples = [example for example in threat_zone_examples
                           if example[0] == [tz_num]]

            # drop unused columns
            tz_examples_to_save.append([[features_label[1], features_label[2]]
                                        for features_label in tz_examples])

            # save batch
            np.save(PREPROCESSED_DATA_FOLDER +
                    'preprocessed_TSA_scans-tz{}-{}-{}-b{}.npy'.format(
                        tz_num+1,
                        len(threat_zone_examples[0][1][0]),
                        len(threat_zone_examples[0][1][1]),
                        batch_num),
                    tz_examples_to_save)

# unit test ---------------------------------------
#preprocess_tsa_data()
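
# A minimal sketch (in the same commented-out style as the unit tests) of how a
# preprocessed batch file could be inspected after running preprocess_tsa_data().
# The file name built here is illustrative only: the two middle numbers come from
# the crop dimensions of the saved examples (assumed to be IMAGE_DIM x IMAGE_DIM),
# so check the actual files written to PREPROCESSED_DATA_FOLDER.
#
#batch = np.load(PREPROCESSED_DATA_FOLDER +
#                'preprocessed_TSA_scans-tz{}-{}-{}-b1.npy'.format(THREAT_ZONE,
#                                                                  IMAGE_DIM, IMAGE_DIM))
#print('examples in batch ->', len(batch[0]))
#print('feature shape     ->', batch[0][0][0].shape)
#print('label             ->', batch[0][0][1])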
#---------------------------------------------------------------------------------------
# get_train_test_file_list(): gets the batch file list, splits between train and test
#
# parameters: none
#
# returns: none
#
#-------------------------------------------------------------------------------------
def get_train_test_file_list():

    global FILE_LIST
    global TRAIN_SET_FILE_LIST
    global TEST_SET_FILE_LIST

    if os.listdir(PREPROCESSED_DATA_FOLDER) == []:
        print('No preprocessed data available. Skipping preprocessed data setup..')
    else:
        FILE_LIST = [f for f in os.listdir(PREPROCESSED_DATA_FOLDER)
                     if re.search(re.compile('-tz' + str(THREAT_ZONE) + '-'), f)]
        train_test_split = len(FILE_LIST) - \
                           max(int(len(FILE_LIST)*TRAIN_TEST_SPLIT_RATIO), 1)
        TRAIN_SET_FILE_LIST = FILE_LIST[:train_test_split]
        TEST_SET_FILE_LIST = FILE_LIST[train_test_split:]
        print('Train/Test Split -> {} file(s) of {} used for testing'.format(
            len(FILE_LIST) - train_test_split, len(FILE_LIST)))

# unit test ----------------------------
#get_train_test_file_list()
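
# A small worked example of the split arithmetic above, using a hypothetical file
# count (not taken from the dataset): with 10 matching batch files and a 0.2 ratio,
# max(int(10 * 0.2), 1) = 2 files are held out for testing and 8 are used for
# training; the max(..., 1) guard keeps at least one test file for very short lists.
#
#n_files = 10
#n_test = max(int(n_files * TRAIN_TEST_SPLIT_RATIO), 1)
#print('train files ->', n_files - n_test, '| test files ->', n_test)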
#---------------------------------------------------------------------------------------
# input_pipeline(filename, path): prepares a batch of features and labels for training
#
# parameters: filename - the file to be batched into the model
#             path - the folder where filename resides
#
# returns: feature_batch - a batch of features to train or test on
#          label_batch - a batch of labels related to the feature_batch
#
#---------------------------------------------------------------------------------------
def input_pipeline(filename, path):

    preprocessed_tz_scans = []
    feature_batch = []
    label_batch = []

    # Load a batch of preprocessed tz scans
    preprocessed_tz_scans = np.load(os.path.join(path, filename))

    # Shuffle to randomize for input into the model
    np.random.shuffle(preprocessed_tz_scans)

    # separate features and labels
    for example_list in preprocessed_tz_scans:
        for example in example_list:
            feature_batch.append(example[0])
            label_batch.append(example[1])

    feature_batch = np.asarray(feature_batch, dtype=np.float32)
    label_batch = np.asarray(label_batch, dtype=np.float32)

    return feature_batch, label_batch

# unit test ------------------------------------------------------------------------
#print ('Train Set -----------------------------')
#for f_in in TRAIN_SET_FILE_LIST:
#    feature_batch, label_batch = input_pipeline(f_in, PREPROCESSED_DATA_FOLDER)
#    print (' -> features shape {}:{}:{}'.format(len(feature_batch),
#                                                len(feature_batch[0]),
#                                                len(feature_batch[0][0])))
#    print (' -> labels shape {}:{}'.format(len(label_batch), len(label_batch[0])))
#
#print ('Test Set -----------------------------')
#for f_in in TEST_SET_FILE_LIST:
#    feature_batch, label_batch = input_pipeline(f_in, PREPROCESSED_DATA_FOLDER)
#    print (' -> features shape {}:{}:{}'.format(len(feature_batch),
#                                                len(feature_batch[0]),
#                                                len(feature_batch[0][0])))
#    print (' -> labels shape {}:{}'.format(len(label_batch), len(label_batch[0])))
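
# A minimal sketch of reshaping one loaded batch into the NHWC layout that the
# network below expects (assumes get_train_test_file_list() has populated FILE_LIST
# and that the preprocessed crop size matches IMAGE_DIM):
#
#features, labels = input_pipeline(FILE_LIST[0], PREPROCESSED_DATA_FOLDER)
#features = features.reshape(-1, IMAGE_DIM, IMAGE_DIM, 1)   # add the channel dimension
#print('features ->', features.shape, '| labels ->', labels.shape)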
#---------------------------------------------------------------------------------------
# shuffle_train_set(): shuffle the list of batch files so that each train step
#                      receives them in a different order since the TRAIN_SET_FILE_LIST
#                      is a global
#
# parameters: train_set - the file listing to be shuffled
#
# returns: none
#
#-------------------------------------------------------------------------------------
def shuffle_train_set(train_set):
    global TRAIN_SET_FILE_LIST

    # random.shuffle() shuffles in place and returns None, so shuffle the list
    # and then point the global at the shuffled result
    random.shuffle(train_set)
    TRAIN_SET_FILE_LIST = train_set

# Unit test ---------------
#print ('Before Shuffling ->', TRAIN_SET_FILE_LIST)
#shuffle_train_set(TRAIN_SET_FILE_LIST)
#print ('After Shuffling ->', TRAIN_SET_FILE_LIST)
#---------------------------------------------------------------------------------------
# alexnet(width, height, lr): defines the alexnet
#
# parameters: width - width of the input image
#             height - height of the input image
#             lr - learning rate
#
# returns: model - a tflearn DNN wrapping the alexnet-style network
#
#-------------------------------------------------------------------------------------
def alexnet(width, height, lr):
    network = input_data(shape=[None, width, height, 1], name='features')
    network = conv_2d(network, 96, 11, strides=4, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 256, 5, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='momentum', loss='categorical_crossentropy',
                         learning_rate=lr, name='labels')

    model = tflearn.DNN(network, checkpoint_path=MODEL_PATH + MODEL_NAME,
                        tensorboard_dir=TRAIN_PATH, tensorboard_verbose=3,
                        max_checkpoints=1)

    return model
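
# A minimal sketch of persisting and restoring the weights with tflearn's built-in
# DNN.save()/DNN.load(); the '.tflearn' suffix is only an illustrative file name
# and is not used elsewhere in this script:
#
#model = alexnet(IMAGE_DIM, IMAGE_DIM, LEARNING_RATE)
#model.load(MODEL_PATH + MODEL_NAME + '.tflearn')   # restore previously saved weights
#model.save(MODEL_PATH + MODEL_NAME + '.tflearn')   # write current weights to disk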
#---------------------------------------------------------------------------------------
# train_conv_net(): runs the train op
#
# parameters: none
#
# returns: none
#
#-------------------------------------------------------------------------------------
def train_conv_net():

    val_features = []
    val_labels = []

    # get train and test batches
    get_train_test_file_list()

    # instantiate model
    model = alexnet(IMAGE_DIM, IMAGE_DIM, LEARNING_RATE)

    # read in the validation test set
    for j, test_f_in in enumerate(TEST_SET_FILE_LIST):
        if j == 0:
            val_features, val_labels = input_pipeline(test_f_in, PREPROCESSED_DATA_FOLDER)
        else:
            tmp_feature_batch, tmp_label_batch = input_pipeline(test_f_in,
                                                                PREPROCESSED_DATA_FOLDER)
            val_features = np.concatenate((tmp_feature_batch, val_features), axis=0)
            val_labels = np.concatenate((tmp_label_batch, val_labels), axis=0)

    val_features = val_features.reshape(-1, IMAGE_DIM, IMAGE_DIM, 1)

    # start training process
    for i in range(N_TRAIN_STEPS):

        # shuffle the train set files before each step
        shuffle_train_set(TRAIN_SET_FILE_LIST)

        # run through every batch in the training set
        for f_in in TRAIN_SET_FILE_LIST:

            # read in a batch of features and labels for training
            feature_batch, label_batch = input_pipeline(f_in, PREPROCESSED_DATA_FOLDER)
            feature_batch = feature_batch.reshape(-1, IMAGE_DIM, IMAGE_DIM, 1)
            #print ('Feature Batch Shape ->', feature_batch.shape)

            # run the fit operation
            model.fit({'features': feature_batch}, {'labels': label_batch}, n_epoch=1,
                      validation_set=({'features': val_features}, {'labels': val_labels}),
                      shuffle=True, snapshot_step=None, show_metric=True,
                      run_id=MODEL_NAME)

# unit test -----------------------------------
#train_conv_net()
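
# A minimal sketch of running the whole pipeline end to end, kept commented out in
# the same style as the unit tests above so that importing this file stays free of
# side effects; it assumes the tsa_datasets folders and the tsahelper module are
# available on this machine:
#
#preprocess_tsa_data()
#train_conv_net()   # calls get_train_test_file_list() internally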