import numpy as np
import random

def load_challenge_data(df, start_at, truncate_at):
    """Reshape the flat CMAPSS array into (time, series, features) tensors.

    df columns: [unit_number, time, setting/sensor columns...]. Returns the
    padded feature tensor, time-to-event targets, per-series lengths, a
    padding mask, and the tensor dimensions.
    """
    seq_len = np.max([truncate_at, df[:, 1].max().astype(int) + 1])
    n_vars = df.shape[1] - 2  # Drop unit_number and time columns
    n_series = int(df[:, 0].max())
    feature_data = np.zeros([seq_len, n_series, n_vars])
    times_to_event = np.zeros([seq_len, n_series, 1])
    seq_lengths = np.zeros([n_series])
    mask = np.ones([seq_len, n_series, 1])
    for s in range(n_series):
        # Unit numbers are 1-indexed; select the rows belonging to series s+1.
        this_seq = df[np.floor(df[:, 0]) == s + 1, 2:]
        this_seq_len = this_seq.shape[0]
        feature_data[0:this_seq_len, s, :] = this_seq
        seq_lengths[s] = this_seq_len
        # Zero out the mask beyond the end of the sequence (padding).
        mask[this_seq_len:, s, 0] = 0
        # Time to event counts down from this_seq_len - 1 to 0 (failure).
        times_to_event[0:this_seq_len, s, 0] = np.linspace(this_seq_len - 1, 0, this_seq_len)
    seq_lengths = seq_lengths.astype(int)
    # Keep only the window [start_at, truncate_at) along the time axis.
    feature_data = feature_data[start_at:truncate_at, :, :]
    times_to_event = times_to_event[start_at:truncate_at, :, :]
    seq_lengths[truncate_at < seq_lengths] = truncate_at
    seq_lengths = seq_lengths - start_at
    mask = mask[start_at:truncate_at, :, :]
    return feature_data, times_to_event, seq_lengths, mask, n_series, n_vars
def make_one_array(dflist):
    """Concatenate several CMAPSS files, re-indexing unit numbers so they
    stay unique across files (each file restarts its units at 1)."""
    for i in range(1, len(dflist)):
        # Offset by the running max so unit ids keep increasing.
        dflist[i][:, 0] = dflist[i - 1][:, 0].max() + dflist[i][:, 0]
    return np.concatenate(dflist)
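
# A minimal sketch (my addition, not in the original gist) of what the
# re-indexing does: two toy files whose unit numbers both start at 1 end up
# with globally unique ids after make_one_array. Values here are made up.
#
#   a = np.array([[1., 1.], [1., 2.], [2., 1.]])   # units 1-2
#   b = np.array([[1., 1.], [2., 1.]])             # units 1-2 again
#   merged = make_one_array([a, b])
#   # merged[:, 0] -> [1., 1., 2., 3., 4.]  (b's units became 3-4)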
def get_normalization_coef(feature_data, mask):
    """Per-feature mean and std over real (unpadded) timesteps only."""
    # Flatten (time, series, features) -> (time * series, features) and keep
    # only the rows where the mask is 1, so padding never skews the stats.
    reshaped_data = feature_data.reshape((-1, feature_data.shape[2]))[mask.reshape((-1, mask.shape[2])).flatten() == 1, :]
    return np.mean(reshaped_data, 0), np.std(reshaped_data, 0)
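
# Quick sanity check (my addition, not part of the original gist): with one
# timestep masked out as padding, its values must not affect the statistics.
#
#   x = np.arange(6.).reshape((3, 2, 1))        # (time=3, series=2, vars=1)
#   m = np.ones((3, 2, 1)); m[2, :, 0] = 0      # last timestep is padding
#   mu, sd = get_normalization_coef(x, m)
#   # mu is the mean of x[:2] only; the masked x[2] rows are ignored.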
# Data: https://github.com/hankroark/Turbofan-Engine-Degradation/tree/master/CMAPSSData
path = '/CMAPSSData/'
# Use only the datasets with a mixture of six different flight conditions
# (FD002 and FD004); FD001 and FD003 have a single operating condition.
train = [
    # np.loadtxt(path + 'train_FD001.txt'),
    np.loadtxt(path + 'train_FD002.txt'),
    # np.loadtxt(path + 'train_FD003.txt'),
    np.loadtxt(path + 'train_FD004.txt')
]
# rul = [
#     # np.loadtxt(path + 'RUL_FD001.txt'),
#     np.loadtxt(path + 'RUL_FD002.txt'),
#     # np.loadtxt(path + 'RUL_FD003.txt'),
#     np.loadtxt(path + 'RUL_FD004.txt')
# ]
# Don't use the 'test' data since it seems to come from another distribution.
# test = [
#     # np.loadtxt(path + 'test_FD001.txt'),
#     np.loadtxt(path + 'test_FD002.txt'),
#     # np.loadtxt(path + 'test_FD003.txt'),
#     np.loadtxt(path + 'test_FD004.txt')
# ]
# Merge the files into one array and shuffle the series.
train = make_one_array(train)

truncate_at = 382
start_at = 128
n = seq_len = truncate_at - start_at

random.seed(0)
np.random.seed(0)

feature_data, times_to_event, seq_lengths, mask, n_series, n_vars = load_challenge_data(train, start_at, truncate_at)

# Permute the series axis; the first third will serve as a held-out set.
new_idx = random.sample(range(n_series), n_series)
test_set_size = n_series // 3
feature_data = np.copy(feature_data[:, new_idx, :])
times_to_event = np.copy(times_to_event[:, new_idx, :])
seq_lengths = np.copy(seq_lengths[new_idx])
mask = np.copy(mask[:, new_idx, :])

# Compute normalization statistics on the training portion only
# (series test_set_size and onward), to avoid test-set leakage.
mean_normalize, std_normalize = get_normalization_coef(feature_data[:, test_set_size:, :], mask[:, test_set_size:, :])
del train

# Normalize; mean/std have shape (n_vars,) and broadcast over the time and
# series axes. Assumes no feature is constant on the training portion (std > 0).
feature_data = mask * (feature_data - mean_normalize) / std_normalize
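
# A minimal sketch (my addition; the gist stops at normalization) of how the
# held-out split implied by test_set_size could be taken: the first
# test_set_size shuffled series as test, the rest as train, matching the
# slice used above for the normalization statistics.
x_test, y_test, m_test = feature_data[:, :test_set_size, :], times_to_event[:, :test_set_size, :], mask[:, :test_set_size, :]
x_train, y_train, m_train = feature_data[:, test_set_size:, :], times_to_event[:, test_set_size:, :], mask[:, test_set_size:, :]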