Skip to content

Instantly share code, notes, and snippets.

@jeremy-rutman
Last active January 8, 2020 15:54
Show Gist options
  • Select an option

  • Save jeremy-rutman/1df96f28efd0a0a36a6816b2e09c8baa to your computer and use it in GitHub Desktop.

Select an option

Save jeremy-rutman/1df96f28efd0a0a36a6816b2e09c8baa to your computer and use it in GitHub Desktop.
geyser data analysis
# code for https://unclejerry9466728.wordpress.com/2019/12/28/wiser-geyser/
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import torch
import numpy as np
import pandas as pd
df = pd.read_csv('/home/jeremy/PycharmProjects/geysers/Old_Faithful_eruptions.tsv',sep='\t',dtype=str)
df = df.head(15000)
print(df.tail().T)
print(df.columns)
print(len(df))
print(df.describe().T)
def clock_to_sec(intime):
clocktime = intime.replace('min', ':').replace('m', ':').replace('sec', '').replace('+', '').replace('~', '').replace('s', '').replace('<', '').replace('?', '')
parts = clocktime.split(':')
secs_from_minutes = float(parts[0])*60
if len(parts)>1 and parts[1]!='':
secs = int(parts[1])
secs_from_minutes+=secs
# print('in: {} clock: {} parts: {} secs: {}'.format(intime,clocktime,parts,secs_from_minutes))
return secs_from_minutes
df['eruption_time_int']= df['eruption_time_epoch'].astype('int')
df['duration_n']= df['duration'].astype('str')
df['duration_n']= df.apply(lambda r:clock_to_sec(r['duration_n']),axis=1)
df['duration_n']= df.duration_n.astype('float')
df['duration_previous']= df.duration_n.shift(periods=1)
df['duration_previous2']= df.duration_n.shift(periods=2)
df['duration_previous3']= df.duration_n.shift(periods=3)
df['dt1']=df['eruption_time_int']-df['eruption_time_int'].shift(periods=1)
df['dt2']=df['eruption_time_int'].shift(periods=1)-df['eruption_time_int'].shift(periods=2)
df['dt3']=df['eruption_time_int'].shift(periods=2)-df['eruption_time_int'].shift(periods=3)
df['dt4']=df['eruption_time_int'].shift(periods=3)-df['eruption_time_int'].shift(periods=4)
#remove missing vals, large vals (presumably cases where eruption was skipped), 0 vals (not sure why these occur)
print(len(df))
df = df[(df.dt1<7000)&(df.dt1>1000)&(df.dt2<7000)&(df.dt2>1000)&(df.dt3<7000)&(df.dt3>1000)&(df.dt4<7000)&(df.dt4>1000)]
print(len(df))
df = df[(df.dt1.notnull()) & (df.dt2.notnull()) & (df.dt3.notnull()) & (df.dt4.notnull())]
print(len(df))
df = df[(df.duration_previous.notnull())&(df.duration_previous2.notnull())&(df.duration_previous3.notnull())]
print(len(df))
max_dt = max(df.dt1.max(),df.dt2.max())
max_dt = max(max_dt,df.dt3.max())
max_dt = max(max_dt,df.dt4.max())
max_duration = max(df.duration_previous.max(),df.duration_previous2.max())
max_duration = max(max_duration,df.duration_previous3.max())
df['dt1_norm'] = df.dt1/max_dt
df['dt2_norm'] = df.dt2/max_dt
df['dt3_norm'] = df.dt3/max_dt
df['dt4_norm'] = df.dt4/max_dt
df['duration_previous_norm'] = df.duration_previous/max_duration
df['duration_previous_norm2'] = df.duration_previous2/max_duration
df['duration_previous_norm3'] = df.duration_previous3/max_duration
print('maxdt {} maxduration {}'.format(max_dt,max_duration))
#df['previous_duration']=df['duration'].shift(periods=1)
print(df.tail(3).T)
# plot of dt1 vs dt2
plt.plot(df.dt1,df.dt2,'b. ')
plt.xlim(0,11000)
plt.ylim(0,11000)
plt.xlabel('interval n[s]')
plt.ylabel('interval n-1[s]')
plt.title('old faithful eruption intervals')
plt.show()
## histogram of dt
plt.hist(df.dt1,bins=200)
#plt.xlim(2000,6000)
#plt.yscale('log')
#plt.xscale('log')
plt.xlabel('interval[s]')
plt.ylabel('N')
plt.title('old faithful eruption interval hist')
plt.show()
# 3d plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
plt.xlabel('interval n[s]')
plt.ylabel('interval n-1[s]')
ax.set_zlabel('duration[s]')
#plt.zlabel('duration[s]')
plt.title('old faithful eruption intervals\ncolor from duration of preceding event')
ax.scatter(df.dt1, df.dt2, df.duration_previous, c=df.duration_previous)
n=0
for phi in range(60,-60,-10):
for theta in range(90,450,5):
# input('rtc')
ax.view_init(phi,theta)
#
# plt.scatter(df.dt1,df.dt2,df.duration_previous,c=df.duration_previous)
#plt.show()
plt.draw()
plt.pause(0.01)
plt.savefig('{:03d}.jpg'.format(n),dpi=500) #,bbox_inches='tight'
n+=1
## avi from frames
## ffmpeg -r 1/5 -start_number 0 -i "E:\images\01\padlock%3d.png" -c:v libx264 -vf "fps=25,format=yuv420p" e:\out.mp4.
## ffmpeg -framerate 1 -pattern_type glob -i '*.png' -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
## mp4 to gif
## ffmpeg -i opengl-rotating-triangle.mp4 -r 15 -vf scale=512:-1 -ss 00:00:03 -to 00:00:06 opengl-rotating-triangle.gif
## handmade NN
print(df.describe())
features = df[['dt2_norm' , 'duration_previous_norm']].to_numpy()
results = df[['dt1_norm']].to_numpy()
n_train = len(df) - 100
features_train = features[0:-n_train]
features_test = features[-n_train:]
results_train = results[0:-n_train]
results_test = results[-n_train:]
print(features_train.shape,features_test.shape,results_train.shape,results_test.shape)
## 1 hidden layer , no optimize
import torch
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = n_train, 2, 2, 1
# Create random Tensors to hold inputs and outputs
# x = torch.randn(N, D_in)
# y = torch.randn(N, D_out)
#.float()
x = torch.from_numpy(features_train).float()
y = torch.from_numpy(results_train).float()
# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H),
torch.nn.ReLU(),
torch.nn.Linear(H, D_out)
)
# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4
#print('{} params'.format(len(model.parameters())))
print(1000*100+1000+100*10+10)
for t in range(1000):
# Forward pass: compute predicted y by passing x to the model. Module objects
# override the __call__ operator so you can call them like functions. When
# doing so you pass a Tensor of input data to the Module and it produces
# a Tensor of output data.
y_pred = model(x)
# Compute and print loss. We pass Tensors containing the predicted and true
# values of y, and the loss function returns a Tensor containing the
# loss.
loss = loss_fn(y_pred, y)
# if t % 10 == 9:
print(t, loss.item())
# Zero the gradients before running the backward pass.
model.zero_grad()
# Backward pass: compute gradient of the loss with respect to all the learnable
# parameters of the model. Internally, the parameters of each Module are stored
# in Tensors with requires_grad=True, so this call will compute gradients for
# all learnable parameters in the model.
loss.backward()
# Update the weights using gradient descent. Each parameter is a Tensor, so
# we can access its gradients like we did before.
with torch.no_grad():
for param in model.parameters():
param -= learning_rate * param.grad
x = torch.from_numpy(features_test).float()
y = torch.from_numpy(results_test).float()
y_pred = model(x)
loss = loss_fn(y_pred, y).item()
print(loss,loss/y.shape[0])
print(x[0:5])
print(y[0:5])
print(y_pred[0:5])
# df['dt1_norm'] = df.dt1/df.dt1.max()
# df['dt2_norm'] = df.dt2/df.dt2.max()
# df['duration_previous_norm'] = df.duration_previous/df.duration_previous.max()
y_denorm = y*df.dt1.max()
y_pred_denorm = y_pred*df.dt1.max()
y_vec = y_denorm.data.numpy()[:,0]
y_pred_vec = y_pred_denorm.data.numpy()[:,0]
print(y_denorm[0:5])
print(y[0:5])
print(y_pred[0:5])
loss_by_hand = np.dot(y_vec-y_pred_vec,y_vec-y_pred_vec)
loss_fraction = np.dot((y_vec-y_pred_vec)/y_vec,(y_vec-y_pred_vec)/y_vec)
print('lossbyhabd',loss_by_hand,np.sqrt(loss_by_hand/y_vec.shape[0]))
print('losspercent',loss_fraction,np.sqrt(loss_fraction/y_vec.shape[0]))
plt.plot(y_vec,y_pred_vec,'b. ')
plt.xlim([0,6000])
plt.ylim([0,6000])
plt.xlabel('actual interval')
plt.xlabel('predicted interval')
plt.title('2 hidden neurons, sgd by hand')
plt.show()
## use dt1,dt2, dur1, dur2 , 2 hidden layer with optimizer
print('hidden layer using adam')
#N, D_in, H1,D_out = n_train, 4, 20, 1
features = df[['dt2_norm' ,'dt3_norm','dt4_norm','duration_previous_norm','duration_previous_norm2','duration_previous_norm3']].to_numpy()
results = df[['dt1_norm']].to_numpy()
n_inputs = features.shape[1]
print('{} features'.format(n_inputs))
#N, D_in, H1,H2,H3, D_out = n_train, n_inputs, 20, 1
N, D_in, H1,H2, D_out = n_train, n_inputs, 30,30, 1
features_train = features
results_train = results
features_test = features
results_test = results
x = torch.from_numpy(features_train).float()
y = torch.from_numpy(results_train).float()
# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
torch.nn.Linear(D_in, H1),
torch.nn.ReLU(),
torch.nn.Linear(H1, H2),
torch.nn.ReLU(),
# torch.nn.Linear(H2, H3),
# torch.nn.ReLU(),
# torch.nn.Linear(H3, D_out),
torch.nn.Linear(H2, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')
# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
n_steps = 100000
for t in range(n_steps):
# Forward pass: compute predicted y by passing x to the model.
y_pred = model(x)
# Compute and print loss.
loss = loss_fn(y_pred, y)
if t % 1000 == 999:
print(t, loss.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
x = torch.from_numpy(features_test).float()
y = torch.from_numpy(results_test).float()
y_pred = model(x)
loss = loss_fn(y_pred, y).item()
print(model.parameters())
print('loss',loss,loss/y.shape[0])
print('x',x[0:5])
print('y',y[0:5])
print('ypred',y_pred[0:5])
y_orig = df.dt1.to_numpy()
y_denorm = y*df.dt1.max()
y_pred_denorm = y_pred*df.dt1.max()
print('ydenorm shape {} y_pred shape {}'.format(y_denorm.shape,y_pred_denorm.shape))
y_vec = y_denorm.data.numpy()[:,0]
y_pred_vec = y_pred_denorm.data.numpy()[:,0]
print('y denorm',y_denorm[0:5])
print('y orig',y_orig[0:5])
print('y pred denorm',y_pred_denorm[0:5])
print('yshape',y.shape)
print(y_pred[0:5])
loss2 = loss_fn(y_pred_denorm, y_denorm).item()
print('loss2',loss2,loss2/y.shape[0])
loss_by_hand = np.dot(y_vec-y_pred_vec,y_vec-y_pred_vec)
loss_fraction = np.dot((y_vec-y_pred_vec)/y_vec,(y_vec-y_pred_vec)/y_vec)
print('lossbyhabd',loss_by_hand,np.sqrt(loss_by_hand/y_vec.shape[0]))
print('losspercent',loss_fraction,np.sqrt(loss_fraction/y_vec.shape[0]))
plt.plot(y_vec,y_pred_vec,'b. ')
plt.xlim([0,6000])
plt.ylim([0,6000])
plt.xlabel('actual interval')
plt.ylabel('predicted interval')
#plt.title('{}/{}/{}/{}/1, adam optimizer {} steps lr={} \nrms error {}s'.format(D_in, H1,H2,H3,n_steps,learning_rate,np.sqrt(loss_by_hand/y_vec.shape[0])))
#plt.title('{}/{}/{}/1, adam optimizer {} steps lr={} \nrms error {}s'.format(D_in, H1,H2,n_steps,learning_rate,np.sqrt(loss_by_hand/y_vec.shape[0])))
plt.title('{}/{}/1 {} features, adam optimizer {} steps lr={} \nrms error {}s'.format(D_in, H1,n_inputs,n_steps,learning_rate,np.sqrt(loss_by_hand/y_vec.shape[0])))
plt.show()
dy = y_vec - y_pred_vec
plt.hist(dy,bins=100)
plt.title('error historgram')
plt.xlabel('actual-predicted interval[s]')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment