Google Machine Learning Crash Course: Linear regression > taxi fare model training exercise
#@title Code - Load dependencies
# general
import io
# data
import numpy as np
import pandas as pd
# machine learning
import keras
# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
#@title Load the dataset
chicago_taxi_dataset = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/chicago_taxi_train.csv")
#@title Code - Read dataset
# Update the DataFrame to use specific columns. Taking a copy avoids a
# SettingWithCopyWarning when new columns are added later.
training_df = chicago_taxi_dataset[['TRIP_MILES', 'TRIP_SECONDS', 'FARE', 'COMPANY', 'PAYMENT_TYPE', 'TIP_RATE']].copy()
print('Read dataset completed successfully.')
print('Total number of rows: {0}\n\n'.format(len(training_df.index)))
training_df.head(200)
#@title Code - View dataset statistics
print('Total number of rows: {0}\n\n'.format(len(training_df.index)))
training_df.describe(include='all')
#@title Double-click or run to view answers about dataset statistics
answer = '''
What is the maximum fare? Answer: $159.25
What is the mean distance across all trips? Answer: 8.2895 miles
How many cab companies are in the dataset? Answer: 31
What is the most frequent payment type? Answer: Credit Card
Are any features missing data? Answer: No
'''
# You should be able to find the answers to the questions about the dataset
# by inspecting the table output after running the DataFrame describe method.
#
# Run this code cell to verify your answers.
# What is the maximum fare?
max_fare = training_df['FARE'].max()
print("What is the maximum fare? \t\t\t\tAnswer: ${fare:.2f}".format(fare = max_fare))
# What is the mean distance across all trips?
mean_distance = training_df['TRIP_MILES'].mean()
print("What is the mean distance across all trips? \t\tAnswer: {mean:.4f} miles".format(mean = mean_distance))
# How many cab companies are in the dataset?
num_unique_companies = training_df['COMPANY'].nunique()
print("How many cab companies are in the dataset? \t\tAnswer: {number}".format(number = num_unique_companies))
# What is the most frequent payment type?
most_freq_payment_type = training_df['PAYMENT_TYPE'].value_counts().idxmax()
print("What is the most frequent payment type? \t\tAnswer: {type}".format(type = most_freq_payment_type))
# Are any features missing data?
missing_values = training_df.isnull().sum().sum()
print("Are any features missing data? \t\t\t\tAnswer:", "No" if missing_values == 0 else "Yes")
#@title Code - View correlation matrix
training_df.corr(numeric_only = True)
#@title Double-click to view answers about the correlation matrix
# Which feature correlates most strongly to the label FARE?
# ---------------------------------------------------------
answer = '''
The feature with the strongest correlation to the FARE is TRIP_MILES.
As you might expect, TRIP_MILES looks like a good feature to start with to train
the model. Also, notice that the feature TRIP_SECONDS has a strong correlation
with fare too.
'''
print(answer)
# Which feature correlates least strongly to the label FARE?
# -----------------------------------------------------------
answer = '''The feature with the weakest correlation to the FARE is TIP_RATE.'''
print(answer)
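#@title Code - Verify answers about the correlation matrix
# A minimal sketch (not part of the original exercise) to check the answers
# above programmatically: sort each numeric feature's correlation with the
# FARE label. TRIP_MILES should be strongest and TIP_RATE weakest.
fare_corr = training_df.corr(numeric_only=True)['FARE'].drop('FARE')
print(fare_corr.sort_values(ascending=False))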
#@title Code - View pairplot
sns.pairplot(training_df, x_vars=["FARE", "TRIP_MILES", "TRIP_SECONDS"], y_vars=["FARE", "TRIP_MILES", "TRIP_SECONDS"])
#@title Define plotting functions
def make_plots(df, feature_names, label_name, model_output, sample_size=200):
  random_sample = df.sample(n=sample_size).copy()
  random_sample.reset_index()
  weights, bias, epochs, rmse = model_output

  is_2d_plot = len(feature_names) == 1
  model_plot_type = "scatter" if is_2d_plot else "surface"
  fig = make_subplots(rows=1, cols=2,
                      subplot_titles=("Loss Curve", "Model Plot"),
                      specs=[[{"type": "scatter"}, {"type": model_plot_type}]])

  plot_data(random_sample, feature_names, label_name, fig)
  plot_model(random_sample, feature_names, weights, bias, fig)
  plot_loss_curve(epochs, rmse, fig)

  fig.show()
  return

def plot_loss_curve(epochs, rmse, fig):
  curve = px.line(x=epochs, y=rmse)
  curve.update_traces(line_color='#ff0000', line_width=3)

  fig.append_trace(curve.data[0], row=1, col=1)
  fig.update_xaxes(title_text="Epoch", row=1, col=1)
  fig.update_yaxes(title_text="Root Mean Squared Error", row=1, col=1, range=[rmse.min()*0.8, rmse.max()])
  return

def plot_data(df, features, label, fig):
  if len(features) == 1:
    scatter = px.scatter(df, x=features[0], y=label)
  else:
    scatter = px.scatter_3d(df, x=features[0], y=features[1], z=label)

  fig.append_trace(scatter.data[0], row=1, col=2)
  if len(features) == 1:
    fig.update_xaxes(title_text=features[0], row=1, col=2)
    fig.update_yaxes(title_text=label, row=1, col=2)
  else:
    fig.update_layout(scene1=dict(xaxis_title=features[0], yaxis_title=features[1], zaxis_title=label))
  return

def plot_model(df, features, weights, bias, fig):
  df['FARE_PREDICTED'] = bias[0]
  for index, feature in enumerate(features):
    df['FARE_PREDICTED'] = df['FARE_PREDICTED'] + weights[index][0] * df[feature]

  if len(features) == 1:
    model = px.line(df, x=features[0], y='FARE_PREDICTED')
    model.update_traces(line_color='#ff0000', line_width=3)
  else:
    z_name, y_name = "FARE_PREDICTED", features[1]
    z = [df[z_name].min(), (df[z_name].max() - df[z_name].min()) / 2, df[z_name].max()]
    y = [df[y_name].min(), (df[y_name].max() - df[y_name].min()) / 2, df[y_name].max()]
    x = []
    for i in range(len(y)):
      x.append((z[i] - weights[1][0] * y[i] - bias[0]) / weights[0][0])

    plane = pd.DataFrame({'x': x, 'y': y, 'z': [z] * 3})
    light_yellow = [[0, '#89CFF0'], [1, '#FFDB58']]
    model = go.Figure(data=go.Surface(x=plane['x'], y=plane['y'], z=plane['z'],
                                      colorscale=light_yellow))

  fig.add_trace(model.data[0], row=1, col=2)
  return

def model_info(feature_names, label_name, model_output):
  weights = model_output[0]
  bias = model_output[1]

  nl = "\n"
  header = "-" * 80
  banner = header + nl + "|" + "MODEL INFO".center(78) + "|" + nl + header

  info = ""
  equation = label_name + " = "

  for index, feature in enumerate(feature_names):
    info = info + "Weight for feature[{}]: {:.3f}\n".format(feature, weights[index][0])
    equation = equation + "{:.3f} * {} + ".format(weights[index][0], feature)

  info = info + "Bias: {:.3f}\n".format(bias[0])
  equation = equation + "{:.3f}\n".format(bias[0])

  return banner + nl + info + nl + equation

print("SUCCESS: defining plotting functions complete.")
#@title Code - Define ML functions
def build_model(my_learning_rate, num_features):
  """Create and compile a simple linear regression model."""
  # Describe the topography of the model.
  # The topography of a simple linear regression model
  # is a single node in a single layer.
  inputs = keras.Input(shape=(num_features,))
  outputs = keras.layers.Dense(units=1)(inputs)
  model = keras.Model(inputs=inputs, outputs=outputs)

  # Compile the model topography into code that Keras can efficiently
  # execute. Configure training to minimize the model's mean squared error.
  model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[keras.metrics.RootMeanSquaredError()])

  return model
def train_model(model, df, features, label, epochs, batch_size):
  """Train the model by feeding it data."""
  # Feed the model the features and the label.
  # The model will train for the specified number of epochs.
  history = model.fit(x=features,
                      y=label,
                      batch_size=batch_size,
                      epochs=epochs)

  # Gather the trained model's weight and bias.
  trained_weight = model.get_weights()[0]
  trained_bias = model.get_weights()[1]

  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch

  # Isolate the error for each epoch.
  hist = pd.DataFrame(history.history)

  # To track the progression of training, we're going to take a snapshot
  # of the model's root mean squared error at each epoch.
  rmse = hist["root_mean_squared_error"]

  return trained_weight, trained_bias, epochs, rmse
def run_experiment(df, feature_names, label_name, learning_rate, epochs, batch_size):
  print('INFO: starting training experiment with features={} and label={}\n'.format(feature_names, label_name))

  num_features = len(feature_names)
  features = df.loc[:, feature_names].values
  label = df[label_name].values

  model = build_model(learning_rate, num_features)
  model_output = train_model(model, df, features, label, epochs, batch_size)

  print('\nSUCCESS: training experiment complete\n')
  print('{}'.format(model_info(feature_names, label_name, model_output)))
  make_plots(df, feature_names, label_name, model_output)

  return model

print("SUCCESS: defining linear regression functions complete.")
#@title Code - Experiment 1
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 50
# Specify the feature and the label.
features = ['TRIP_MILES']
label = 'FARE'
model_1 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)
#@title Double-click to view answers for training model with one feature
# How many epochs did it take to converge on the final model?
# -----------------------------------------------------------------------------
answer = """
Use the loss curve to see where the loss begins to level off during training.
With this set of hyperparameters:
learning_rate = 0.001
epochs = 20
batch_size = 50
it takes about 5 epochs for the training run to converge to the final model.
"""
print(answer)
# How well does the model fit the sample data?
# -----------------------------------------------------------------------------
answer = '''
It appears from the model plot that the model fits the sample data fairly well.
'''
print(answer)
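#@title Code - Estimate the convergence epoch
# A minimal sketch for estimating convergence from the per-epoch RMSE series
# (the `rmse` value returned by train_model above) instead of eyeballing the
# loss curve. The 1% relative-improvement threshold is an arbitrary
# assumption, not part of the exercise.
def find_convergence_epoch(rmse, tol=0.01):
  """Return the first epoch where RMSE improves by less than tol (relative)."""
  for i in range(1, len(rmse)):
    if (rmse.iloc[i - 1] - rmse.iloc[i]) < tol * rmse.iloc[i - 1]:
      return i + 1  # report as a 1-indexed epoch, matching the training log
  return len(rmse)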
#@title Experiment with hyperparameters
It is common with machine learning to run multiple experiments to find the best set of hyperparameters to train your model.
# -----------------------------------------------------------------------------
In this step, try varying the hyperparameters one by one with this set of experiments (one way to run them is sketched in the cell below):
Experiment 1: Increase the learning rate to 1 (batch size at 50).
Experiment 2: Decrease the learning rate to 0.0001 (batch size at 50).
Experiment 3: Increase the batch size to 500 (learning rate at 0.001).
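#@title Code - Run the hyperparameter experiments
# One way to run the three experiments above, assuming run_experiment and
# training_df are defined in the earlier cells. The variable names here are
# illustrative; each call retrains from scratch with one varied hyperparameter.
model_exp1 = run_experiment(training_df, ['TRIP_MILES'], 'FARE', 1.0, 20, 50)
model_exp2 = run_experiment(training_df, ['TRIP_MILES'], 'FARE', 0.0001, 20, 50)
model_exp3 = run_experiment(training_df, ['TRIP_MILES'], 'FARE', 0.001, 20, 500)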
#@title Double-click to view answers for hyperparameter experiments
# How did raising the learning rate impact your ability to train the model?
# -----------------------------------------------------------------------------
answer = """
When the learning rate is too high, the loss curve bounces around and does not
appear to be moving towards convergence with each iteration. Also, notice that
the predicted model does not fit the data very well. With a learning rate that
is too high, it is unlikely that you will be able to train a model with good
results.
"""
print(answer)
# How did lowering the learning rate impact your ability to train the model?
# -----------------------------------------------------------------------------
answer = '''
When the learning rate is too small, it may take longer for the loss curve to
converge. With a small learning rate the loss curve decreases slowly, but does
not show a dramatic drop or leveling off. With a small learning rate you could
increase the number of epochs so that your model will eventually converge, but
it will take longer.
'''
print(answer)
# Did changing the batch size affect your training results?
# -----------------------------------------------------------------------------
answer = '''
Increasing the batch size makes each epoch run faster, but as with the smaller
learning rate, the model does not converge with just 20 epochs. If you have
time, try increasing the number of epochs and eventually you should see the
model converge.
'''
print(answer)
#@title Code - Experiment 3
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 50
# Convert the trip duration from seconds to minutes so that both features
# are on a roughly comparable scale.
training_df.loc[:, 'TRIP_MINUTES'] = training_df['TRIP_SECONDS']/60
features = ['TRIP_MILES', 'TRIP_MINUTES']
label = 'FARE'
model_2 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)
#@title Double-click to view answers for training with two features
# Does the model with two features produce better results than one using a
# single feature?
# -----------------------------------------------------------------------------
answer = '''
To answer this question for your specific training runs, compare the RMSE for
each model. For example, if the RMSE for the model trained with one feature was
3.7457 and the RMSE for the model with two features is 3.4787, that means that
on average the model with two features makes predictions that are about $0.27
closer to the observed fare.
'''
print(answer)
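#@title Code - Compare RMSE across models
# A minimal sketch for the comparison described above: evaluate both trained
# models on the same data. Assumes model_1 and model_2 exist as trained in
# the earlier cells; evaluating on the training set mirrors the exercise.
loss_1, rmse_1 = model_1.evaluate(training_df[['TRIP_MILES']].values,
                                  training_df['FARE'].values, verbose=0)
loss_2, rmse_2 = model_2.evaluate(training_df[['TRIP_MILES', 'TRIP_MINUTES']].values,
                                  training_df['FARE'].values, verbose=0)
print('RMSE (one feature):  {:.4f}'.format(rmse_1))
print('RMSE (two features): {:.4f}'.format(rmse_2))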
# Does it make a difference if you use TRIP_SECONDS instead of TRIP_MILES?
# -----------------------------------------------------------------------------
answer = '''
When training a model with more than one feature, it is important that all
numeric values are roughly on the same scale. In this case, TRIP_SECONDS and
TRIP_MILES do not meet this criterion. The mean value for TRIP_MILES is 8.3 and
the mean for TRIP_SECONDS is 1320; that is two orders of magnitude difference.
Converting the trip duration to minutes helps during training because it puts
values for both features on a more comparable scale. Of course, this is not the
only way to scale values before training, but you will learn about that in
another module.
'''
print(answer)
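#@title Code - Z-score scaling sketch
# The answer above notes that unit conversion is not the only way to put
# features on a comparable scale. A minimal sketch of one common alternative,
# z-score standardization (not part of the original exercise): each feature
# is rescaled to mean 0 and standard deviation 1.
scaled_df = training_df.copy()
for col in ['TRIP_MILES', 'TRIP_SECONDS']:
  scaled_df[col] = (scaled_df[col] - scaled_df[col].mean()) / scaled_df[col].std()
print(scaled_df[['TRIP_MILES', 'TRIP_SECONDS']].describe())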
# How well do you think the model comes to the ground truth fare calculation for
# Chicago taxi trips?
# -----------------------------------------------------------------------------
answer = '''
In reality, Chicago taxi cabs use a documented formula to determine cab fares.
For a single passenger paying cash, the fare is calculated like this:
FARE = 2.25 * TRIP_MILES + 0.12 * TRIP_MINUTES + 3.25
Typically with machine learning problems you would not know the 'correct'
formula, but in this case you can use this knowledge to evaluate your model. Take a
look at your model output (the weights and bias) and determine how well it
matches the ground truth fare calculation. You should find that the model is
roughly close to this formula.
'''
print(answer)
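#@title Code - Compare the model to the documented fare formula
# A sketch that applies the documented formula from the answer above to a few
# random trips and compares it with the model's predictions. Assumes model_2
# and the TRIP_MINUTES column from the earlier cells.
sample = training_df[['TRIP_MILES', 'TRIP_MINUTES']].sample(n=5)
formula_fare = 2.25 * sample['TRIP_MILES'] + 0.12 * sample['TRIP_MINUTES'] + 3.25
model_fare = model_2.predict(sample.values, verbose=0).flatten()
for f, m in zip(formula_fare, model_fare):
  print('formula: ${:.2f}\tmodel: ${:.2f}'.format(f, m))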
#@title Code - Define functions to make predictions
def format_currency(x):
  return "${:.2f}".format(x)

def build_batch(df, batch_size):
  batch = df.sample(n=batch_size).copy()
  batch.set_index(np.arange(batch_size), inplace=True)
  return batch

def predict_fare(model, df, features, label, batch_size=50):
  batch = build_batch(df, batch_size)
  predicted_values = model.predict_on_batch(x=batch.loc[:, features].values)

  data = {"PREDICTED_FARE": [], "OBSERVED_FARE": [], "L1_LOSS": [],
          features[0]: [], features[1]: []}
  for i in range(batch_size):
    predicted = predicted_values[i][0]
    observed = batch.at[i, label]
    data["PREDICTED_FARE"].append(format_currency(predicted))
    data["OBSERVED_FARE"].append(format_currency(observed))
    data["L1_LOSS"].append(format_currency(abs(observed - predicted)))
    data[features[0]].append(batch.at[i, features[0]])
    data[features[1]].append("{:.2f}".format(batch.at[i, features[1]]))

  output_df = pd.DataFrame(data)
  return output_df

def show_predictions(output):
  header = "-" * 80
  banner = header + "\n" + "|" + "PREDICTIONS".center(78) + "|" + "\n" + header
  print(banner)
  print(output)
  return
#@title Code - Make predictions
output = predict_fare(model_2, training_df, features, label)
show_predictions(output)
#@title Double-click to view answers for validate model
# How close is the predicted value to the label value?
# -----------------------------------------------------------------------------
answer = '''
Based on a random sampling of examples, the model seems to do pretty well
predicting the fare for a taxi ride. Most of the predicted values do not vary
significantly from the observed value. You should be able to see this by looking
at the column L1_LOSS = |observed - predicted|.
'''
print(answer)
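#@title Code - Check the average L1 loss numerically
# A minimal sketch that summarizes prediction quality as a mean absolute
# error over a fresh random batch, rather than reading the L1_LOSS column
# by eye. Assumes model_2, training_df, and build_batch from the cells above.
batch = build_batch(training_df, 50)
preds = model_2.predict_on_batch(batch[['TRIP_MILES', 'TRIP_MINUTES']].values).flatten()
mean_l1 = np.mean(np.abs(batch['FARE'].values - preds))
print('Mean L1 loss over the batch: ${:.2f}'.format(mean_l1))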
INFO: starting training experiment with features=['TRIP_MILES', 'TRIP_MINUTES'] and label=FARE
Epoch 1/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3884.8274 - root_mean_squared_error: 62.2386
Epoch 2/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 1510.9357 - root_mean_squared_error: 38.7442
Epoch 3/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 273.6273 - root_mean_squared_error: 16.3169
Epoch 4/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 24.8084 - root_mean_squared_error: 4.9753
Epoch 5/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 15.6657 - root_mean_squared_error: 3.9543
Epoch 6/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 12.2236 - root_mean_squared_error: 3.4927
Epoch 7/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 11.2817 - root_mean_squared_error: 3.3472
Epoch 8/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.3543 - root_mean_squared_error: 3.5082
Epoch 9/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.9035 - root_mean_squared_error: 3.5800
Epoch 10/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 11.3703 - root_mean_squared_error: 3.3635
Epoch 11/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.7864 - root_mean_squared_error: 3.5701
Epoch 12/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.7018 - root_mean_squared_error: 3.5578
Epoch 13/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.3804 - root_mean_squared_error: 3.5061
Epoch 14/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 11.4949 - root_mean_squared_error: 3.3811
Epoch 15/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 13.1091 - root_mean_squared_error: 3.6081
Epoch 16/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 11.4416 - root_mean_squared_error: 3.3735
Epoch 17/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 10.9564 - root_mean_squared_error: 3.3044
Epoch 18/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.4541 - root_mean_squared_error: 3.5195
Epoch 19/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 11.0418 - root_mean_squared_error: 3.3155
Epoch 20/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.2511 - root_mean_squared_error: 3.4926
SUCCESS: training experiment complete
--------------------------------------------------------------------------------
| MODEL INFO |
--------------------------------------------------------------------------------
Weight for feature[TRIP_MILES]: 2.030
Weight for feature[TRIP_MINUTES]: 0.142
Bias: 3.828
FARE = 2.030 * TRIP_MILES + 0.142 * TRIP_MINUTES + 3.828
--------------------------------------------------------------------------------
| PREDICTIONS |
--------------------------------------------------------------------------------
PREDICTED_FARE OBSERVED_FARE L1_LOSS TRIP_MILES TRIP_MINUTES
0 $18.33 $18.25 $0.08 4.90 32.00
1 $45.11 $43.25 $1.86 17.37 42.30
2 $31.50 $31.75 $0.25 11.98 23.52
3 $27.43 $27.00 $0.43 9.77 26.47
4 $74.23 $76.75 $2.52 31.06 51.58
5 $7.25 $9.00 $1.75 1.26 6.03
6 $5.96 $5.75 $0.21 0.70 5.00
7 $6.95 $9.00 $2.05 1.00 7.65
8 $33.96 $34.00 $0.04 12.60 32.00
9 $27.32 $26.00 $1.32 9.21 33.63
10 $44.20 $43.25 $0.95 17.54 33.48
11 $45.33 $45.00 $0.33 18.20 32.00
12 $17.74 $17.25 $0.49 5.20 23.58
13 $30.70 $30.50 $0.20 11.20 29.00
14 $43.36 $43.50 $0.14 17.30 31.00
15 $19.37 $19.25 $0.12 6.81 12.03
16 $17.82 $17.25 $0.57 5.70 17.00
17 $16.05 $15.75 $0.30 4.90 16.00
18 $9.01 $8.25 $0.76 2.20 5.00
19 $6.10 $5.75 $0.35 0.79 4.68
20 $17.17 $17.50 $0.33 5.10 21.00
21 $6.34 $5.75 $0.59 0.85 5.53
22 $41.13 $43.80 $2.67 14.94 48.95
23 $40.63 $41.25 $0.62 16.74 19.78
24 $46.86 $46.50 $0.36 17.90 47.00
25 $7.06 $6.75 $0.31 1.10 7.00
26 $21.07 $20.50 $0.57 7.30 17.00
27 $7.20 $6.75 $0.45 1.13 7.57
28 $19.21 $19.50 $0.29 6.13 20.65
29 $41.56 $40.50 $1.06 15.50 44.00
30 $31.48 $31.75 $0.27 11.62 28.50
31 $24.60 $24.25 $0.35 8.90 19.00
32 $17.99 $18.00 $0.01 5.66 18.78
33 $22.49 $22.50 $0.01 7.70 21.27
34 $22.16 $21.00 $1.16 6.44 36.92
35 $9.50 $9.00 $0.50 2.09 10.05
36 $24.32 $24.50 $0.18 8.50 22.70
37 $30.02 $30.25 $0.23 11.50 20.00
38 $43.81 $45.00 $1.19 17.80 27.02
39 $9.08 $9.00 $0.08 1.80 11.23
40 $5.62 $5.25 $0.37 0.60 4.00
41 $6.17 $5.50 $0.67 0.88 3.92
42 $14.91 $14.50 $0.41 4.22 17.68
43 $6.36 $5.25 $1.11 0.84 5.82
44 $15.60 $14.93 $0.67 4.74 15.12
45 $28.72 $36.00 $7.28 8.06 59.92
46 $21.79 $20.50 $1.29 7.26 22.63
47 $12.50 $12.00 $0.50 3.50 11.00
48 $9.39 $9.00 $0.39 2.27 6.67
49 $16.36 $15.75 $0.61 5.40 11.00