Google Machine Learning Crash Course: Linear regression > taxi fare model training exercise
#@title Code - Load dependencies
# general
import io
# data
import numpy as np
import pandas as pd
# machine learning
import keras
# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
#@title Load the dataset
chicago_taxi_dataset = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/chicago_taxi_train.csv")
#@title Code - Read dataset
# Update the DataFrame to use specific columns. Taking a copy avoids a
# SettingWithCopyWarning when new columns are added later.
training_df = chicago_taxi_dataset[['TRIP_MILES', 'TRIP_SECONDS', 'FARE', 'COMPANY', 'PAYMENT_TYPE', 'TIP_RATE']].copy()
print('Read dataset completed successfully.')
print('Total number of rows: {0}\n\n'.format(len(training_df.index)))
training_df.head(200)
#@title Code - View dataset statistics
print('Total number of rows: {0}\n\n'.format(len(training_df.index)))
training_df.describe(include='all')
#@title Double-click or run to view answers about dataset statistics
answer = '''
What is the maximum fare? Answer: $159.25
What is the mean distance across all trips? Answer: 8.2895 miles
How many cab companies are in the dataset? Answer: 31
What is the most frequent payment type? Answer: Credit Card
Are any features missing data? Answer: No
'''
# You should be able to find the answers to the questions about the dataset
# by inspecting the table output after running the DataFrame describe method.
#
# Run this code cell to verify your answers.
# What is the maximum fare?
max_fare = training_df['FARE'].max()
print("What is the maximum fare? \t\t\t\tAnswer: ${fare:.2f}".format(fare = max_fare))
# What is the mean distance across all trips?
mean_distance = training_df['TRIP_MILES'].mean()
print("What is the mean distance across all trips? \t\tAnswer: {mean:.4f} miles".format(mean = mean_distance))
# How many cab companies are in the dataset?
num_unique_companies = training_df['COMPANY'].nunique()
print("How many cab companies are in the dataset? \t\tAnswer: {number}".format(number = num_unique_companies))
# What is the most frequent payment type?
most_freq_payment_type = training_df['PAYMENT_TYPE'].value_counts().idxmax()
print("What is the most frequent payment type? \t\tAnswer: {type}".format(type = most_freq_payment_type))
# Are any features missing data?
missing_values = training_df.isnull().sum().sum()
print("Are any features missing data? \t\t\t\tAnswer:", "No" if missing_values == 0 else "Yes")
#@title Code - View correlation matrix
training_df.corr(numeric_only = True)
#@title Double-click to view answers about the correlation matrix
# Which feature correlates most strongly to the label FARE?
# ---------------------------------------------------------
answer = '''
The feature with the strongest correlation to the FARE is TRIP_MILES.
As you might expect, TRIP_MILES looks like a good feature to start with to train
the model. Also, notice that the feature TRIP_SECONDS has a strong correlation
with fare too.
'''
print(answer)
# Which feature correlates least strongly to the label FARE?
# -----------------------------------------------------------
answer = '''The feature with the weakest correlation to the FARE is TIP_RATE.'''
print(answer)
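#@title Code - Verify answers about the correlation matrix
# A minimal sketch (not part of the original exercise) to check the answers
# above programmatically: sort each numeric feature's correlation with the
# FARE label. TRIP_MILES should be strongest and TIP_RATE weakest.
fare_corr = training_df.corr(numeric_only=True)['FARE'].drop('FARE')
print(fare_corr.sort_values(ascending=False))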
#@title Code - View pairplot
sns.pairplot(training_df, x_vars=["FARE", "TRIP_MILES", "TRIP_SECONDS"], y_vars=["FARE", "TRIP_MILES", "TRIP_SECONDS"])
#@title Define plotting functions
def make_plots(df, feature_names, label_name, model_output, sample_size=200):
  random_sample = df.sample(n=sample_size).copy()
  random_sample.reset_index()
  weights, bias, epochs, rmse = model_output

  is_2d_plot = len(feature_names) == 1
  model_plot_type = "scatter" if is_2d_plot else "surface"
  fig = make_subplots(rows=1, cols=2,
                      subplot_titles=("Loss Curve", "Model Plot"),
                      specs=[[{"type": "scatter"}, {"type": model_plot_type}]])

  plot_data(random_sample, feature_names, label_name, fig)
  plot_model(random_sample, feature_names, weights, bias, fig)
  plot_loss_curve(epochs, rmse, fig)

  fig.show()
  return

def plot_loss_curve(epochs, rmse, fig):
  curve = px.line(x=epochs, y=rmse)
  curve.update_traces(line_color='#ff0000', line_width=3)

  fig.append_trace(curve.data[0], row=1, col=1)
  fig.update_xaxes(title_text="Epoch", row=1, col=1)
  fig.update_yaxes(title_text="Root Mean Squared Error", row=1, col=1, range=[rmse.min()*0.8, rmse.max()])
  return

def plot_data(df, features, label, fig):
  if len(features) == 1:
    scatter = px.scatter(df, x=features[0], y=label)
  else:
    scatter = px.scatter_3d(df, x=features[0], y=features[1], z=label)

  fig.append_trace(scatter.data[0], row=1, col=2)
  if len(features) == 1:
    fig.update_xaxes(title_text=features[0], row=1, col=2)
    fig.update_yaxes(title_text=label, row=1, col=2)
  else:
    fig.update_layout(scene1=dict(xaxis_title=features[0], yaxis_title=features[1], zaxis_title=label))
  return

def plot_model(df, features, weights, bias, fig):
  df['FARE_PREDICTED'] = bias[0]
  for index, feature in enumerate(features):
    df['FARE_PREDICTED'] = df['FARE_PREDICTED'] + weights[index][0] * df[feature]

  if len(features) == 1:
    model = px.line(df, x=features[0], y='FARE_PREDICTED')
    model.update_traces(line_color='#ff0000', line_width=3)
  else:
    z_name, y_name = "FARE_PREDICTED", features[1]
    z = [df[z_name].min(), (df[z_name].max() - df[z_name].min()) / 2, df[z_name].max()]
    y = [df[y_name].min(), (df[y_name].max() - df[y_name].min()) / 2, df[y_name].max()]
    x = []
    for i in range(len(y)):
      x.append((z[i] - weights[1][0] * y[i] - bias[0]) / weights[0][0])

    plane = pd.DataFrame({'x': x, 'y': y, 'z': [z] * 3})
    light_yellow = [[0, '#89CFF0'], [1, '#FFDB58']]
    model = go.Figure(data=go.Surface(x=plane['x'], y=plane['y'], z=plane['z'],
                                      colorscale=light_yellow))

  fig.add_trace(model.data[0], row=1, col=2)
  return

def model_info(feature_names, label_name, model_output):
  weights = model_output[0]
  bias = model_output[1]

  nl = "\n"
  header = "-" * 80
  banner = header + nl + "|" + "MODEL INFO".center(78) + "|" + nl + header

  info = ""
  equation = label_name + " = "

  for index, feature in enumerate(feature_names):
    info = info + "Weight for feature[{}]: {:.3f}\n".format(feature, weights[index][0])
    equation = equation + "{:.3f} * {} + ".format(weights[index][0], feature)

  info = info + "Bias: {:.3f}\n".format(bias[0])
  equation = equation + "{:.3f}\n".format(bias[0])

  return banner + nl + info + nl + equation

print("SUCCESS: defining plotting functions complete.")
#@title Code - Define ML functions
def build_model(my_learning_rate, num_features):
  """Create and compile a simple linear regression model."""
  # Describe the topography of the model.
  # The topography of a simple linear regression model
  # is a single node in a single layer.
  inputs = keras.Input(shape=(num_features,))
  outputs = keras.layers.Dense(units=1)(inputs)
  model = keras.Model(inputs=inputs, outputs=outputs)

  # Compile the model topography into code that Keras can efficiently
  # execute. Configure training to minimize the model's mean squared error.
  model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[keras.metrics.RootMeanSquaredError()])

  return model
def train_model(model, df, features, label, epochs, batch_size):
  """Train the model by feeding it data."""
  # Feed the model the features and the label.
  # The model will train for the specified number of epochs.
  history = model.fit(x=features,
                      y=label,
                      batch_size=batch_size,
                      epochs=epochs)

  # Gather the trained model's weight and bias.
  trained_weight = model.get_weights()[0]
  trained_bias = model.get_weights()[1]

  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch

  # Isolate the error for each epoch.
  hist = pd.DataFrame(history.history)

  # To track the progression of training, we're going to take a snapshot
  # of the model's root mean squared error at each epoch.
  rmse = hist["root_mean_squared_error"]

  return trained_weight, trained_bias, epochs, rmse
def run_experiment(df, feature_names, label_name, learning_rate, epochs, batch_size):
  print('INFO: starting training experiment with features={} and label={}\n'.format(feature_names, label_name))

  num_features = len(feature_names)
  features = df.loc[:, feature_names].values
  label = df[label_name].values

  model = build_model(learning_rate, num_features)
  model_output = train_model(model, df, features, label, epochs, batch_size)

  print('\nSUCCESS: training experiment complete\n')
  print('{}'.format(model_info(feature_names, label_name, model_output)))
  make_plots(df, feature_names, label_name, model_output)

  return model

print("SUCCESS: defining linear regression functions complete.")
#@title Code - Experiment 1
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 50
# Specify the feature and the label.
features = ['TRIP_MILES']
label = 'FARE'
model_1 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)
#@title Double-click to view answers for training model with one feature
# How many epochs did it take to converge on the final model?
# -----------------------------------------------------------------------------
answer = """
Use the loss curve to see where the loss begins to level off during training.
With this set of hyperparameters:
learning_rate = 0.001
epochs = 20
batch_size = 50
it takes about 5 epochs for the training run to converge to the final model.
"""
print(answer)
# How well does the model fit the sample data?
# -----------------------------------------------------------------------------
answer = '''
It appears from the model plot that the model fits the sample data fairly well.
'''
print(answer)
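#@title Code - Estimate the convergence epoch
# A minimal sketch for estimating convergence from the per-epoch RMSE series
# (the `rmse` value returned by train_model above) instead of eyeballing the
# loss curve. The 1% relative-improvement threshold is an arbitrary
# assumption, not part of the exercise.
def find_convergence_epoch(rmse, tol=0.01):
  """Return the first epoch where RMSE improves by less than tol (relative)."""
  for i in range(1, len(rmse)):
    if (rmse.iloc[i - 1] - rmse.iloc[i]) < tol * rmse.iloc[i - 1]:
      return i + 1  # report as a 1-indexed epoch, matching the training log
  return len(rmse)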
#@title Experiment with hyperparameters
It is common with machine learning to run multiple experiments to find the best set of hyperparameters to train your model.
# -----------------------------------------------------------------------------
In this step, try varying the hyperparameters one by one with this set of experiments (one way to run them is sketched in the cell below):
Experiment 1: Increase the learning rate to 1 (batch size at 50).
Experiment 2: Decrease the learning rate to 0.0001 (batch size at 50).
Experiment 3: Increase the batch size to 500 (learning rate at 0.001).
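#@title Code - Run the hyperparameter experiments
# One way to run the three experiments above, assuming run_experiment and
# training_df are defined in the earlier cells. The variable names here are
# illustrative; each call retrains from scratch with one varied hyperparameter.
model_exp1 = run_experiment(training_df, ['TRIP_MILES'], 'FARE', 1.0, 20, 50)
model_exp2 = run_experiment(training_df, ['TRIP_MILES'], 'FARE', 0.0001, 20, 50)
model_exp3 = run_experiment(training_df, ['TRIP_MILES'], 'FARE', 0.001, 20, 500)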
#@title Double-click to view answers for hyperparameter experiments
# How did raising the learning rate impact your ability to train the model?
# -----------------------------------------------------------------------------
answer = """
When the learning rate is too high, the loss curve bounces around and does not
appear to be moving towards convergence with each iteration. Also, notice that
the predicted model does not fit the data very well. With a learning rate that
is too high, it is unlikely that you will be able to train a model with good
results.
"""
print(answer)
# How did lowering the learning rate impact your ability to train the model?
# -----------------------------------------------------------------------------
answer = '''
When the learning rate is too small, it may take longer for the loss curve to
converge. With a small learning rate the loss curve decreases slowly, but does
not show a dramatic drop or leveling off. With a small learning rate you could
increase the number of epochs so that your model will eventually converge, but
it will take longer.
'''
print(answer)
# Did changing the batch size affect your training results?
# -----------------------------------------------------------------------------
answer = '''
Increasing the batch size makes each epoch run faster, but as with the smaller
learning rate, the model does not converge with just 20 epochs. If you have
time, try increasing the number of epochs and eventually you should see the
model converge.
'''
print(answer)
#@title Code - Experiment 3
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 50
# Convert the trip duration from seconds to minutes so that both features
# are on a roughly comparable scale.
training_df.loc[:, 'TRIP_MINUTES'] = training_df['TRIP_SECONDS']/60
features = ['TRIP_MILES', 'TRIP_MINUTES']
label = 'FARE'
model_2 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)
#@title Double-click to view answers for training with two features
# Does the model with two features produce better results than one using a
# single feature?
# -----------------------------------------------------------------------------
answer = '''
To answer this question for your specific training runs, compare the RMSE for
each model. For example, if the RMSE for the model trained with one feature was
3.7457 and the RMSE for the model with two features is 3.4787, that means that
on average the model with two features makes predictions that are about $0.27
closer to the observed fare.
'''
print(answer)
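#@title Code - Compare RMSE across models
# A minimal sketch for the comparison described above: evaluate both trained
# models on the same data. Assumes model_1 and model_2 exist as trained in
# the earlier cells; evaluating on the training set mirrors the exercise.
loss_1, rmse_1 = model_1.evaluate(training_df[['TRIP_MILES']].values,
                                  training_df['FARE'].values, verbose=0)
loss_2, rmse_2 = model_2.evaluate(training_df[['TRIP_MILES', 'TRIP_MINUTES']].values,
                                  training_df['FARE'].values, verbose=0)
print('RMSE (one feature):  {:.4f}'.format(rmse_1))
print('RMSE (two features): {:.4f}'.format(rmse_2))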
# Does it make a difference if you use TRIP_SECONDS instead of TRIP_MILES?
# -----------------------------------------------------------------------------
answer = '''
When training a model with more than one feature, it is important that all
numeric values are roughly on the same scale. In this case, TRIP_SECONDS and
TRIP_MILES do not meet this criterion. The mean value for TRIP_MILES is 8.3 and
the mean for TRIP_SECONDS is 1320; that is two orders of magnitude difference.
Converting the trip duration to minutes helps during training because it puts
values for both features on a more comparable scale. Of course, this is not the
only way to scale values before training, but you will learn about that in
another module.
'''
print(answer)
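#@title Code - Z-score scaling sketch
# The answer above notes that unit conversion is not the only way to put
# features on a comparable scale. A minimal sketch of one common alternative,
# z-score standardization (not part of the original exercise): each feature
# is rescaled to mean 0 and standard deviation 1.
scaled_df = training_df.copy()
for col in ['TRIP_MILES', 'TRIP_SECONDS']:
  scaled_df[col] = (scaled_df[col] - scaled_df[col].mean()) / scaled_df[col].std()
print(scaled_df[['TRIP_MILES', 'TRIP_SECONDS']].describe())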
# How well do you think the model comes to the ground truth fare calculation for
# Chicago taxi trips?
# -----------------------------------------------------------------------------
answer = '''
In reality, Chicago taxi cabs use a documented formula to determine cab fares.
For a single passenger paying cash, the fare is calculated like this:
FARE = 2.25 * TRIP_MILES + 0.12 * TRIP_MINUTES + 3.25
Typically with machine learning problems you would not know the 'correct'
formula, but in this case you can use this knowledge to evaluate your model. Take a
look at your model output (the weights and bias) and determine how well it
matches the ground truth fare calculation. You should find that the model is
roughly close to this formula.
'''
print(answer)
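#@title Code - Compare the model to the documented fare formula
# A sketch that applies the documented formula from the answer above to a few
# random trips and compares it with the model's predictions. Assumes model_2
# and the TRIP_MINUTES column from the earlier cells.
sample = training_df[['TRIP_MILES', 'TRIP_MINUTES']].sample(n=5)
formula_fare = 2.25 * sample['TRIP_MILES'] + 0.12 * sample['TRIP_MINUTES'] + 3.25
model_fare = model_2.predict(sample.values, verbose=0).flatten()
for f, m in zip(formula_fare, model_fare):
  print('formula: ${:.2f}\tmodel: ${:.2f}'.format(f, m))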
#@title Code - Define functions to make predictions
def format_currency(x):
  return "${:.2f}".format(x)

def build_batch(df, batch_size):
  batch = df.sample(n=batch_size).copy()
  batch.set_index(np.arange(batch_size), inplace=True)
  return batch

def predict_fare(model, df, features, label, batch_size=50):
  batch = build_batch(df, batch_size)
  predicted_values = model.predict_on_batch(x=batch.loc[:, features].values)

  data = {"PREDICTED_FARE": [], "OBSERVED_FARE": [], "L1_LOSS": [],
          features[0]: [], features[1]: []}
  for i in range(batch_size):
    predicted = predicted_values[i][0]
    observed = batch.at[i, label]
    data["PREDICTED_FARE"].append(format_currency(predicted))
    data["OBSERVED_FARE"].append(format_currency(observed))
    data["L1_LOSS"].append(format_currency(abs(observed - predicted)))
    data[features[0]].append(batch.at[i, features[0]])
    data[features[1]].append("{:.2f}".format(batch.at[i, features[1]]))

  output_df = pd.DataFrame(data)
  return output_df

def show_predictions(output):
  header = "-" * 80
  banner = header + "\n" + "|" + "PREDICTIONS".center(78) + "|" + "\n" + header
  print(banner)
  print(output)
  return
#@title Code - Make predictions
output = predict_fare(model_2, training_df, features, label)
show_predictions(output)
#@title Double-click to view answers for validate model
# How close is the predicted value to the label value?
# -----------------------------------------------------------------------------
answer = '''
Based on a random sampling of examples, the model seems to do pretty well
predicting the fare for a taxi ride. Most of the predicted values do not vary
significantly from the observed value. You should be able to see this by looking
at the column L1_LOSS = |observed - predicted|.
'''
print(answer)
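#@title Code - Check the average L1 loss numerically
# A minimal sketch that summarizes prediction quality as a mean absolute
# error over a fresh random batch, rather than reading the L1_LOSS column
# by eye. Assumes model_2, training_df, and build_batch from the cells above.
batch = build_batch(training_df, 50)
preds = model_2.predict_on_batch(batch[['TRIP_MILES', 'TRIP_MINUTES']].values).flatten()
mean_l1 = np.mean(np.abs(batch['FARE'].values - preds))
print('Mean L1 loss over the batch: ${:.2f}'.format(mean_l1))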
INFO: starting training experiment with features=['TRIP_MILES', 'TRIP_MINUTES'] and label=FARE
Epoch 1/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3884.8274 - root_mean_squared_error: 62.2386
Epoch 2/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 1510.9357 - root_mean_squared_error: 38.7442
Epoch 3/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 273.6273 - root_mean_squared_error: 16.3169
Epoch 4/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 24.8084 - root_mean_squared_error: 4.9753
Epoch 5/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 15.6657 - root_mean_squared_error: 3.9543
Epoch 6/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 12.2236 - root_mean_squared_error: 3.4927
Epoch 7/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 11.2817 - root_mean_squared_error: 3.3472
Epoch 8/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.3543 - root_mean_squared_error: 3.5082
Epoch 9/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.9035 - root_mean_squared_error: 3.5800
Epoch 10/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 11.3703 - root_mean_squared_error: 3.3635
Epoch 11/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.7864 - root_mean_squared_error: 3.5701
Epoch 12/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.7018 - root_mean_squared_error: 3.5578
Epoch 13/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.3804 - root_mean_squared_error: 3.5061
Epoch 14/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 11.4949 - root_mean_squared_error: 3.3811
Epoch 15/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 13.1091 - root_mean_squared_error: 3.6081
Epoch 16/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 11.4416 - root_mean_squared_error: 3.3735
Epoch 17/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 10.9564 - root_mean_squared_error: 3.3044
Epoch 18/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.4541 - root_mean_squared_error: 3.5195
Epoch 19/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 11.0418 - root_mean_squared_error: 3.3155
Epoch 20/20
634/634 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12.2511 - root_mean_squared_error: 3.4926
SUCCESS: training experiment complete
--------------------------------------------------------------------------------
| MODEL INFO |
--------------------------------------------------------------------------------
Weight for feature[TRIP_MILES]: 2.030
Weight for feature[TRIP_MINUTES]: 0.142
Bias: 3.828
FARE = 2.030 * TRIP_MILES + 0.142 * TRIP_MINUTES + 3.828
--------------------------------------------------------------------------------
| PREDICTIONS |
--------------------------------------------------------------------------------
PREDICTED_FARE OBSERVED_FARE L1_LOSS TRIP_MILES TRIP_MINUTES
0 $18.33 $18.25 $0.08 4.90 32.00
1 $45.11 $43.25 $1.86 17.37 42.30
2 $31.50 $31.75 $0.25 11.98 23.52
3 $27.43 $27.00 $0.43 9.77 26.47
4 $74.23 $76.75 $2.52 31.06 51.58
5 $7.25 $9.00 $1.75 1.26 6.03
6 $5.96 $5.75 $0.21 0.70 5.00
7 $6.95 $9.00 $2.05 1.00 7.65
8 $33.96 $34.00 $0.04 12.60 32.00
9 $27.32 $26.00 $1.32 9.21 33.63
10 $44.20 $43.25 $0.95 17.54 33.48
11 $45.33 $45.00 $0.33 18.20 32.00
12 $17.74 $17.25 $0.49 5.20 23.58
13 $30.70 $30.50 $0.20 11.20 29.00
14 $43.36 $43.50 $0.14 17.30 31.00
15 $19.37 $19.25 $0.12 6.81 12.03
16 $17.82 $17.25 $0.57 5.70 17.00
17 $16.05 $15.75 $0.30 4.90 16.00
18 $9.01 $8.25 $0.76 2.20 5.00
19 $6.10 $5.75 $0.35 0.79 4.68
20 $17.17 $17.50 $0.33 5.10 21.00
21 $6.34 $5.75 $0.59 0.85 5.53
22 $41.13 $43.80 $2.67 14.94 48.95
23 $40.63 $41.25 $0.62 16.74 19.78
24 $46.86 $46.50 $0.36 17.90 47.00
25 $7.06 $6.75 $0.31 1.10 7.00
26 $21.07 $20.50 $0.57 7.30 17.00
27 $7.20 $6.75 $0.45 1.13 7.57
28 $19.21 $19.50 $0.29 6.13 20.65
29 $41.56 $40.50 $1.06 15.50 44.00
30 $31.48 $31.75 $0.27 11.62 28.50
31 $24.60 $24.25 $0.35 8.90 19.00
32 $17.99 $18.00 $0.01 5.66 18.78
33 $22.49 $22.50 $0.01 7.70 21.27
34 $22.16 $21.00 $1.16 6.44 36.92
35 $9.50 $9.00 $0.50 2.09 10.05
36 $24.32 $24.50 $0.18 8.50 22.70
37 $30.02 $30.25 $0.23 11.50 20.00
38 $43.81 $45.00 $1.19 17.80 27.02
39 $9.08 $9.00 $0.08 1.80 11.23
40 $5.62 $5.25 $0.37 0.60 4.00
41 $6.17 $5.50 $0.67 0.88 3.92
42 $14.91 $14.50 $0.41 4.22 17.68
43 $6.36 $5.25 $1.11 0.84 5.82
44 $15.60 $14.93 $0.67 4.74 15.12
45 $28.72 $36.00 $7.28 8.06 59.92
46 $21.79 $20.50 $1.29 7.26 22.63
47 $12.50 $12.00 $0.50 3.50 11.00
48 $9.39 $9.00 $0.39 2.27 6.67
49 $16.36 $15.75 $0.61 5.40 11.00