@stephenleo · Last active April 25, 2022
[Medium] New York Taxi data set analysis

NYC Taxi Analysis Medium Post

All the code required for the NYC Taxi Analysis Medium post: Link

# Imports. Run in Google Data Lab
import google.datalab.bigquery as bq
import pandas as pd
# Create and run a SQL query for the taxi_data from 2015 and 2016
query = bq.Query('(SELECT * FROM `bigquery-public-data.new_york.tlc_yellow_trips_2015` LIMIT 100000) \
union all (SELECT * FROM `bigquery-public-data.new_york.tlc_yellow_trips_2016` LIMIT 100000)')
output_options = bq.QueryOutput.table(use_cache=False)
result = query.execute(output_options=output_options).result()
# Convert to DataFrame
df = result.to_dataframe()
# Write to csv
df.to_csv('data.csv')
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load data to dataframe and remove dummy column 'Unnamed: 0'
df = pd.read_csv('data.csv')
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.head()
vendor_id pickup_datetime dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude rate_code store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount imp_surcharge total_amount
2 2015-07-18 11:25:58 2015-07-18 11:43:47 1 7.21 -73.862762 40.769028 1.0 N -73.949203 40.722584 1 22.5 0.0 0.5 4.66 0.00 0.3 27.96
1 2015-03-15 12:50:01 2015-03-15 13:23:35 1 10.80 -73.870926 40.773727 NaN N -73.988228 40.765694 1 34.5 0.0 0.5 8.10 5.33 0.3 48.73
2 2015-04-30 12:25:44 2015-04-30 13:03:51 1 4.28 -73.978180 40.762341 NaN N -74.008911 40.710789 1 24.5 0.0 0.5 2.50 0.00 0.3 27.80
2 2015-05-28 08:47:56 2015-05-28 09:26:08 1 18.47 -73.776711 40.645302 NaN N -73.843422 40.852852 1 51.0 0.0 0.5 7.00 5.54 0.3 64.34
1 2015-06-20 19:36:17 2015-06-20 20:10:49 1 15.50 -73.777054 40.644947 NaN Y -73.946800 40.725021 1 44.5 0.0 0.5 9.06 0.00 0.3 54.36
# Calculate trip_duration in minutes
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
df['trip_duration'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds()/60
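# Optional sanity check (not in the original post): summarise the new trip_duration
# column (in minutes) before filtering; negative or extreme values point to bad timestamps
# and are removed by the outlier filter below.
print(df['trip_duration'].describe())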
# Visualise the columns to be used for simple intuitive model
pd.plotting.scatter_matrix(frame = df[['fare_amount', 'trip_distance', 'trip_duration']], figsize=(7,7));
# Remove the outliers
df_filtered = df[(df['trip_distance'] >= 2) & (df['trip_distance'] <= 50) &\
(df['fare_amount'] >= 3) & (df['fare_amount'] <=300) & \
(df['trip_duration'] <= 200) & (df['trip_duration'] > 1)].copy()
# Visualise the columns to be used for simple intuitive model
pd.plotting.scatter_matrix(frame = df_filtered[['fare_amount', 'trip_distance', 'trip_duration']], figsize=(7,7));
# Define some helper functions
from sklearn.metrics import mean_squared_error, r2_score
def train_test_split(X, y):
    """Split X and y into training set and testing set.

    Data from year = 2015 is used as training set while data from year = 2016 is used as testing set.
    Returns X_train, y_train, X_test, y_test
    """
    X_train = X.loc[X['year'] == 2015].drop('year', axis=1)
    y_train = y.loc[y['year'] == 2015].drop('year', axis=1).values.ravel()
    X_test = X.loc[X['year'] == 2016].drop('year', axis=1)
    y_test = y.loc[y['year'] == 2016].drop('year', axis=1).values.ravel()
    return X_train, y_train, X_test, y_test
def model_results(X_train, y_train, X_test, y_test, model):
    """Print RMSE and R-squared for the model on the training and testing sets.
    """
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    print("----Training Data results (2015 data set)----")
    print("RMSE: ${:.1f}".format(mean_squared_error(y_train, y_train_pred)**0.5))
    print("R2: {:.2f}\n".format(r2_score(y_train, y_train_pred)))
    print("----Test Data results (2016 data set)----")
    print("RMSE: ${:.1f}".format(mean_squared_error(y_test, y_test_pred)**0.5))
    print("R2: {:.2f}\n".format(r2_score(y_test, y_test_pred)))
# Create a column with the year information
df_filtered['year'] = df_filtered['pickup_datetime'].dt.year
# Split response (y) and features (X)
y = df_filtered[['fare_amount', 'year']]
X = df_filtered[['trip_distance', 'trip_duration', 'year']]
# Create Train and Test sets
X_train, y_train, X_test, y_test = train_test_split(X, y)
# Simple bivariate model
from sklearn.linear_model import LinearRegression
lm = LinearRegression().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, lm)
# Create new features as mentioned above
import pygeohash as gh
df_filtered['day'] = df_filtered['pickup_datetime'].dt.day
df_filtered['month'] = df_filtered['pickup_datetime'].dt.month
df_filtered['day_of_week'] = df_filtered['pickup_datetime'].dt.day_name()  # dt.weekday_name was removed in pandas 1.0
df_filtered['hour_of_day'] = df_filtered['pickup_datetime'].dt.hour
df_filtered['lat_dif'] = df_filtered['pickup_latitude'] - df_filtered['dropoff_latitude']
df_filtered['lon_dif'] = df_filtered['pickup_longitude'] - df_filtered['dropoff_longitude']
df_filtered['pickup_geohash']=df_filtered.apply(lambda x: gh.encode(x.pickup_latitude, x.pickup_longitude, precision=5), axis=1)
df_filtered['dropoff_geohash']=df_filtered.apply(lambda x: gh.encode(x.dropoff_latitude, x.dropoff_longitude, precision=5), axis=1)
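# Quick illustration (not in the original post) of what a precision-5 geohash looks like.
# Precision 5 buckets coordinates into cells of roughly 4.9 km x 4.9 km, which turns the raw
# pickup/dropoff points into a coarse "neighbourhood" category that can be one-hot encoded.
sample_hash = gh.encode(40.7690, -73.8628, precision=5)  # the first pickup from df.head()
print(sample_hash)             # a 5-character string identifying the cell
print(gh.decode(sample_hash))  # assuming gh.decode is available: approximate (lat, lon) of the cell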
# Check for any missing data
df_filtered.isna().sum()
# Remove rows with null latitude/longitude differences
df_filtered = df_filtered[df_filtered['lat_dif'].notnull() & df_filtered['lon_dif'].notnull()].copy()
df_filtered[['lat_dif', 'lon_dif']].hist();
# Remove outliers in lat_dif and lon_dif
df_filtered = df_filtered[(df_filtered['lat_dif'] < 1) & (df_filtered['lat_dif'] > -1) & \
(df_filtered['lon_dif'] < 1) & (df_filtered['lon_dif'] > -1)]
# Split response (y) and features (X)
y = df_filtered[['fare_amount', 'year']]
X = df_filtered[['passenger_count', 'trip_distance', 'trip_duration', 'year', 'month', 'day_of_week', \
'hour_of_day', 'lat_dif', 'lon_dif', 'pickup_geohash', 'dropoff_geohash']]
# Encode the categorical variables
X_encoded = pd.get_dummies(X, columns=['month', 'day_of_week', 'hour_of_day', 'pickup_geohash', 'dropoff_geohash'])
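# Optional check (not in the original post): one-hot encoding the geohash columns can add
# hundreds of columns, so it is worth confirming the final feature count before fitting.
print(X.shape, X_encoded.shape)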
# Check distribution of features
X.hist(figsize = (10,10));
# Split Training and Testing sets
X_train, y_train, X_test, y_test = train_test_split(X_encoded, y)
# Multivariate Linear Model
lm = LinearRegression().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, lm)
# Lasso Regression to regularize
from sklearn.linear_model import Lasso
las = Lasso().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, las)
# Lasso Regression Hyperparameter tuning
import time
alphas = np.logspace(-4, -0.5, 30)
n_folds = 5
lasso_params = {'alpha': alphas}
# GridSearch CV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
start = time.time()
las_tuned = GridSearchCV(Lasso(max_iter = 10000, random_state = 0), lasso_params, cv = n_folds)
las_tuned.fit(X_train, y_train)
end = time.time()
print('GridSearchCV execution time: {:.1f} s'.format(end - start))
print('Lasso best params: {}\n'.format(las_tuned.best_params_))
model_results(X_train, y_train, X_test, y_test, las_tuned)
# LassoCV for hyperparameter tuning
from sklearn.linear_model import LassoCV
start = time.time()
las_cv = LassoCV(cv = n_folds, alphas = alphas, max_iter = 10000, random_state = 0)
las_cv.fit(X_train,y_train)
end = time.time()
print('LassoCV execution time: {:.1f} s'.format(end - start))
print('LassoCV best alpha: {}\n'.format(las_cv.alpha_))
model_results(X_train, y_train, X_test, y_test, las_cv)
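# Optional diagnostic (not in the original post): LassoCV is usually faster than GridSearchCV
# here because it fits the whole alpha path with warm starts instead of refitting from scratch
# for every alpha. It also stores the per-fold MSE for each alpha it tried, which helps
# visualise why it settled on las_cv.alpha_.
plt.figure()
plt.semilogx(las_cv.alphas_, las_cv.mse_path_.mean(axis=1), marker='o')
plt.axvline(las_cv.alpha_, color='r', linestyle='--', label='chosen alpha')
plt.xlabel('alpha')
plt.ylabel('Mean CV MSE')
plt.legend();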
# Plot the Actual vs Predicted to visually check the correlation
y_train_pred = las_cv.predict(X_train)
y_test_pred = las_cv.predict(X_test)
test_data_fit = LinearRegression().fit(y_test_pred.reshape(-1, 1),y_test)
slope = test_data_fit.coef_[0]
intercept = test_data_fit.intercept_
line = slope * y_test_pred + intercept
#plt.scatter(y_train_pred, y_train, alpha = 0.2, label = 'Training Data (2015)')
plt.scatter(y_test_pred, y_test, alpha = 0.5, label = 'Test Data (2016)')
plt.plot(y_test_pred, line, 'r', label='y = ({:.2f}) x + ({:.2f})'.format(slope,intercept))
plt.legend(loc = 'lower right')
plt.xlabel('Predicted Fare ($)')
plt.ylabel('Actual Fare ($)');