@stephenleo · Last active April 25, 2022
[Medium] New York Taxi data set analysis

NYC Taxi Analysis Medium Post

All the code required for the NYC Taxi Analysis Medium post: Link

# Imports. Run in Google Data Lab
import google.datalab.bigquery as bq
import pandas as pd
# Create and run a SQL query for the taxi_data from 2015 and 2016
query = bq.Query('(SELECT * FROM `bigquery-public-data.new_york.tlc_yellow_trips_2015` LIMIT 100000) \
union all (SELECT * FROM `bigquery-public-data.new_york.tlc_yellow_trips_2016` LIMIT 100000)')
output_options = bq.QueryOutput.table(use_cache=False)
result = query.execute(output_options=output_options).result()
# Convert to DataFrame
df = result.to_dataframe()
# Write to csv
df.to_csv('data.csv')
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load data to dataframe and remove dummy column 'Unnamed: 0'
df = pd.read_csv('data.csv')
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.head()
vendor_id pickup_datetime dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude rate_code store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount imp_surcharge total_amount
2 2015-07-18 11:25:58 2015-07-18 11:43:47 1 7.21 -73.862762 40.769028 1.0 N -73.949203 40.722584 1 22.5 0.0 0.5 4.66 0.00 0.3 27.96
1 2015-03-15 12:50:01 2015-03-15 13:23:35 1 10.80 -73.870926 40.773727 NaN N -73.988228 40.765694 1 34.5 0.0 0.5 8.10 5.33 0.3 48.73
2 2015-04-30 12:25:44 2015-04-30 13:03:51 1 4.28 -73.978180 40.762341 NaN N -74.008911 40.710789 1 24.5 0.0 0.5 2.50 0.00 0.3 27.80
2 2015-05-28 08:47:56 2015-05-28 09:26:08 1 18.47 -73.776711 40.645302 NaN N -73.843422 40.852852 1 51.0 0.0 0.5 7.00 5.54 0.3 64.34
1 2015-06-20 19:36:17 2015-06-20 20:10:49 1 15.50 -73.777054 40.644947 NaN Y -73.946800 40.725021 1 44.5 0.0 0.5 9.06 0.00 0.3 54.36
# Calculate trip_duration in minutes
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
df['trip_duration'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds()/60
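# Optional sanity check (not in the original post): summarise the new trip_duration
# column (in minutes) before filtering; negative or extreme values point to bad timestamps
# and are removed by the outlier filter below.
print(df['trip_duration'].describe())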
# Visualise the columns to be used for simple intuitive model
pd.plotting.scatter_matrix(frame = df[['fare_amount', 'trip_distance', 'trip_duration']], figsize=(7,7));
# Remove the outliers
df_filtered = df[(df['trip_distance'] >= 2) & (df['trip_distance'] <= 50) &\
(df['fare_amount'] >= 3) & (df['fare_amount'] <=300) & \
(df['trip_duration'] <= 200) & (df['trip_duration'] > 1)].copy()
# Visualise the columns to be used for simple intuitive model
pd.plotting.scatter_matrix(frame = df_filtered[['fare_amount', 'trip_distance', 'trip_duration']], figsize=(7,7));
# Define some helper functions
from sklearn.metrics import mean_squared_error, r2_score
def train_test_split(X, y):
    """Split X and y into training set and testing set.

    Data from year = 2015 is used as training set while data from year = 2016 is used as testing set.
    Returns X_train, y_train, X_test, y_test
    """
    X_train = X.loc[X['year'] == 2015].drop('year', axis=1)
    y_train = y.loc[y['year'] == 2015].drop('year', axis=1).values.ravel()
    X_test = X.loc[X['year'] == 2016].drop('year', axis=1)
    y_test = y.loc[y['year'] == 2016].drop('year', axis=1).values.ravel()
    return X_train, y_train, X_test, y_test
def model_results(X_train, y_train, X_test, y_test, model):
    """Print RMSE and R-squared for the model on the training and testing sets.
    """
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    print("----Training Data results (2015 data set)----")
    print("RMSE: ${:.1f}".format(mean_squared_error(y_train, y_train_pred)**0.5))
    print("R2: {:.2f}\n".format(r2_score(y_train, y_train_pred)))
    print("----Test Data results (2016 data set)----")
    print("RMSE: ${:.1f}".format(mean_squared_error(y_test, y_test_pred)**0.5))
    print("R2: {:.2f}\n".format(r2_score(y_test, y_test_pred)))
# Create a column with the year information
df_filtered['year'] = df_filtered['pickup_datetime'].dt.year
# Split response (y) and features (X)
y = df_filtered[['fare_amount', 'year']]
X = df_filtered[['trip_distance', 'trip_duration', 'year']]
# Create Train and Test sets
X_train, y_train, X_test, y_test = train_test_split(X, y)
# Simple bivariate model
from sklearn.linear_model import LinearRegression
lm = LinearRegression().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, lm)
# Create new features as mentioned above
import pygeohash as gh
df_filtered['day'] = df_filtered['pickup_datetime'].dt.day
df_filtered['month'] = df_filtered['pickup_datetime'].dt.month
df_filtered['day_of_week'] = df_filtered['pickup_datetime'].dt.day_name()  # dt.weekday_name was removed in pandas 1.0
df_filtered['hour_of_day'] = df_filtered['pickup_datetime'].dt.hour
df_filtered['lat_dif'] = df_filtered['pickup_latitude'] - df_filtered['dropoff_latitude']
df_filtered['lon_dif'] = df_filtered['pickup_longitude'] - df_filtered['dropoff_longitude']
df_filtered['pickup_geohash']=df_filtered.apply(lambda x: gh.encode(x.pickup_latitude, x.pickup_longitude, precision=5), axis=1)
df_filtered['dropoff_geohash']=df_filtered.apply(lambda x: gh.encode(x.dropoff_latitude, x.dropoff_longitude, precision=5), axis=1)
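# Quick illustration (not in the original post) of what a precision-5 geohash looks like.
# Precision 5 buckets coordinates into cells of roughly 4.9 km x 4.9 km, which turns the raw
# pickup/dropoff points into a coarse "neighbourhood" category that can be one-hot encoded.
sample_hash = gh.encode(40.7690, -73.8628, precision=5)  # the first pickup from df.head()
print(sample_hash)             # a 5-character string identifying the cell
print(gh.decode(sample_hash))  # assuming gh.decode is available: approximate (lat, lon) of the cell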
# Check for any missing data
df_filtered.isna().sum()
# Remove rows with null latitude/longitude differences
df_filtered = df_filtered[df_filtered['lat_dif'].notnull() & df_filtered['lon_dif'].notnull()].copy()
df_filtered[['lat_dif', 'lon_dif']].hist();
# Remove outliers in lat_dif and lon_dif
df_filtered = df_filtered[(df_filtered['lat_dif'] < 1) & (df_filtered['lat_dif'] > -1) & \
(df_filtered['lon_dif'] < 1) & (df_filtered['lon_dif'] > -1)]
# Split response (y) and features (X)
y = df_filtered[['fare_amount', 'year']]
X = df_filtered[['passenger_count', 'trip_distance', 'trip_duration', 'year', 'month', 'day_of_week', \
'hour_of_day', 'lat_dif', 'lon_dif', 'pickup_geohash', 'dropoff_geohash']]
# Encode the categorical variables
X_encoded = pd.get_dummies(X, columns=['month', 'day_of_week', 'hour_of_day', 'pickup_geohash', 'dropoff_geohash'])
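# Optional check (not in the original post): one-hot encoding the geohash columns can add
# hundreds of columns, so it is worth confirming the final feature count before fitting.
print(X.shape, X_encoded.shape)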
# Check distribution of features
X.hist(figsize = (10,10));
# Split Training and Testing sets
X_train, y_train, X_test, y_test = train_test_split(X_encoded, y)
# Multivariate Linear Model
lm = LinearRegression().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, lm)
# Lasso Regression to regularize
from sklearn.linear_model import Lasso
las = Lasso().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, las)
# Lasso Regression Hyperparameter tuning
import time
alphas = np.logspace(-4, -0.5, 30)
n_folds = 5
lasso_params = {'alpha': alphas}
# GridSearch CV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
start = time.time()
las_tuned = GridSearchCV(Lasso(max_iter = 10000, random_state = 0), lasso_params, cv = n_folds)
las_tuned.fit(X_train, y_train)
end = time.time()
print('GridSearchCV execution time: {:.1f} s'.format(end - start))
print('Lasso best params: {}\n'.format(las_tuned.best_params_))
model_results(X_train, y_train, X_test, y_test, las_tuned)
# LassoCV for hyperparameter tuning
from sklearn.linear_model import LassoCV
start = time.time()
las_cv = LassoCV(cv = n_folds, alphas = alphas, max_iter = 10000, random_state = 0)
las_cv.fit(X_train,y_train)
end = time.time()
print('LassoCV execution time: {:.1f} s'.format(end - start))
print('LassoCV best alpha: {}\n'.format(las_cv.alpha_))
model_results(X_train, y_train, X_test, y_test, las_cv)
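# Optional diagnostic (not in the original post): LassoCV is usually faster than GridSearchCV
# here because it fits the whole alpha path with warm starts instead of refitting from scratch
# for every alpha. It also stores the per-fold MSE for each alpha it tried, which helps
# visualise why it settled on las_cv.alpha_.
plt.figure()
plt.semilogx(las_cv.alphas_, las_cv.mse_path_.mean(axis=1), marker='o')
plt.axvline(las_cv.alpha_, color='r', linestyle='--', label='chosen alpha')
plt.xlabel('alpha')
plt.ylabel('Mean CV MSE')
plt.legend();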
# Plot the Actual vs Predicted to visually check the correlation
y_train_pred = las_cv.predict(X_train)
y_test_pred = las_cv.predict(X_test)
test_data_fit = LinearRegression().fit(y_test_pred.reshape(-1, 1),y_test)
slope = test_data_fit.coef_[0]
intercept = test_data_fit.intercept_
line = slope * y_test_pred + intercept
#plt.scatter(y_train_pred, y_train, alpha = 0.2, label = 'Training Data (2015)')
plt.scatter(y_test_pred, y_test, alpha = 0.5, label = 'Test Data (2016)')
plt.plot(y_test_pred, line, 'r', label='y = ({:.2f}) x + ({:.2f})'.format(slope,intercept))
plt.legend(loc = 'lower right')
plt.xlabel('Predicted Fare ($)')
plt.ylabel('Actual Fare ($)');