All the code required from NYC Taxi Analysis Medium Post: Link
Last active
April 25, 2022 18:29
-
-
Save stephenleo/731550f10522835467b7079de3384932 to your computer and use it in GitHub Desktop.
[Medium] New York Taxi data set analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Imports. Run in Google Data Lab | |
| import google.datalab.bigquery as bq | |
| import pandas as pd | |
| # Create and run a SQL query for the taxi_data from 2015 and 2016 | |
| query = bq.Query('(SELECT * FROM `bigquery-public-data.new_york.tlc_yellow_trips_2015` LIMIT 100000) \ | |
| union all (SELECT * FROM `bigquery-public-data.new_york.tlc_yellow_trips_2016` LIMIT 100000)') | |
| output_options = bq.QueryOutput.table(use_cache=False) | |
| result = query.execute(output_options=output_options).result() | |
| # Convert to DataFrame | |
| df = result.to_dataframe() | |
| # Write to csv | |
| df.to_csv('data.csv') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Imports | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| %matplotlib inline | |
| # Load data to dataframe and remove dummy column 'Unnamed: 0' | |
| df = pd.read_csv('data.csv') | |
| df.drop(['Unnamed: 0'], axis = 1, inplace = True) | |
| df.head() |
| vendor_id | pickup_datetime | dropoff_datetime | passenger_count | trip_distance | pickup_longitude | pickup_latitude | rate_code | store_and_fwd_flag | dropoff_longitude | dropoff_latitude | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | imp_surcharge | total_amount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 2015-07-18 11:25:58 | 2015-07-18 11:43:47 | 1 | 7.21 | -73.862762 | 40.769028 | 1.0 | N | -73.949203 | 40.722584 | 1 | 22.5 | 0.0 | 0.5 | 4.66 | 0.00 | 0.3 | 27.96 |
| 1 | 2015-03-15 12:50:01 | 2015-03-15 13:23:35 | 1 | 10.80 | -73.870926 | 40.773727 | NaN | N | -73.988228 | 40.765694 | 1 | 34.5 | 0.0 | 0.5 | 8.10 | 5.33 | 0.3 | 48.73 |
| 2 | 2015-04-30 12:25:44 | 2015-04-30 13:03:51 | 1 | 4.28 | -73.978180 | 40.762341 | NaN | N | -74.008911 | 40.710789 | 1 | 24.5 | 0.0 | 0.5 | 2.50 | 0.00 | 0.3 | 27.80 |
| 2 | 2015-05-28 08:47:56 | 2015-05-28 09:26:08 | 1 | 18.47 | -73.776711 | 40.645302 | NaN | N | -73.843422 | 40.852852 | 1 | 51.0 | 0.0 | 0.5 | 7.00 | 5.54 | 0.3 | 64.34 |
| 1 | 2015-06-20 19:36:17 | 2015-06-20 20:10:49 | 1 | 15.50 | -73.777054 | 40.644947 | NaN | Y | -73.946800 | 40.725021 | 1 | 44.5 | 0.0 | 0.5 | 9.06 | 0.00 | 0.3 | 54.36 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Calculate trip_duration in minutes | |
| df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime']) | |
| df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime']) | |
| df['trip_duration'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds()/60 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Visualise the columns to be used for simple intuitive model | |
| pd.plotting.scatter_matrix(frame = df[['fare_amount', 'trip_distance', 'trip_duration']], figsize=(7,7)); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Remove the outliers | |
| df_filtered = df[(df['trip_distance'] >= 2) & (df['trip_distance'] <= 50) &\ | |
| (df['fare_amount'] >= 3) & (df['fare_amount'] <=300) & \ | |
| (df['trip_duration'] <= 200) & (df['trip_duration'] > 1)].copy() | |
| # Visualise the columns to be used for simple intuitive model | |
| pd.plotting.scatter_matrix(frame = df_filtered[['fare_amount', 'trip_distance', 'trip_duration']], figsize=(7,7)); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Define some helper functions | |
| from sklearn.metrics import mean_squared_error, r2_score | |
| def train_test_split(X, y): | |
| """Split X and y into training set and testing set. | |
| Data from year = 2015 is used as training set while data from year = 2016 is used as testing set. | |
| Returns X_train, y_train, X_test, y_test | |
| """ | |
| X_train = X.loc[X['year'] == 2015].drop('year', axis=1) | |
| y_train = y.loc[y['year'] == 2015].drop('year', axis=1).values.ravel() | |
| X_test = X.loc[X['year'] == 2016].drop('year', axis=1) | |
| y_test = y.loc[y['year'] == 2016].drop('year', axis=1).values.ravel() | |
| return X_train, y_train, X_test, y_test | |
| def model_results(X_train, y_train, X_test, y_test, model): | |
| """Print model parameters of RMSE and R-square on training and testing sets. | |
| """ | |
| y_train_pred = model.predict(X_train) | |
| y_test_pred = model.predict(X_test) | |
| print("----Training Data results (2015 data set)----") | |
| print("RMSE: ${:.1f}".format(mean_squared_error(y_train, y_train_pred)**0.5)) | |
| print("R2: {:.2f}\n".format(r2_score(y_train, y_train_pred))) | |
| print("----Test Data results (2016 data set)----") | |
| print("RMSE: ${:.1f}".format(mean_squared_error(y_test, y_test_pred)**0.5)) | |
| print("R2: {:.2f}\n".format(r2_score(y_test, y_test_pred))) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Create a column with the year information | |
| df_filtered['year'] = df_filtered['pickup_datetime'].dt.year | |
| # Split response (y) and features (X) | |
| y = df_filtered[['fare_amount', 'year']] | |
| X = df_filtered[['trip_distance', 'trip_duration', 'year']] | |
| # Create Train and Test sets | |
| X_train, y_train, X_test, y_test = train_test_split(X, y) | |
| # Simple bivariate model | |
| from sklearn.linear_model import LinearRegression | |
| lm = LinearRegression().fit(X_train,y_train) | |
| model_results(X_train, y_train, X_test, y_test, lm) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Create new features as mentioned above | |
| import pygeohash as gh | |
| df_filtered['day'] = df_filtered['pickup_datetime'].dt.day | |
| df_filtered['month'] = df_filtered['pickup_datetime'].dt.month | |
| df_filtered['day_of_week'] = df_filtered['pickup_datetime'].dt.weekday_name | |
| df_filtered['hour_of_day'] = df_filtered['pickup_datetime'].dt.hour | |
| df_filtered['lat_dif'] = df_filtered['pickup_latitude'] - df_filtered['dropoff_latitude'] | |
| df_filtered['lon_dif'] = df_filtered['pickup_longitude'] - df_filtered['dropoff_longitude'] | |
| df_filtered['pickup_geohash']=df_filtered.apply(lambda x: gh.encode(x.pickup_latitude, x.pickup_longitude, precision=5), axis=1) | |
| df_filtered['dropoff_geohash']=df_filtered.apply(lambda x: gh.encode(x.dropoff_latitude, x.dropoff_longitude, precision=5), axis=1) | |
| # Check for any missing data | |
| df_filtered.isna().sum() | |
| # Remove null values from lattitude and longitude | |
| df_filtered = df_filtered[df_filtered['lat_dif'].notnull() & df_filtered['lon_dif'].notnull()].copy() | |
| df_filtered[['lat_dif', 'lon_dif']].hist(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Remove outliers in lat_dif and lon_dif | |
| df_filtered = df_filtered[(df_filtered['lat_dif'] < 1) & (df_filtered['lat_dif'] > -1) & \ | |
| (df_filtered['lon_dif'] < 1) & (df_filtered['lon_dif'] > -1)] | |
| # Split response (y) and features (X) | |
| y = df_filtered[['fare_amount', 'year']] | |
| X = df_filtered[['passenger_count', 'trip_distance', 'trip_duration', 'year', 'month', 'day_of_week', \ | |
| 'hour_of_day', 'lat_dif', 'lon_dif', 'pickup_geohash', 'dropoff_geohash']] | |
| # Encode the categorical variables | |
| X_encoded = pd.get_dummies(X, columns=['month', 'day_of_week', 'hour_of_day', 'pickup_geohash', 'dropoff_geohash']) | |
| # Check distribution of features | |
| X.hist(figsize = (10,10)); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Split Training and Testing sets | |
| X_train, y_train, X_test, y_test = train_test_split(X_encoded, y) | |
| # Multivariate Linear Model | |
| lm = LinearRegression().fit(X_train,y_train) | |
| model_results(X_train, y_train, X_test, y_test, lm) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Lasso Regression to regularize | |
| from sklearn.linear_model import Lasso | |
| las = Lasso().fit(X_train,y_train) | |
| model_results(X_train, y_train, X_test, y_test, las) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Lasso Regression Hyperparameter tuning | |
| import time | |
| alphas = np.logspace(-4, -0.5, 30) | |
| n_folds = 5 | |
| lasso_params = {'alpha': alphas} | |
| # GridSearch CV for hyperparameter tuning | |
| from sklearn.model_selection import GridSearchCV | |
| start = time.time() | |
| las_tuned = GridSearchCV(Lasso(max_iter = 10000, random_state = 0), lasso_params, cv = n_folds) | |
| las_tuned.fit(X_train, y_train) | |
| end = time.time() | |
| print('GridSearchCV execution time: {}'.format(end - start)) | |
| print('Lasso best params: {}\n'.format(las_tuned.best_params_)) | |
| model_results(X_train, y_train, X_test, y_test, las_tuned) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # LassoCV for hyperparameter tuning | |
| from sklearn.linear_model import LassoCV | |
| start = time.time() | |
| las_cv = LassoCV(cv = n_folds, alphas = alphas, max_iter = 10000, random_state = 0) | |
| las_cv.fit(X_train,y_train) | |
| end = time.time() | |
| print('LassoCV execution time: {}'.format(end - start)) | |
| print('LassoCV best params: {}\n'.format(las_cv.alpha_)) | |
| model_results(X_train, y_train, X_test, y_test, las_cv) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Plot the Actual vs Predicted to visually check the correlation | |
| y_train_pred = las_cv.predict(X_train) | |
| y_test_pred = las_cv.predict(X_test) | |
| test_data_fit = LinearRegression().fit(y_test_pred.reshape(-1, 1),y_test) | |
| slope = test_data_fit.coef_[0] | |
| intercept = test_data_fit.intercept_ | |
| line = slope * y_test_pred + intercept | |
| #plt.scatter(y_train_pred, y_train, alpha = 0.2, label = 'Training Data (2015)') | |
| plt.scatter(y_test_pred, y_test, alpha = 0.5, label = 'Test Data (2016)') | |
| plt.plot(y_test_pred, line, 'r', label='y = ({:.2f}) x + ({:.2f})'.format(slope,intercept)) | |
| plt.legend(loc = 'lower right') | |
| plt.xlabel('Predicted Fare ($)') | |
| plt.ylabel('Actual Fare ($)'); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment