Skip to content

Instantly share code, notes, and snippets.

@Eligijus112
Last active October 6, 2022 19:57
Show Gist options
  • Save Eligijus112/3fd9f46fa2f3c2de12e2311d16e59ec0 to your computer and use it in GitHub Desktop.
Save Eligijus112/3fd9f46fa2f3c2de12e2311d16e59ec0 to your computer and use it in GitHub Desktop.
Date conversion and feature engineering for NYC cab data
import pandas as pd
from datetime import datetime
import numpy as np
import re
# To datetime conversion
def to_datetime(x: str) -> datetime:
    """
    Converts a string to a datetime object

    An example of the string is 2010-02-02 17:24:55. A ' UTC' marker
    (as in 2010-02-02 17:24:55 UTC) is removed before parsing.

    Parameters
    ----------
    x : str or datetime
        The value to convert. Values that are already datetime
        instances are returned unchanged.

    Returns
    -------
    datetime
        The parsed value. When x is not a string in the
        '%Y-%m-%d %H:%M:%S' format, a message is printed and the result
        of pd.to_datetime(x) is returned instead; any exception raised
        by that fallback propagates to the caller.
    """
    # Inspecting whether x is datetime
    if isinstance(x, datetime):
        return x

    try:
        # Dropping the UTC part from the date strings
        x = re.sub(' UTC', '', x)
        return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError):
        # TypeError: x is not a string (re.sub / strptime reject it).
        # ValueError: the string does not match the expected format.
        # Only these are caught so that KeyboardInterrupt / SystemExit
        # are never swallowed by the best-effort fallback below.
        print(f"Error converting {x} to datetime")
        return pd.to_datetime(x)
def create_date_vars(
    d: pd.DataFrame,
    date_var: str = 'pickup_datetime'
) -> pd.DataFrame:
    """
    Creates the datetime variables

    The column named by date_var is converted to datetimes and the
    following columns are added. Note that d is modified in place and
    the same object is returned.

    Creates the following columns
    * pickup_dayofweek - The day of the week at pickup time
    * pickup_hour - The hour of the day at pickup time
    * pickup_dayofyear - The day of the year at pickup time
    * pickup_hour_sin, pickup_hour_cos - The sine and cosine of the hour of the day
    * pickup_dayofyear_sin, pickup_dayofyear_cos - The sine and cosine of the day of the year

    Parameters
    ----------
    d : pd.DataFrame
        The data; must contain the column named by date_var.
    date_var : str
        Name of the column holding the pickup timestamps.

    Returns
    -------
    pd.DataFrame
        The input dataframe d with the new columns attached.
    """
    # Converting every entry of the date column to a datetime.
    # Wrapping the list in pd.to_datetime guarantees a datetime64 column
    # (even when d has no rows), which the .dt accessor below requires.
    d[date_var] = pd.to_datetime([to_datetime(x) for x in d[date_var]])

    # Inferring the day of the week from the date column
    d['pickup_dayofweek'] = d[date_var].dt.dayofweek

    # Inferring the hour of the day from the date column
    d['pickup_hour'] = d[date_var].dt.hour

    # Creating a new variable for the day of the year
    d['pickup_dayofyear'] = d[date_var].dt.dayofyear

    # Cyclic encoding of the hour of the day. The hour takes the values
    # 0..23, so a full cycle is 24 hours; dividing by 24 keeps hour 23
    # and hour 0 adjacent on the circle instead of mapping them to the
    # same point (which dividing by 23 would do).
    d['pickup_hour_sin'] = np.sin(2 * np.pi * d['pickup_hour']/24.0)
    d['pickup_hour_cos'] = np.cos(2 * np.pi * d['pickup_hour']/24.0)

    # Cyclic encoding of the day of the year with a 365 day period
    d['pickup_dayofyear_sin'] = np.sin(2 * np.pi * d['pickup_dayofyear']/365.0)
    d['pickup_dayofyear_cos'] = np.cos(2 * np.pi * d['pickup_dayofyear']/365.0)

    return d
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment