Skip to content

Instantly share code, notes, and snippets.

# Draw the per-date sum of 'Id' for 'Member' rows, one panel for January and one for July.
fig, axes = plt.subplots(1, 2, figsize=(20, 7))
for ax, m in zip(axes, ['January', 'July']):
    # Keep only member rows for the month being drawn, then total 'Id' per date.
    is_selected = (ridership['Month'] == m) & (ridership['User Type'] == 'Member')
    daily_totals = ridership[is_selected].groupby('Date')['Id'].sum().reset_index()
    sns.pointplot(x='Date', y='Id', data=daily_totals, ax=ax, markers='')
    ax.set_title(m)
    ax.set_ylabel('Total Daily Trips')
    # Re-apply the current tick labels, rotated by 45 degrees.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
# Line chart of 'Id' against 'Date', with one line per 'User Type'.
p = sns.relplot(x='Date', y='Id', hue='User Type', data=ridership, kind='line', height=9, aspect=16/9.)
p.ax.set_ylabel('Total Daily Trips')
# Limit the number of labels shown on the x-axis to only show the first day of each month.
# The label strings are built explicitly: Text.set_text() returns None, so calling it
# inside the comprehension would hand set_xticklabels a mix of None and Text objects.
month_start_labels = [
    label.get_text() if label.get_text().endswith('-01') else ''
    for label in p.ax.get_xticklabels()
]
p.ax.set_xticklabels(month_start_labels, rotation=45)
# Count the distinct 'Id' values for every combination of the date fields and user type.
# NOTE(review): if any grouping column is categorical, confirm that groupby's
# `observed` setting yields the rows you expect.
group_keys = ['Date', 'Quarter', 'Month', 'Day of Week', 'User Type']
unique_trip_counts = df.groupby(group_keys)['Id'].nunique().reset_index()
# Order the rows by user type, descending.
ridership = unique_trip_counts.sort_values('User Type', ascending=False)
# 2x2 grid: ax1 is the first row of axes (boxplots), ax2 the second row (density curves).
# NOTE(review): loop nesting was reconstructed from a flattened paste - confirm that the
# boxplot is meant to be drawn once per column rather than once per user type.
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(15, 15))
for i, col in enumerate(['Duration', 'Distance']):
    # Graph the distribution plots (seaborn doesn't allow default separation for distplot)
    for user in df['User Type'].unique():
        user_values = df[df['User Type'] == user][col]
        sns.distplot(user_values, hist=False, ax=ax2[i])
    # Graph the boxplots
    sns.boxplot(x=col, y='User Type', data=df, ax=ax1[i])
    ax1[i].set_xlabel('')
# Extract the unique routes from the data with their start and end coordinates
route_columns = [
    'Route Id',
    'Station Id From', 'Lat From', 'Lon From',
    'Station Id To', 'Lat To', 'Lon To',
]
maps = df[route_columns].drop_duplicates()
# One distance per route: take the value in kilometres and scale it by 1000.
maps['Distance'] = maps.apply(
    lambda row: distance.distance(
        (row['Lat From'], row['Lon From']),
        (row['Lat To'], row['Lon To']),
    ).km * 1000,
    axis=1,
)
# Merge the distance calculation with the main DataFrame
df = df.merge(maps[['Route Id', 'Distance']], how='left', on='Route Id')
# Keep only columns whose name contains neither 'From' nor 'To' and is not 'Route Id'.
kept_columns = [
    name for name in df.columns
    if 'From' not in name and 'To' not in name and name != 'Route Id'
]
df = df[kept_columns]
# Clean up column names for ease of use: remove 'trip_' and '_seconds',
# turn underscores into spaces, and title-case the result.
df.columns = [
    ' '.join(name.replace('trip_', '').replace('_seconds', '').split('_')).title()
    for name in df.columns
]
df['Start Time'] = pd.to_datetime(df['Start Time'])

# Derive the calendar fields with the vectorized .dt accessor instead of a
# Python-level apply() over every row.
start = df['Start Time']
df['Date'] = start.dt.strftime('%Y-%m-%d')
# .dt.quarter is the same 1-4 value as int((month - 1) / 3) + 1; cast to int64 so
# the column dtype does not depend on the pandas version.
df['Quarter'] = start.dt.quarter.astype('int64')
df['Month'] = start.dt.strftime('%B').astype(month_type)
df['Day of Week'] = start.dt.strftime('%a').astype(day_type)
# Zero-padded hour kept as a string, e.g. '07'.
df['Hour'] = start.dt.strftime('%H')
import datetime as dt
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy import distance
from pandas.api.types import CategoricalDtype
# Write the DataFrame to disk without the index column.
output_path = './data/bikeshare_ridership.csv'
df.to_csv(output_path, index=False)
# Removing false start trips (keep only trips of at least 60 seconds)
df = df[df['trip_duration_seconds'] >= 60]

# Removing outliers: drop trips whose duration falls outside the
# [q1 - 1.5 * IQR, q3 + 1.5 * IQR] fences.
duration = df['trip_duration_seconds']
q1 = duration.quantile(0.25)
q3 = duration.quantile(0.75)
interquartile_range = q3 - q1
lower_bound = q1 - 1.5 * interquartile_range
upper_bound = q3 + 1.5 * interquartile_range
# The original statement was cut off after the lower-bound test; the upper-bound
# test is restored here so the filter is a complete, balanced expression.
df = df[~((duration < lower_bound) | (duration > upper_bound))]
# Density curve (histogram disabled) of the 'trip_duration_seconds' column on a single axes.
fig, ax = plt.subplots(figsize=(16, 9))
trip_durations = df['trip_duration_seconds']
sns.distplot(trip_durations, hist=False, ax=ax)