Skip to content

Instantly share code, notes, and snippets.

# Build a lookup table with one row for every distinct combination of route id
# and start/end station coordinates found in the trip data
route_columns = ['Route Id', 'Station Id From', 'Lat From', 'Lon From', 'Station Id To', 'Lat To', 'Lon To']
maps = df[route_columns].drop_duplicates()


def _route_length(route):
    """Return the start-to-end distance of one route row (kilometres x 1000)."""
    start = (route['Lat From'], route['Lon From'])
    end = (route['Lat To'], route['Lon To'])
    return distance.distance(start, end).km * 1000


maps['Distance'] = maps.apply(_route_length, axis=1)

# Attach the computed distance to every trip in the main DataFrame.
# NOTE(review): the merge key is 'Route Id' alone; if a route id ever appears
# with more than one coordinate set in `maps`, trips on that route are duplicated.
df = df.merge(maps[['Route Id', 'Distance']], how='left', on='Route Id')

# Keep only the columns that are neither the route id nor named with 'From'/'To'
kept_columns = [name for name in df.columns if 'From' not in name and 'To' not in name and name != 'Route Id']
df = df[kept_columns]
# 2x2 grid: one column per metric. Unpacking the axes array by row gives
# ax1 = top row (boxplots) and ax2 = bottom row (distribution curves).
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(15, 15))
for i, col in enumerate(['Duration', 'Distance']):
    # Graph the distribution plots, one curve per user type
    # (seaborn doesn't allow default separation for distplot)
    # NOTE(review): distplot is deprecated in newer seaborn releases; kdeplot
    # is the documented replacement for the hist=False case.
    for user in df['User Type'].unique():
        sns.distplot(df[df['User Type'] == user][col], hist=False, ax=ax2[i])
    # Graph the boxplots
    sns.boxplot(x=col, y='User Type', data=df, ax=ax1[i])
    # Blank out the x-axis label on the boxplot panel
    ax1[i].set_xlabel('')
# Aggregate the ridership by date fields and user type and count the number of unique trips
ridership = (
    df.groupby(['Date', 'Quarter', 'Month', 'Day of Week', 'User Type'])['Id']
    .nunique()
    .reset_index()
    .sort_values('User Type', ascending=False)
)
p = sns.relplot(x='Date', y='Id', hue='User Type', data=ridership, kind='line', height=9, aspect=16/9.)
p.ax.set_ylabel('Total Daily Trips')
# Limit the number of labels shown on the x-axis to only show the first day of each month.
# Build the labels as plain strings rather than calling set_text() inside the
# comprehension, which returns None and would put None entries in the list.
tick_labels = [
    label.get_text() if label.get_text().endswith('-01') else ''
    for label in p.ax.get_xticklabels()
]
p.ax.set_xticklabels(tick_labels, rotation=45)
# Side-by-side panels of daily member trips for two months
fig, axes = plt.subplots(1, 2, figsize=(20, 7))
for ax, m in zip(axes, ['January', 'July']):
    # Restrict to member rows of the month, then total the trip counts per date
    member_rows = ridership[(ridership['Month'] == m) & (ridership['User Type'] == 'Member')]
    daily_trips = member_rows.groupby('Date')['Id'].sum().reset_index()
    sns.pointplot(x='Date', y='Id', data=daily_trips, ax=ax, markers='')
    ax.set_title(m)
    ax.set_ylabel('Total Daily Trips')
    # Re-apply the existing tick labels rotated by 45 degrees
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
# Flatten the 2D axes array for ease of looping
axes = axes.flatten()
# Prepare the month description titles for each quarter
quarter_names = ['Jan. - Mar.', 'Apr. - Jun.', 'Jul. - Sept.', 'Oct. - Dec.']
# Pair each quarter (in ascending order) with one panel; the title lookup
# uses q - 1, so this assumes Quarter values run from 1 to 4 -- TODO confirm.
# NOTE(review): only the title is set here; no plotting call is visible for
# these panels in this part of the file.
for q, ax in zip(sorted(ridership['Quarter'].unique()), axes):
    ax.set_title(quarter_names[q - 1])
# One row of three panels, paired below with the months found in quarter 3
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
axes = np.array(axes).flatten()
# NOTE(review): the statements after this `for` line appear to be the loop body
# but have lost their indentation -- restore it before running.
for m, ax in zip(ridership[ridership['Quarter']==3]['Month'].unique(), axes):
# Title each panel with its month and give all panels the same y-axis range
ax.set_title(m)
ax.set_ylim(0, 7000)
ax.set_ylabel('Average Daily Trips')
# NOTE(review): this barplot call is cut off -- the remaining arguments
# (presumably the data and target axis) and the closing parenthesis are
# missing and must be recovered from the original analysis.
sns.barplot(
x='Day of Week',
y='Id',
# Count the unique trips for every date/hour/user-type combination, then
# average those counts over the dates for each hour and user type
trips_per_date_hour = df.groupby(['Date', 'Hour', 'User Type'])['Id'].nunique()
data = trips_per_date_hour.groupby(['Hour', 'User Type']).mean().reset_index()

# Grouped bar chart: one bar per user type for each hour
fig, ax = plt.subplots(figsize=(16, 9))
sns.barplot(data=data, x='Hour', y='Id', hue='User Type', ax=ax)
ax.set_ylabel('Average Hourly Trips')
# Import the weather data, taking the column names from row 22 so that the
# 22 rows above it (containing descriptions of the weather station) are dropped
weather = pd.read_csv('./data/weather.csv', header=22)


def _clean_weather_column(name):
    """Rename 'Date/Time' to 'Date'; otherwise strip parenthesised text from the name."""
    if name == 'Date/Time':
        return 'Date'
    return re.sub(r'\([^()]*\)', '', name).strip()


# Remove units contained in the column names (eg. Celsius, mm, etc.)
weather.columns = [_clean_weather_column(name) for name in weather.columns]
# Daily unique-trip counts reshaped to one row per date and one column per user type
data = (
    df.groupby(['Date', 'User Type'])['Id']
    .nunique()
    .to_frame()
    .pivot_table(index='Date', columns='User Type')
    .reset_index()
)
# NOTE(review): the columns are renamed by position, so this relies on the pivot
# yielding exactly two user-type columns in this order -- confirm against the data.
data.columns = ['Date', 'Casual Trips', 'Member Trips']
# Keep only the dates present in both the ridership and the weather data
data = data.merge(weather[['Date', 'Mean Temp', 'Total Precip']], on='Date', how='inner')

fig, ax = plt.subplots(figsize=(16, 9))
ax2 = ax.twinx() # Create the twin axis to enable display of ridership and temperature on the same graph
palette = sns.color_palette() # Get the default color palette
# One ridership line per user type on the primary axis, each in its own palette color
for i, user_type in enumerate(['Casual Trips', 'Member Trips']):
    sns.lineplot(x='Date', y=user_type, data=data, ax=ax, color=palette[i], markers='')
# The temperature series uses no loop variable, so it is drawn once on the twin axis
sns.pointplot(x='Date', y='Mean Temp', data=data, ax=ax2, color=palette[2], markers='x')