dottyz’s gists

dottyz / story_bike_share_analyze_7.py

Created May 2, 2019 18:39

	# Plot daily member trips for January and July side by side, one panel per month.
	fig, axes = plt.subplots(1, 2, figsize=(20, 7))

	# The user-type filter does not depend on the month, so apply it once up front.
	members = ridership[ridership['User Type'] == 'Member']

	for ax, m in zip(axes, ['January', 'July']):
		# Sum the per-group trip counts into one total per date for this month
		daily_trips = members[members['Month'] == m].groupby('Date')['Id'].sum().reset_index()
		sns.pointplot(x='Date', y='Id', data=daily_trips, ax=ax, markers='')

		ax.set_title(m)
		ax.set_ylabel('Total Daily Trips')
		# Re-apply the existing tick labels, rotated 45 degrees
		ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

dottyz / story_bike_share_analyze_6.py

Created May 2, 2019 18:39

	# Line chart of the 'Id' trip counts by date, with one line per user type
	p = sns.relplot(x='Date', y='Id', hue='User Type', data=ridership, kind='line', height=9, aspect=16/9.)
	p.ax.set_ylabel('Total Daily Trips')

	# Limit the number of labels shown on the x-axis to only show the first day of each month.
	# The labels are built as plain strings: Text.set_text() returns None, so calling it inside
	# the comprehension would pass None entries to set_xticklabels.
	first_of_month_labels = [
		label.get_text() if label.get_text().endswith('-01') else ''
		for label in p.ax.get_xticklabels()
	]
	p.ax.set_xticklabels(first_of_month_labels, rotation=45)

dottyz / story_bike_share_analyze_5.py

Created May 2, 2019 18:38

	# Count the distinct trip ids for each combination of the date fields and user type
	grouping_keys = ['Date', 'Quarter', 'Month', 'Day of Week', 'User Type']
	trip_counts = df.groupby(grouping_keys)['Id'].nunique().reset_index()

	# Order the rows by user type, descending
	ridership = trip_counts.sort_values('User Type', ascending=False)

dottyz / story_bike_share_analyze_4.py

Created May 2, 2019 18:38

	# 2x2 grid of axes: unpacking yields the top row as ax1 (boxplots) and the bottom row
	# as ax2 (density curves); column i of the grid holds the i-th measure.
	fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(15, 15))
	for i, col in enumerate(['Duration', 'Distance']):
		# Graph the distribution plots (seaborn doesn't allow default separation for distplot),
		# so draw one curve per user type onto the same axes
		for user in df['User Type'].unique():
			sns.distplot(df[df['User Type']==user][col], hist=False, ax=ax2[i])

		# Graph the boxplots
		sns.boxplot(x=col, y='User Type', data=df, ax=ax1[i])
		ax1[i].set_xlabel('')

dottyz / story_bike_share_analyze_3.py

Created May 2, 2019 18:37

	# Build a table of the distinct routes together with the coordinates of both end stations
	route_columns = ['Route Id', 'Station Id From', 'Lat From', 'Lon From', 'Station Id To', 'Lat To', 'Lon To']
	maps = df[route_columns].drop_duplicates()

	# Distance between the two stations of each route; .km is kilometres, hence the factor of 1000
	maps['Distance'] = maps.apply(
		lambda row: distance.distance(
			(row['Lat From'], row['Lon From']),
			(row['Lat To'], row['Lon To']),
		).km * 1000,
		axis=1,
	)

	# Attach the distance to the main DataFrame, then keep only the columns whose name
	# contains neither 'From' nor 'To' and is not 'Route Id'
	df = df.merge(maps[['Route Id', 'Distance']], how='left', on='Route Id')
	kept_columns = [name for name in df.columns if 'From' not in name and 'To' not in name and name != 'Route Id']
	df = df[kept_columns]

dottyz / story_bike_share_analyze_2.py

Created May 2, 2019 18:36

	# Clean up column names for ease of use: remove the 'trip_' and '_seconds' fragments, then
	# turn the remaining snake_case into Title Case words (e.g. 'trip_duration_seconds' -> 'Duration')
	df.columns = [' '.join(x.replace('trip_', '').replace('_seconds', '').split('_')).title() for x in df.columns]

	df['Start Time'] = pd.to_datetime(df['Start Time'])

	# Derive the calendar fields with the vectorized .dt accessor rather than calling a Python
	# lambda once per row.
	start_time = df['Start Time']
	df['Date'] = start_time.dt.strftime('%Y-%m-%d')
	df['Quarter'] = start_time.dt.quarter
	# NOTE(review): month_type and day_type are not defined in this snippet; presumably they are
	# ordered CategoricalDtype instances created elsewhere - confirm.
	df['Month'] = start_time.dt.strftime('%B').astype(month_type)
	df['Day of Week'] = start_time.dt.strftime('%a').astype(day_type)
	df['Hour'] = start_time.dt.strftime('%H')

dottyz / story_bike_share_analyze_1.py

Created May 2, 2019 18:36

	import datetime as dt
	import re

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns

	from geopy import distance
	from pandas.api.types import CategoricalDtype

dottyz / story_bike_share_clean_8.py

Created May 2, 2019 18:35

# Write the DataFrame to CSV; index=False leaves the row index out of the file.
output_path = './data/bikeshare_ridership.csv'
df.to_csv(output_path, index=False)

dottyz / story_bike_share_clean_7.py

Created May 2, 2019 18:34

	# Removing false start trips: keep only trips lasting at least 60 seconds
	df = df[df['trip_duration_seconds'].ge(60)]

	# Removing outliers: take the lower and upper quartiles of the remaining durations
	# and the interquartile range between them
	q1, q3 = df['trip_duration_seconds'].quantile([0.25, 0.75])
	interquartile_range = q3 - q1

	df = df[~((df['trip_duration_seconds'] < (q1 - 1.5 * interquartile_range)) \

dottyz / story_bike_share_clean_6.py

Created May 2, 2019 18:32

	# Density curve (histogram disabled) of the trip durations on a single 16x9 figure
	durations = df['trip_duration_seconds']
	fig, ax = plt.subplots(figsize=(16, 9))
	sns.distplot(durations, hist=False, ax=ax)

Yizhao Tan dottyz