dottyz’s gists

dottyz / story_bike_share_analyze_3.py

Created May 2, 2019 18:37

	# One row per distinct combination of route, stations and endpoint coordinates
	maps = df[[
		'Route Id',
		'Station Id From', 'Lat From', 'Lon From',
		'Station Id To', 'Lat To', 'Lon To',
	]].drop_duplicates()

	# Distance between the two endpoints of each route, converted from kilometres to metres
	maps['Distance'] = maps.apply(
		lambda row: distance.distance(
			(row['Lat From'], row['Lon From']),
			(row['Lat To'], row['Lon To']),
		).km * 1000,
		axis=1,
	)

	# Attach the distance to every trip via its route, then keep only the columns
	# that are neither 'From'/'To' fields nor the route key itself
	df = df.merge(maps[['Route Id', 'Distance']], on='Route Id', how='left')
	df = df[[col for col in df.columns if not ('From' in col or 'To' in col or col == 'Route Id')]]

dottyz / story_bike_share_analyze_4.py

Created May 2, 2019 18:38

	# 2x2 grid of axes: the top row (ax1) receives the boxplots and the bottom row (ax2)
	# the density curves; column i of the grid belongs to the i-th metric below.
	fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(15, 15))
	for i, col in enumerate(['Duration', 'Distance']):
		# Graph the distribution plots, one curve per user type drawn on the same axis
		# (seaborn doesn't allow default separation for distplot)
		# NOTE(review): distplot is deprecated in newer seaborn releases and kdeplot is the
		# documented replacement for hist=False - confirm the installed version before switching.
		for user in df['User Type'].unique():
			sns.distplot(df[df['User Type'] == user][col], hist=False, ax=ax2[i])

		# Graph the boxplots, split by user type, and blank out the x-axis label
		sns.boxplot(x=col, y='User Type', data=df, ax=ax1[i])
		ax1[i].set_xlabel('')

dottyz / story_bike_share_analyze_5.py

Created May 2, 2019 18:38

	# Count the distinct trip Ids for every combination of the date fields and user type,
	# then order the rows by user type in descending order
	ridership = (
		df.groupby(['Date', 'Quarter', 'Month', 'Day of Week', 'User Type'])['Id']
		.nunique()
		.reset_index()
		.sort_values('User Type', ascending=False)
	)

dottyz / story_bike_share_analyze_6.py

Created May 2, 2019 18:39

	# Line plot of the daily trip counts, one line per user type
	p = sns.relplot(x='Date', y='Id', hue='User Type', data=ridership, kind='line', height=9, aspect=16/9.)
	p.ax.set_ylabel('Total Daily Trips')

	# Limit the number of labels shown on the x-axis to only show the first day of each month.
	# The labels are built as plain strings: Text.set_text() returns None, so calling it inside
	# the list comprehension would hand set_xticklabels a mix of None and Text objects.
	tick_texts = [label.get_text() for label in p.ax.get_xticklabels()]
	p.ax.set_xticklabels([text if text.endswith('-01') else '' for text in tick_texts], rotation=45)

dottyz / story_bike_share_analyze_7.py

Created May 2, 2019 18:39

	# One panel per month, side by side
	fig, axes = plt.subplots(1, 2, figsize=(20, 7))
	for ax, m in zip(axes, ['January', 'July']):
		# Member rows for this month, with the trip counts summed per date
		is_member_month = (ridership['Month'] == m) & (ridership['User Type'] == 'Member')
		daily_totals = ridership[is_member_month].groupby('Date')['Id'].sum().reset_index()
		sns.pointplot(data=daily_totals, x='Date', y='Id', markers='', ax=ax)

		# Label the panel and tilt the date labels
		ax.set_title(m)
		ax.set_ylabel('Total Daily Trips')
		ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

dottyz / story_bike_share_analyze_8.py

Created May 2, 2019 18:40

	fig, axes = plt.subplots(2, 2, figsize=(15, 15))

	# Work with a 1-D sequence of the four axes so they can be paired with the quarters
	axes = np.array(axes).ravel()

	# Month-range title for each quarter, indexed by quarter number minus one
	quarter_names = [
		'Jan. - Mar.',
		'Apr. - Jun.',
		'Jul. - Sept.',
		'Oct. - Dec.',
	]

	# Pair each quarter (in ascending order) with one axis and title it
	for ax, q in zip(axes, sorted(ridership['Quarter'].unique())):
		ax.set_title(quarter_names[q - 1])

dottyz / story_bike_share_analyze_9.py

Created May 2, 2019 18:41

	# One row of three panels, paired below with the unique months found in quarter 3
	fig, axes = plt.subplots(1, 3, figsize=(18, 6))
	axes = np.array(axes).flatten()
	for m, ax in zip(ridership[ridership['Quarter']==3]['Month'].unique(), axes):
	# Title each panel with its month and fix the y-range to 0-7000 on every panel
	ax.set_title(m)
	ax.set_ylim(0, 7000)
	ax.set_ylabel('Average Daily Trips')

	# NOTE(review): this barplot call is cut off in this view; its remaining arguments are not visible
	sns.barplot(
	x='Day of Week',
	y='Id',

dottyz / story_bike_share_analyze_10.py

Created May 2, 2019 18:41

	# Distinct trips per (date, hour, user type), then averaged across dates
	# for each hour and user type
	data = (
		df.groupby(['Date', 'Hour', 'User Type'])['Id']
		.nunique()
		.groupby(['Hour', 'User Type'])
		.mean()
		.reset_index()
	)

	# Grouped bars: one bar per user type for every hour
	fig, ax = plt.subplots(figsize=(16, 9))
	sns.barplot(data=data, x='Hour', y='Id', hue='User Type', ax=ax)
	ax.set_ylabel('Average Hourly Trips')

dottyz / story_bike_share_analyze_11.py

Last active May 3, 2019 14:22

	# Load the weather data; header=22 selects the column-name row, so the
	# weather-station description rows above it are skipped
	weather = pd.read_csv('./data/weather.csv', header=22)

	# Rename 'Date/Time' to 'Date' and strip the parenthesised units (e.g. Celsius, mm)
	# from every other column name
	weather.columns = [
		'Date' if name == 'Date/Time' else re.sub(r'\([^()]*\)', '', name).strip()
		for name in weather.columns
	]

	# Distinct trips per date, reshaped so that each user type gets its own column
	trip_counts = df.groupby(['Date', 'User Type'])['Id'].nunique().to_frame()
	data = trip_counts.pivot_table(index='Date', columns='User Type').reset_index()
	data.columns = ['Date', 'Casual Trips', 'Member Trips']

	# Keep only the dates present in both the trip counts and the weather table
	data = data.merge(weather[['Date', 'Mean Temp', 'Total Precip']], how='inner', on='Date')

dottyz / story_bike_share_analyze_12.py

Created May 2, 2019 18:43

	fig, ax = plt.subplots(figsize=(16, 9))

	# Default seaborn colour palette: entries 0 and 1 colour the trip lines, entry 2 the temperature
	palette = sns.color_palette()
	# Twin axis so that ridership and temperature can be displayed on the same graph
	ax2 = ax.twinx()

	# One ridership line per user type on the primary axis
	for i, user_type in enumerate(['Casual Trips', 'Member Trips']):
		sns.lineplot(data=data, x='Date', y=user_type, color=palette[i], markers='', ax=ax)

	# Mean temperature on the twin axis
	sns.pointplot(data=data, x='Date', y='Mean Temp', color=palette[2], markers='x', ax=ax2)

Yizhao Tan dottyz