Skip to content

Instantly share code, notes, and snippets.

# Keep the stations that already carry an ID, re-attach the ID-less ones
# (matched earlier), then pull lat/lon for each station from the API data.
with_ids = stations[stations['station_id'].notnull()]
stations = pd.concat([with_ids, no_ids])
stations = stations.merge(
    bikeshare_stations[['station_id', 'lat', 'lon']],
    how='inner', on='station_id',
).drop_duplicates()

# Attach origin and destination station records to every trip row.
df = df.merge(stations, how='inner', left_on='from_station_name', right_on='name')
df = df.merge(stations, how='inner', left_on='to_station_name', right_on='name',
              suffixes=['_from', '_to'])
df = df.drop_duplicates()

# Drop the raw id/name columns and the trip stop time, keeping the merged fields.
keep_cols = [c for c in df.columns
             if not c.endswith(('_station_id', '_station_name')) and c != 'trip_stop_time']
df = df[keep_cols]
# Separate the stations without station IDs
no_ids = stations[stations['station_id'].isnull()]
# For each ID-less station, fuzzy-match its name against every station in
# the API data to try to recover a station_id.
# NOTE(review): indentation appears stripped in this paste, and the body of
# the final `if` is not visible in this chunk — presumably it records the
# best-scoring station_id; confirm against the original source.
for idx, miss in no_ids.iterrows():
max_score = 0
# Compare the similarity of the station without ID to each station in the API data
for i, exist in bikeshare_stations[['station_id', 'name']].iterrows():
# Levenshtein-style similarity score (0-100) between the two station names.
score = fuzz.ratio(miss['name'], exist['name'])
# Only accept strong matches (>80) and keep the best candidate seen so far.
if score > 80 and score > max_score:
# Build one unique (station_id, name) table from the trip origin and
# destination columns of the ridership data.
stations_start = df[['from_station_id', 'from_station_name']].rename(
    columns={'from_station_id': 'station_id', 'from_station_name': 'name'})
stations_end = df[['to_station_id', 'to_station_name']].rename(
    columns={'to_station_id': 'station_id', 'to_station_name': 'name'})
combined = pd.concat([stations_start, stations_end])
combined = combined.dropna(how='all').drop_duplicates()
stations = combined.reset_index(drop=True)
# Identify the date structure used by each of the files as a dict:
# * Key: data file name
# * Value: [datetime format, hour difference between the timezone used and Eastern timezone]
# NOTE(review): Q1/Q2 files are day-first ('%d/%m/%Y') while Q3/Q4 are
# month-first, and only Q1/Q2 carry a -4h offset — presumably a
# UTC-to-Eastern correction; confirm where the offset is applied downstream.
date_formats = {
'Bikeshare Ridership (2017 Q1).csv': ['%d/%m/%Y %H:%M', -4],
'Bikeshare Ridership (2017 Q2).csv': ['%d/%m/%Y %H:%M', -4],
'Bikeshare Ridership (2017 Q3).csv': ['%m/%d/%Y %H:%M', 0],
'Bikeshare Ridership (2017 Q4).csv': ['%m/%d/%y %H:%M:%S', 0],
}
df = pd.DataFrame() # Initiate an empty DataFrame (filled elsewhere in the script)
from datetime import timedelta
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import json
import os