Skip to content

Instantly share code, notes, and snippets.

# Keep the stations that already carry an ID, re-attach the ID-less ones
# (matched earlier), then pull lat/lon for each station from the API data.
with_ids = stations[stations['station_id'].notnull()]
stations = pd.concat([with_ids, no_ids])
stations = stations.merge(
    bikeshare_stations[['station_id', 'lat', 'lon']],
    how='inner', on='station_id',
).drop_duplicates()

# Attach origin and destination station records to every trip row.
df = df.merge(stations, how='inner', left_on='from_station_name', right_on='name')
df = df.merge(stations, how='inner', left_on='to_station_name', right_on='name',
              suffixes=['_from', '_to'])
df = df.drop_duplicates()

# Drop the raw id/name columns and the trip stop time, keeping the merged fields.
keep_cols = [c for c in df.columns
             if not c.endswith(('_station_id', '_station_name')) and c != 'trip_stop_time']
df = df[keep_cols]
# Separate the stations without station IDs
no_ids = stations[stations['station_id'].isnull()]
# For each ID-less station, fuzzy-match its name against every station in
# the API data to try to recover a station_id.
# NOTE(review): indentation appears stripped in this paste, and the body of
# the final `if` is not visible in this chunk — presumably it records the
# best-scoring station_id; confirm against the original source.
for idx, miss in no_ids.iterrows():
max_score = 0
# Compare the similarity of the station without ID to each station in the API data
for i, exist in bikeshare_stations[['station_id', 'name']].iterrows():
# Levenshtein-style similarity score (0-100) between the two station names.
score = fuzz.ratio(miss['name'], exist['name'])
# Only accept strong matches (>80) and keep the best candidate seen so far.
if score > 80 and score > max_score:
# Build one unique (station_id, name) table from the trip origin and
# destination columns of the ridership data.
stations_start = df[['from_station_id', 'from_station_name']].rename(
    columns={'from_station_id': 'station_id', 'from_station_name': 'name'})
stations_end = df[['to_station_id', 'to_station_name']].rename(
    columns={'to_station_id': 'station_id', 'to_station_name': 'name'})
combined = pd.concat([stations_start, stations_end])
combined = combined.dropna(how='all').drop_duplicates()
stations = combined.reset_index(drop=True)
# Identify the date structure used by each of the files as a dict:
# * Key: data file name
# * Value: [datetime format, hour difference between the timezone used and Eastern timezone]
# NOTE(review): Q1/Q2 files are day-first ('%d/%m/%Y') while Q3/Q4 are
# month-first, and only Q1/Q2 carry a -4h offset — presumably a
# UTC-to-Eastern correction; confirm where the offset is applied downstream.
date_formats = {
'Bikeshare Ridership (2017 Q1).csv': ['%d/%m/%Y %H:%M', -4],
'Bikeshare Ridership (2017 Q2).csv': ['%d/%m/%Y %H:%M', -4],
'Bikeshare Ridership (2017 Q3).csv': ['%m/%d/%Y %H:%M', 0],
'Bikeshare Ridership (2017 Q4).csv': ['%m/%d/%y %H:%M:%S', 0],
}
df = pd.DataFrame() # Initiate an empty DataFrame (filled elsewhere in the script)
from datetime import timedelta
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import json
import os