Created
April 30, 2016 17:03
-
-
Save matmoody/531456524169716929947f39fb043793 to your computer and use it in GitHub Desktop.
Data analysis of CitiBike data. Checking which locations have the most activity on Saturday mornings.import requests
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import requests | |
import sqlite3 as lite | |
import time | |
from pandas.io.json import json_normalize | |
from dateutil.parser import parse | |
%matplotlib inline | |
r = requests.get('http://www.citibikenyc.com/stations/json') | |
#unique list of keys for each station listing | |
key_list = [] | |
for station in r.json()['stationBeanList']: | |
for k in station.keys(): | |
if k not in key_list: | |
key_list.append(k) | |
df = json_normalize(r.json()['stationBeanList']) | |
con = lite.connect('citi_bike.db') | |
cur = con.cursor() | |
with con: | |
cur.execute('CREATE TABLE citibike_reference (id INT PRIMARY KEY, totalDocks INT, city TEXT, altitude INT, stAddress2 TEXT, longitude NUMERIC, postalCode TEXT, testStation TEXT, stAddress1 TEXT, stationName TEXT, landMark TEXT, latitude NUMERIC, location TEXT )') | |
#extract the column from the DataFrame and put them into a list | |
station_ids = df['id'].tolist() | |
#add the '_' to the station name and also add the data type for SQLite | |
station_ids = ['_' + str(x) + ' INT' for x in station_ids] | |
#create the table | |
#in this case, we're concatenating the string and joining all the station ids (now with '_' and 'INT' added) | |
with con: | |
cur.execute("CREATE TABLE available_bikes ( execution_time INT, " + ", ".join(station_ids) + ");") | |
#take the string and parse it into a Python datetime object | |
exec_time = parse(r.json()['executionTime']) | |
for i in range(60): | |
r = requests.get('http://www.citibikenyc.com/stations/json') | |
exec_time = parse(r.json()['executionTime']).strftime("%s") | |
cur.execute('INSERT INTO available_bikes (execution_time) VALUES (?)', (exec_time,)) | |
for station in r.json()['stationBeanList']: | |
cur.execute("UPDATE available_bikes SET _%d = %d WHERE execution_time = %s" % (station['id'], station['availableBikes'], exec_time)) | |
con.commit() | |
time.sleep(60) | |
con.close() | |
df = pd.read_sql_query("SELECT * FROM available_bikes ORDER BY execution_time", con, index_col='execution_time') | |
hour_change = collections.defaultdict(int) | |
for col in df.columns: | |
station_vals = df[col].tolist() | |
# strip the "_" | |
station_id = col[1:] | |
station_change = 0 | |
for k,v in enumerate(station_vals): | |
if k < len(station_vals) - 1: | |
station_change += abs(station_vals[k] - station_vals[k+1]) | |
# convert station id back to int | |
hour_change[int(station_id)] = station_change | |
def keywithmaxval(d): | |
"""Find key with greatest value""" | |
return max(d, key=lambda k: d[k]) | |
# assing max key to max_station | |
max_station = keywithmaxval(hour_change) | |
#query sqlite for reference information | |
cur.execute("SELECT id, stationname, latitude, longitude FROM citibike_reference WHERE id = ?", (max_station,)) | |
data = cur.fetchone() | |
print "The most active station is station id %s at %s latitude: %s longitude: %s" % data | |
print "With %d bicycles coming and going in the hour between %s and %s" % ( | |
hour_change[max_station], | |
datetime.datetime.fromtimestamp(int(df.index[0])).strftime('%Y-%m-%dT%H:%M:%S'), | |
datetime.datetime.fromtimestamp(int(df.index[-1])).strftime("%Y-%m-%dT%H:%M:%S"), | |
) | |
# Visually inspect ^ result. | |
plt.bar(hour_change.keys(), hour_change.values()) | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment