Skip to content

Instantly share code, notes, and snippets.

@matmoody
Created April 30, 2016 17:03
Show Gist options
  • Save matmoody/531456524169716929947f39fb043793 to your computer and use it in GitHub Desktop.
Save matmoody/531456524169716929947f39fb043793 to your computer and use it in GitHub Desktop.
Data analysis of CitiBike data. Checking which locations have the most activity on Saturday mornings.import requests
import collections
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlite3 as lite
import time
from pandas.io.json import json_normalize
from dateutil.parser import parse
%matplotlib inline
r = requests.get('http://www.citibikenyc.com/stations/json')
#unique list of keys for each station listing
key_list = []
for station in r.json()['stationBeanList']:
for k in station.keys():
if k not in key_list:
key_list.append(k)
df = json_normalize(r.json()['stationBeanList'])
con = lite.connect('citi_bike.db')
cur = con.cursor()
with con:
cur.execute('CREATE TABLE citibike_reference (id INT PRIMARY KEY, totalDocks INT, city TEXT, altitude INT, stAddress2 TEXT, longitude NUMERIC, postalCode TEXT, testStation TEXT, stAddress1 TEXT, stationName TEXT, landMark TEXT, latitude NUMERIC, location TEXT )')
#extract the column from the DataFrame and put them into a list
station_ids = df['id'].tolist()
#add the '_' to the station name and also add the data type for SQLite
station_ids = ['_' + str(x) + ' INT' for x in station_ids]
#create the table
#in this case, we're concatenating the string and joining all the station ids (now with '_' and 'INT' added)
with con:
cur.execute("CREATE TABLE available_bikes ( execution_time INT, " + ", ".join(station_ids) + ");")
#take the string and parse it into a Python datetime object
exec_time = parse(r.json()['executionTime'])
for i in range(60):
r = requests.get('http://www.citibikenyc.com/stations/json')
exec_time = parse(r.json()['executionTime']).strftime("%s")
cur.execute('INSERT INTO available_bikes (execution_time) VALUES (?)', (exec_time,))
for station in r.json()['stationBeanList']:
cur.execute("UPDATE available_bikes SET _%d = %d WHERE execution_time = %s" % (station['id'], station['availableBikes'], exec_time))
con.commit()
time.sleep(60)
con.close()
df = pd.read_sql_query("SELECT * FROM available_bikes ORDER BY execution_time", con, index_col='execution_time')
hour_change = collections.defaultdict(int)
for col in df.columns:
station_vals = df[col].tolist()
# strip the "_"
station_id = col[1:]
station_change = 0
for k,v in enumerate(station_vals):
if k < len(station_vals) - 1:
station_change += abs(station_vals[k] - station_vals[k+1])
# convert station id back to int
hour_change[int(station_id)] = station_change
def keywithmaxval(d):
"""Find key with greatest value"""
return max(d, key=lambda k: d[k])
# assing max key to max_station
max_station = keywithmaxval(hour_change)
#query sqlite for reference information
cur.execute("SELECT id, stationname, latitude, longitude FROM citibike_reference WHERE id = ?", (max_station,))
data = cur.fetchone()
print "The most active station is station id %s at %s latitude: %s longitude: %s" % data
print "With %d bicycles coming and going in the hour between %s and %s" % (
hour_change[max_station],
datetime.datetime.fromtimestamp(int(df.index[0])).strftime('%Y-%m-%dT%H:%M:%S'),
datetime.datetime.fromtimestamp(int(df.index[-1])).strftime("%Y-%m-%dT%H:%M:%S"),
)
# Visually inspect ^ result.
plt.bar(hour_change.keys(), hour_change.values())
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment