Last active
February 19, 2019 12:44
-
-
Save jseabold/2b549d8a74711e0d5932 to your computer and use it in GitHub Desktop.
Recreation of a Tufte graphic in Python, based on an R blog post (http://asbcllc.com/blog/2015/January/gotham_2014_weather/) and its accompanying gist (https://gist.github.com/abresler/46c36c1a88c849b94b07).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import calendar
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FixedFormatter, FixedLocator
def to_colors(x):
    """Scale an 8-bit colour channel value (0-255) into the 0-1 range."""
    return x / 255.


def _rgb(red, green, blue):
    """Return a 0-1 RGB tuple built from three 8-bit channel values."""
    return tuple(to_colors(channel) for channel in (red, green, blue))


# Palette as concrete tuples. On Python 3 ``map`` returns a one-shot
# iterator, which cannot serve as a colour that several plotting calls
# reuse. The trailing comment on each line is the colour's hex code.
blue3 = _rgb(24, 116, 205)  # 1874CD
wheat2 = _rgb(238, 216, 174)  # EED8AE
wheat3 = _rgb(205, 186, 150)  # CDBA96
wheat4 = _rgb(139, 126, 102)  # 8B7E66
firebrick3 = _rgb(205, 38, 38)  # CD2626
gray30 = _rgb(77, 77, 77)  # 4D4D4D
# Load the raw observations, caching them in ``tufte.csv`` so the remote
# file is only fetched when no local copy exists.
if not os.path.exists("tufte.csv"):
    # ``\s+`` splits on runs of whitespace. The previous separator " *" is
    # a regex that also matches the empty string, which is not a usable
    # field delimiter. Passing ``names`` means no header row is read.
    dta = pd.read_csv("http://academic.udayton.edu/kissock/http/"
                      "Weather/gsod95-current/NYNEWYOR.txt", sep=r"\s+",
                      names=["month", "day", "year", "temp"])
    dta.to_csv("tufte.csv", index=False)
else:
    dta = pd.read_csv("tufte.csv")
def calc_summary_stats(x, t_crit=2.101):
    """Summarise a group of observations column by column.

    Parameters
    ----------
    x : pandas.DataFrame
        The observations of one group (as handed over by
        ``groupby(...).apply``). Each column is summarised independently.
    t_crit : float, optional
        Multiplier applied to the standard error to obtain the half-width
        of the interval around the mean. Defaults to 2.101, the value that
        used to be hard-coded (presumably a Student-t critical value for
        the number of years in the data -- confirm before changing).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``x`` with the columns ``lower`` (minimum),
        ``upper`` (maximum), ``avg`` (mean), ``std_err`` (standard error of
        the mean), ``ci_upper`` and ``ci_lower`` (mean +/- ``t_crit``
        standard errors).
    """
    avg = x.mean()
    std_err = x.std() / np.sqrt(len(x))
    half_width = t_crit * std_err
    return pd.DataFrame({
        "lower": x.min(),
        "upper": x.max(),
        "avg": avg,
        "std_err": std_err,
        "ci_upper": avg + half_width,
        "ci_lower": avg - half_width,
    })
# Index the observations by the date assembled from the year/month/day
# columns (encoded as the integer YYYYMMDD, then parsed).
dta = dta.set_index(pd.to_datetime(dta.year * 10000 + dta.month * 100
                                   + dta.day, format="%Y%m%d"))
# Keep only the temperature and drop rows holding the -99 sentinel
# (presumably the marker for a missing observation -- confirm).
dta = dta[["temp"]].query("temp != -99")

# Historical record: everything before 2014, summarised per calendar day.
past = dta[dta.index.year < 2014]
grouped = past.groupby([past.index.month, past.index.day])
past_stats = grouped.apply(calc_summary_stats)
# ``apply`` appends the summary's own row label ("temp") as the innermost
# index level; drop it so the index is just (month, day).
past_stats.index = past_stats.index.droplevel(-1)

# The year being plotted. Restricted to 2014 itself so that later years
# present in the data file cannot leak into the "present" series.
present = dta[dta.index.year == 2014]
grouped = present.groupby([present.index.month, present.index.day])

# Days of the present year that fall below the historical minimum ...
presentlows = grouped["temp"].min()
presentlows = presentlows[
    presentlows < past_stats["lower"].reindex(presentlows.index)]
# ... and days that exceed the historical maximum.
presenthighs = grouped["temp"].max()
presenthighs = presenthighs[
    presenthighs > past_stats["upper"].reindex(presenthighs.index)]
# One x position per row of the historical summary.
idx = range(len(past_stats))
# ``facecolor`` is the current name of the axes-background keyword that
# used to be spelled ``axisbg``.
fig, ax = plt.subplots(figsize=(20, 8), subplot_kw={'facecolor': 'white'},
                       facecolor='white')
# x position of every (month, day) among the historical bars. Placing the
# present-year data through this lookup keeps the line and the record
# markers on the same axis as the bars, with no hand-tuned leap-day shifts.
calendar_pos = {key: pos for pos, key in enumerate(past_stats.index)}

# plot the high-low bars
ax.vlines(idx, past_stats["lower"], past_stats["upper"], color=wheat3,
          alpha=.9, linewidth=1.5, zorder=-1)
# plot the confidence interval around the means
ax.vlines(idx, past_stats["ci_lower"], past_stats["ci_upper"],
          linewidth=1.5, color=wheat4, zorder=-1)
# plot the present year time-series at explicit x positions, instead of
# relying on whatever x values ``plot`` infers from a DataFrame
x_present = [calendar_pos[(stamp.month, stamp.day)]
             for stamp in present.index]
ax.plot(x_present, present["temp"].values, color='k', zorder=10)
# plot the highs and lows of the present year
x_highs = np.array([calendar_pos[key] for key in presenthighs.index],
                   dtype=int)
ax.plot(x_highs, presenthighs.values, 'ro')
x_lows = np.array([calendar_pos[key] for key in presentlows.index],
                  dtype=int)
ax.plot(x_lows, presentlows.values, 'bo')
# Hand-drawn key near the middle of the x axis: a sample record-range bar
# with a sample inner bar on top of it, an error bar marking the "normal
# range", and a text label for each element.
mid = len(idx) // 2
ax.vlines(idx[mid + 2], -5, 30, linewidth=15, color=wheat2)
ax.vlines(idx[mid + 2], 3, 19, linewidth=15, color=wheat4)
ax.errorbar(mid + 7, 9, yerr=6, capsize=4, capthick=1,
            color='black')
ax.text(mid + 8, 9, "Normal Range", verticalalignment='center')
ax.text(mid + 7, 30, "Record High")
ax.text(mid + 7, -5, "Record Low", verticalalignment='top')
ax.text(mid - 1, 9, "2014 Temperature",
        horizontalalignment='right')
##############
## text data
#
# NOTE: the day counts in both captions are hard-coded text; they are not
# derived from presentlows / presenthighs.
# The callouts point at the fifth record low and the first record high.
# ``.iloc`` makes that lookup positional regardless of the index labels.
ax.annotate("We had 30 days that were the\ncoldest since 1995",
            xy=(x_lows[4], presentlows.iloc[4]), xytext=(50, -45),
            textcoords='offset points',
            # headwidth=0 draws a plain line; the unsupported ``frac``
            # arrow property is no longer passed.
            arrowprops=dict(facecolor='blue',
                            width=2,
                            headwidth=0,
                            shrink=.05),
            color='blue', horizontalalignment='left')
ax.annotate("We had 5 days that were the\nhottest since 1995",
            xy=(x_highs[0], presenthighs.iloc[0]), xytext=(0, 40),
            textcoords='offset points',
            arrowprops=dict(facecolor='red',
                            width=2,
                            headwidth=0,
                            shrink=.05),
            color='red', horizontalalignment='center')
# Caption block; the final fragments are joined with a space so the text
# reads "year since 1995".
ax.text(69, 94, u"Data represents average daily temperatures. Accessible "
        "data dates back to\nJanuary 1, 1975. Data for 2014 is only "
        "available through December 16.\nAverage temperature for"
        u" the year was 54.8\u00b0 making 2014 the 6th coldest\nyear "
        "since 1995", verticalalignment='top', horizontalalignment='center')
##############
## formatting
#
# y axis: a tick every 10 degrees, labelled with a degree sign, plus white
# horizontal grid lines.
yticks = list(range(-10, 101, 10))
ax.yaxis.set_ticks(yticks)
ylabels = [str(i) + u"\u00b0" for i in yticks]
ax.yaxis.set_ticklabels(ylabels, fontsize=14)
ax.yaxis.grid(color='white', zorder=1)

# x axis: major ticks at the cumulative sum of each month's largest
# day-of-month in the historical data, i.e. at the month boundaries.
xticks = past.groupby(past.index.month).apply(
    lambda x: x.index.day.max()).cumsum().values
ax.xaxis.set_ticks(xticks)

left_spine = ax.spines['left']
left_spine.set_visible(True)
left_spine.set_color(wheat4)
left_spine.set_linewidth(2)

# Month names go on minor ticks halfway between consecutive boundaries.
# Dividing by 2. keeps the midpoints fractional on every Python version.
xticks = np.r_[0, xticks]
minor_xticks = (xticks[1:] + xticks[:-1]) / 2.
ax.xaxis.set_minor_locator(FixedLocator(minor_xticks))
ax.xaxis.set_minor_formatter(FixedFormatter(calendar.month_name[1:]))
# Major ticks only draw the dotted month separators; they carry no labels.
ax.xaxis.set_ticklabels([])
ax.xaxis.grid(color=wheat3, linestyle='dotted')

ax.set_title(" New York City's Weather in 2014", loc="left",
             fontsize=23)
ax.text(2, 97, " Temperature in Fahrenheit", fontsize=15,
        fontdict=dict(weight='bold'))
ax.set_xlim(0, len(idx))
ax.set_ylim(-10, 100)
fig.savefig("tufte.svg")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment