Created
April 18, 2017 03:13
-
-
Save linwoodc3/f407377dc0b1ed8f2db163d132c2b386 to your computer and use it in GitHub Desktop.
matplotlib plotting functions for my District Data Labs Twitter post.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Linwood Creekmore
# email: [email protected]
# date: 17 April 2017
import matplotlib.pyplot as plt | |
from matplotlib import gridspec | |
import matplotlib.patches as patches | |
import datetime | |
import pandas as pd | |
import numpy as np | |
def countplot(geodataframe, data, colorlist):
    '''Plot the most-tweeted languages as a bar chart beside a table of counts.

    The left panel is a bar chart of every language whose count exceeds
    2% of the total in `data`, sorted from largest to smallest, with the
    tallest bar outlined in red.  The right panel is a one-column table
    listing every value of `data`.  The figure is shown with `plt.show()`.

    Parameters
    ----------
    geodataframe : geopandas GeoDataFrame
        Original tweet data.  It is not read by this function; the
        parameter is kept so existing callers continue to work.
    data : pandas Series
        Tweet counts per language.
    colorlist : list of str
        Hex colour codes *without* the leading '#', one per bar.

    Returns
    -------
    None

    Notes
    -----
    The x tick labels (7 names) and the table row labels (15 names) are
    hard-coded, so `data` is assumed to hold the 15 languages of the
    original dataset in descending order of count, 7 of which pass the
    2% filter.  NOTE(review): confirm against the caller before reusing
    this function with other data.
    '''
    # Build the axes from the GridSpec only.  Creating them first with
    # plt.subplots() leaves two extra, empty axes underneath on Matplotlib
    # versions that no longer auto-remove overlapping axes.
    plt.figure(figsize=(20, 17))
    gs = gridspec.GridSpec(2, 2, height_ratios=[3, 1], width_ratios=[3, 1])

    # ---- left panel: bar chart -------------------------------------------
    ax1 = plt.subplot(gs[0])
    colors = ["#{0}".format(code) for code in colorlist]

    # keep only languages holding more than 2% of all tweets
    top = data[data > (data.sum() * .02)].sort_values(ascending=False)
    n_bars = len(top)

    # highlight the highest bar with a red outline; the rest have no edge
    top.plot(kind='bar',
             ax=ax1,
             linewidth=[2.5] + [0] * (n_bars - 1),
             edgecolor=['red'] + ['#EDEDED'] * (n_bars - 1),
             color=colors,
             stacked=True)

    # horizontal grid lines only, drawn beneath the bars
    ax1.grid(False)
    ax1.yaxis.grid(True, color='w', ls='-', lw=1.5, zorder=0)
    ax1.set_axisbelow(True)

    ax1.set_xticklabels(['English', "Indonesian", 'Japanese', "Spanish",
                         'Turkish', 'Portuguese', 'Tagalog (Filipino)'],
                        rotation=45)
    ax1.tick_params(axis='both', labelsize=20)
    ax1.set_xlabel('Language of Tweet', fontsize=24)
    ax1.set_ylabel('Count', fontsize=24)

    # The annotation text is passed positionally: the keyword was renamed
    # from `s` to `text` in Matplotlib 3.3, so positional works everywhere.
    ax1.annotate(
        ('Twitter is a US-based company so it\ncomes as no surprise that'
         ' the English\nlanguage dominates our sample of\n~600,000 tweets.'),
        xy=(0.3, 185000), xytext=(2.3, 119000),
        fontsize=22, style='italic',
        bbox={'facecolor': '#6794a7', 'alpha': 0.5, 'pad': 10},
        arrowprops=dict(facecolor='black', shrink=0.05),
        multialignment='left')

    plt.suptitle('Top Tweeted Languages',
                 fontsize=47, fontweight='bold')

    # ---- right panel: table of counts ------------------------------------
    ax2 = plt.subplot(gs[1])
    ax2.xaxis.set_visible(False)
    ax2.yaxis.set_visible(False)

    cell_text = data.values[:, np.newaxis]  # one column, one row per language
    row_labels = ["English", 'Indonesian', 'Japanese', 'Spanish', 'Turkish',
                  'Portuguese', 'Tagalog', 'Thai', 'Russian', 'French',
                  'Italian', 'German', 'Estonian', 'Arabic', 'Dutch']
    n_rows = len(cell_text)

    # colColours without colLabels yields a blank, coloured header row
    the_table = ax2.table(cellText=cell_text,
                          rowLabels=row_labels,
                          colWidths=[0.25],
                          rowColours=["#d0daec"] * n_rows,
                          colColours=['#eceff6'],
                          cellColours=[['#eceff6']] * n_rows,
                          loc='center')
    ax2.axis('tight')
    the_table.set_fontsize(20)
    the_table.scale(2.5, 3.9)
    ax2.set_axis_off()

    for cell in the_table.get_celld().values():
        cell.set_linewidth(0.2)

    plt.show()
def countryplot(geodataframe, data, colorlist):
    '''Plot tweet counts per country as horizontal bars beside a table.

    The left panel is a horizontal bar chart of every value in `data`,
    sorted so the largest bar is on top, with the 15 largest bars
    labelled with their value.  The right panel is a one-column table of
    the 14 largest values of `data`.  The figure is shown with
    `plt.show()`.

    Parameters
    ----------
    geodataframe : geopandas GeoDataFrame
        Original data.  It is grouped by its 'NAME' column (falling back
        to 'name') and the 14 most frequent names become the table's
        row labels.
    data : pandas Series
        Counts per country to plot.
    colorlist : list
        Not used; bars are coloured with the 'RdBu_r' colormap.  Kept so
        existing callers continue to work.

    Returns
    -------
    None

    Notes
    -----
    Table row labels come from `geodataframe` while the table values come
    from `data`.  NOTE(review): this presumably assumes `data` is the same
    per-name count, so that both orderings agree -- confirm with caller.
    The annotation positions and text are specific to the original
    dataset.
    '''
    # Build the axes from the GridSpec only.  Creating them first with
    # plt.subplots() leaves two extra, empty axes underneath on Matplotlib
    # versions that no longer auto-remove overlapping axes.
    plt.figure(figsize=(20, 17), frameon=True)
    gs = gridspec.GridSpec(2, 2, height_ratios=[3, 1], width_ratios=[3, 1])

    # ---- left panel: horizontal bar chart --------------------------------
    ax1 = plt.subplot(gs[0])

    # count rows per country; the name column may be upper- or lower-case
    try:
        countrycount = geodataframe.groupby(['NAME'])['NAME'].count()
    except KeyError:
        countrycount = geodataframe.groupby(['name'])['name'].count()

    data.sort_values(ascending=True).plot(kind='barh',
                                          ax=ax1,
                                          colormap='RdBu_r')

    # vertical grid lines only
    ax1.yaxis.label.set_visible(False)
    ax1.grid(False)
    ax1.xaxis.grid(True, color='w', ls='-', lw=1.5, zorder=0)
    ax1.tick_params(axis='both', labelsize=24)

    plt.suptitle('Top Tweeting Countries',
                 fontsize=38, fontweight='bold')

    ax1.text(38000, 2.6,
             ('Surprisingly, the United States is not the\ngreatest user'
              ' of Twitter in our dataset. This\ncould be for a number of reasons,\n'
              'especially given my unscientific retreival\nof the data.'
              ' We do however, see some\nconsistency in the data. Seven of the top ten\n'
              'Twitter-using countries are in the '
              'top 10\nof my unscientifically collected dataset.'),
             fontsize=22,
             bbox={'facecolor': '#6794a7', 'alpha': 0.5, 'pad': 18},
             multialignment='left')
    ax1.text(37000, 0.98,
             'Comparison Source:\nNumber of active Twitter users in leading markets as of May 2016 \nhttps://www.statista.com/',
             style='italic',
             bbox={'facecolor': 'whitesmoke', 'alpha': 0.5, 'pad': 10},
             fontsize=14)

    # Value labels for the 15 largest bars.  Bars are drawn smallest-first,
    # so the largest ones occupy the last positions; offset the labels so
    # they stay on the right bars even when `data` has more than 15 rows.
    top_values = (data.sort_values(ascending=False)[:15]
                  .sort_values(ascending=True).values)
    first_bar = len(data) - len(top_values)
    for offset, v in enumerate(top_values):
        # wider numbers need a larger shift to sit inside the bar end
        shift = 8200 if v > 10000 else 5700
        ax1.text(v - shift, first_bar + offset - .13, str(v),
                 color='white', fontweight='bold', fontsize=18)

    # ---- right panel: table of the 14 largest counts ---------------------
    ax2 = plt.subplot(gs[1])
    ax2.xaxis.set_visible(False)
    ax2.yaxis.set_visible(False)

    cell_text = data.sort_values(ascending=False)[:14].values[:, np.newaxis]
    row_labels = countrycount.sort_values(ascending=False)[:14].index.values
    n_rows = len(cell_text)

    # colColours without colLabels yields a blank, coloured header row
    the_table = ax2.table(cellText=cell_text,
                          rowLabels=row_labels,
                          colWidths=[0.10],
                          rowColours=["#d0daec"] * n_rows,
                          colColours=['#eceff6'],
                          cellColours=[['#eceff6']] * n_rows,
                          loc='center')
    ax2.axis('tight')
    the_table.set_fontsize(23)
    the_table.scale(3, 3.2)
    ax2.set_axis_off()

    for cell in the_table.get_celld().values():
        cell.set_linewidth(0.2)

    plt.show()
def hourplot(geodataframe, country1='United States', country2='Indonesia'):
    '''Compare two countries' tweet counts by hour of the day.

    Rows of `geodataframe` are selected by their `NAME` column, grouped
    by the hour of their `normtime` value, and drawn as two overlaid bar
    series (`country1` in front of `country2`).  The figure is shown with
    `plt.show()`.

    Parameters
    ----------
    geodataframe : geopandas GeoDataFrame
        Original data.  Must provide a `NAME` column and a `normtime`
        column whose values expose an `.hour` attribute.
        NOTE(review): `normtime` is presumably already converted to each
        tweet's local time -- no conversion happens here; confirm.
    country1 : str, optional
        Country drawn in the foreground (default 'United States').
    country2 : str, optional
        Country drawn in the background (default 'Indonesia').

    Returns
    -------
    None

    Notes
    -----
    The title, highlight rectangle and annotations are hard-coded for
    the original United States / Indonesia comparison.
    '''
    hours = np.arange(24)
    # extract the hour once; it is the grouping key for both countries
    hour_of_day = geodataframe.normtime.apply(lambda stamp: stamp.hour)

    def _hourly_counts(country):
        # Tweets per hour for one country, with every hour 0-23 present.
        in_country = geodataframe.NAME == country
        counts = geodataframe.normtime[in_country].groupby(hour_of_day).size()
        # Bars are placed by position, so an hour with no tweets must still
        # occupy its slot or every later bar shifts onto the wrong hour.
        return counts.reindex(hours, fill_value=0)

    us_count = _hourly_counts(country1)
    indo_count = _hourly_counts(country2)

    _, ax = plt.subplots(figsize=(20, 12))
    ax.set_xticks(hours, minor=True)
    ax.tick_params(axis='both', labelsize=20)

    # country1 in front (zorder=10), country2 behind it
    us_count.plot(kind='bar', width=0.8, ax=ax, color='#01a2d9',
                  label=country1, alpha=1, zorder=10)
    ax.bar(hours, indo_count.values, color='#014d64',
           width=0.8, label=country2)

    ax.set_xlabel('Hour of the Day (Country Local Time)', fontsize=22)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

    # horizontal grid lines only
    ax.grid(False)
    ax.yaxis.grid(True, color='w', ls='-', lw=1.5, zorder=0)

    # translucent rectangle highlighting the hours under discussion
    ax.add_patch(
        patches.Rectangle(
            (9.8, 0),   # (x, y) of the lower-left corner
            9,          # width
            11300,      # height
            fill=True, color="#ff6d6d",
            alpha=0.3, zorder=-1))

    # The annotation text is passed positionally: the keyword was renamed
    # from `s` to `text` in Matplotlib 3.3, so positional works everywhere.
    ax.annotate(("Huge gap in U.S. data at local peak\ntime"
                 " for Twitter usage (1100-1300 Local)\n"
                 "just when Indonesia Twitter usage peaks. \n"
                 "This explains our anomaly."),
                xy=(11.5, 10000), xytext=(1, 9500),
                arrowprops=dict(facecolor='black', shrink=0.05),
                bbox={'facecolor': '#6794a7', 'alpha': .8, 'pad': 10},
                fontsize=20, style='italic')
    ax.text(1, 8380,
            ('Comparison Source:\nThe Biggest Social Media Science Study: What'
             ' 4.8 Million Tweets \nSay About the Best Time to Tweet\nhttps://blog'
             '.bufferapp.com/best-time-to-tweet-research'),
            style='italic',
            bbox={'facecolor': 'whitesmoke', 'alpha': 0.5, 'pad': 10},
            fontsize=10)

    plt.suptitle('United States Data has Gap During Peak Usage Time (in Local Time Hours)',
                 fontsize=25, fontweight='bold')
    # Single legend call; an earlier one with fontsize=22 used to be
    # replaced by this one, so the rendered legend is unchanged.
    ax.legend(loc='best')

    plt.show()
def waterplot(geodataframe, data, colorlist):
    '''Plot tweet counts per body of water as horizontal bars beside a table.

    The left panel is a horizontal bar chart of every value in `data`,
    sorted so the largest bar is on top, with the 10 largest bars
    labelled with their value.  The right panel is a one-column table of
    the 10 largest values of `data`.  The figure is shown with
    `plt.show()`.

    Parameters
    ----------
    geodataframe : geopandas GeoDataFrame
        Original data.  It is grouped by its 'NAME' column (falling back
        to 'name') and the 10 most frequent names become the table's
        row labels.
    data : pandas Series
        Counts per body of water to plot.
    colorlist : list
        Not used; bars are coloured with the 'RdBu_r' colormap.  Kept so
        existing callers continue to work.

    Returns
    -------
    None

    Notes
    -----
    Table row labels come from `geodataframe` while the table values come
    from `data`.  NOTE(review): this presumably assumes `data` is the same
    per-name count, so that both orderings agree -- confirm with caller.
    The annotation position and text are specific to the original
    dataset.
    '''
    # Build the axes from the GridSpec only.  Creating them first with
    # plt.subplots() leaves two extra, empty axes underneath on Matplotlib
    # versions that no longer auto-remove overlapping axes.
    plt.figure(figsize=(20, 15), frameon=True)
    gs = gridspec.GridSpec(2, 2, height_ratios=[6, 1], width_ratios=[5, 1])

    # ---- left panel: horizontal bar chart --------------------------------
    ax1 = plt.subplot(gs[0])

    # count rows per name; the name column may be upper- or lower-case
    try:
        countrycount = geodataframe.groupby(['NAME'])['NAME'].count()
    except KeyError:
        countrycount = geodataframe.groupby(['name'])['name'].count()

    data.sort_values(ascending=True).plot(kind='barh',
                                          ax=ax1,
                                          colormap='RdBu_r')

    # vertical grid lines only
    ax1.yaxis.label.set_visible(False)
    ax1.grid(False)
    ax1.xaxis.grid(True, color='w', ls='-', lw=1.5, zorder=0)
    ax1.tick_params(axis='y', labelsize=18)
    ax1.tick_params(axis='x', labelsize=22)

    plt.suptitle('Top 10 Bodies of Water by Count of Tweets',
                 fontsize=38, fontweight='bold')

    ax1.text(440, 1.6,
             ('A good number of tweets occur from bodies\n'
              'of water. The North Atlantic and Pacific\n'
              'Oceans likley dominate because they hold\n'
              'major trade/travel routes.'),
             fontsize=22,
             bbox={'facecolor': '#6794a7', 'alpha': 0.5, 'pad': 18},
             multialignment='left')

    # Value labels for the 10 largest bars.  Bars are drawn smallest-first,
    # so the largest ones occupy the last positions; offset the labels so
    # they stay on the right bars even when `data` has more than 10 rows.
    top_values = (data.sort_values(ascending=False)[:10]
                  .sort_values(ascending=True).values)
    first_bar = len(data) - len(top_values)
    for offset, v in enumerate(top_values):
        # five-digit values get a smaller font so they fit inside the bar
        label_size = 18 if v > 10000 else 24
        ax1.text(v - 70, first_bar + offset - .13, str(v),
                 color='white', fontweight='bold', fontsize=label_size)

    # ---- right panel: table of the 10 largest counts ---------------------
    ax2 = plt.subplot(gs[1])
    ax2.xaxis.set_visible(False)
    ax2.yaxis.set_visible(False)

    cell_text = data.sort_values(ascending=False)[:10].values[:, np.newaxis]
    row_labels = countrycount.sort_values(ascending=False)[:10].index.values
    n_rows = len(cell_text)

    # colColours without colLabels yields a blank, coloured header row
    the_table = ax2.table(cellText=cell_text,
                          rowLabels=row_labels,
                          colWidths=[0.10],
                          rowColours=["#d0daec"] * n_rows,
                          colColours=['#eceff6'],
                          cellColours=[['#eceff6']] * n_rows,
                          loc='center')
    the_table.set_fontsize(20)
    the_table.scale(1.8, 4.3)
    ax2.set_axis_off()

    for cell in the_table.get_celld().values():
        cell.set_linewidth(0.2)

    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment