Skip to content

Instantly share code, notes, and snippets.

@inkhorn
Created April 9, 2014 01:42
Show Gist options
  • Save inkhorn/10217838 to your computer and use it in GitHub Desktop.
Save inkhorn/10217838 to your computer and use it in GitHub Desktop.
library(plyr)
library(ggplot2)
library(ggmap)
# Load the 2010 Ontario library statistics and add a 1/0 indicator column
# marking rows whose service type is "First Nations Library".
libraries <- read.csv("ontario_library_stats_2010.csv")
libraries$isFN <- ifelse(
  libraries$Library.Service.Type == "First Nations Library", 1, 0
)

# 'Proportionate' versions of the variables: each of columns 20:142 is divided
# by column 13 and written to columns 143:265, named after the source column
# with a ".P" suffix.
raw.cols <- 20:142
prop.cols <- 143:265
libraries[, prop.cols] <- sapply(
  libraries[, raw.cols],
  function(values) values / libraries[, 13]
)
names(libraries)[prop.cols] <- paste(names(libraries)[raw.cols], "P", sep = ".")

libraries$cardholders.per.resident <-
  libraries$X..of.Active.Library.Cardholders / libraries$Population..Resident.

# Min-max rescaling, (x - min) / (max - min), of every proportionate column;
# the results are written to columns 269:391.
libraries[, 269:391] <- sapply(libraries[, prop.cols], function(values) {
  lowest <- min(values, na.rm = TRUE)
  highest <- max(values, na.rm = TRUE)
  (values - lowest) / (highest - lowest)
})
# Here we gather the proportionate revenue vs expenditure columns and figure
# out which of them show a significant group difference between Other
# libraries and First Nations libraries.
#
# For every column position in 23:50 + 123 we record:
#   - variable / position: the column's name and index,
#   - pval: the Kruskal-Wallis p-value of that column split by isFN,
#   - med.FN / med.OTHER: each group's median of the column 126 positions
#     further right (presumably the min-max normed copy of the same variable;
#     confirm against the column layout of the data).
#
# The per-column rows are produced with lapply() and bound in a single
# rbind() call, rather than growing the data frame on every loop iteration.
rev.positions <- 23:50 + 123
results.rev <- do.call(rbind, lapply(rev.positions, function(i) {
  fn.rows <- which(libraries$isFN == 1)
  other.rows <- which(libraries$isFN == 0)
  data.frame(
    variable = names(libraries)[i],
    position = i,
    pval = kruskal.test(libraries[, i] ~ libraries$isFN)$p.value,
    med.FN = median(libraries[fn.rows, i + 126], na.rm = TRUE),
    med.OTHER = median(libraries[other.rows, i + 126], na.rm = TRUE)
  )
}))
# Keep only the significant differences (p < .05), and drop variables whose
# median is zero in both groups since they carry nothing to plot.
results.rev <- subset(results.rev, !(med.FN == 0 & med.OTHER == 0) & pval < .05)
# Plot the two group medians for every retained variable on a flipped axis:
# red points are First Nations libraries, black points are other libraries.
ggplot(results.rev) +
  geom_point(
    aes(x = variable, y = med.FN),
    colour = "red", size = 6, alpha = .5
  ) +
  geom_point(
    aes(x = variable, y = med.OTHER),
    colour = "black", size = 6, alpha = .5
  ) +
  coord_flip() +
  scale_y_continuous(
    name = "Normed Per-Resident Value \n(x-min(x)/max(x)-min(x))",
    limits = c(0, .25)
  ) +
  scale_x_discrete(name = "") +
  theme(
    axis.text.x = element_text(size = 14, colour = "black"),
    axis.text.y = element_text(size = 14, colour = "black"),
    axis.title.x = element_text(size = 17),
    plot.title = element_text(size = 17, face = "bold")
  ) +
  ggtitle("Costs and Revenues by Library Type\n(Red = First Nations, Black = Other)")
# Now for a whole whack of graphing!

# Violin plot of cardholders per resident, one violin per library type.
ggplot(
  libraries,
  aes(
    factor(isFN, labels = c("Other Libraries", "First Nations Libraries")),
    cardholders.per.resident
  )
) +
  geom_violin(fill = "darkgreen") +
  scale_x_discrete(name = "Library Type") +
  ggtitle("Cardholders per Resident Population by Library Type")

# Quantiles of cardholders per resident for each library type, rounded to two
# decimals.
cardholders.by.FN <- ddply(
  libraries,
  .(isFN.Library = factor(isFN, labels = c("Other Libraries", "First Nations Libraries"))),
  function(grp) round(quantile(grp$cardholders.per.resident, na.rm = TRUE), 2)
)

# Net profit: proportionate operating revenues minus proportionate operating
# expenditures.
libraries$rev.minus.cost <-
  libraries$Total.Operating.Revenues.P - libraries$Total.Operating.Expenditures.P

# Violin plot of net profit by library type, with the y axis limited to
# -50..50.
ggplot(
  libraries,
  aes(
    factor(isFN, labels = c("Other Libraries", "First Nations Libraries")),
    rev.minus.cost
  )
) +
  geom_violin(fill = "darkgreen") +
  scale_x_discrete(name = "Library Type") +
  scale_y_continuous(
    name = "Net Profit per Resident Population",
    limits = c(-50, 50)
  ) +
  ggtitle("Net Profit per Resident Population by Library Type\n(Zoomed in)")

# Quantiles of net profit for each library type, rounded to two decimals.
netprofit.by.FN <- ddply(
  libraries,
  .(isFN.Library = factor(isFN, labels = c("Other Libraries", "First Nations Libraries"))),
  function(grp) round(quantile(grp$rev.minus.cost, na.rm = TRUE), 2)
)
# Violin plot of column 13 by library type on a log10 axis (labelled as the
# resident population size). The comma label formatter lives in the scales
# package, which this script never attaches, so it is referenced as
# scales::comma to avoid an "object 'comma' not found" error.
ggplot(
  libraries,
  aes(
    x = factor(isFN, labels = c("Other Libraries", "First Nations Libraries")),
    y = libraries[, 13]
  )
) +
  geom_violin(fill = "darkgreen") +
  scale_x_discrete(name = "Library Type") +
  scale_y_log10(
    name = "Resident Population Size\n(log scaled)",
    breaks = c(
      25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000,
      100000, 250000, 500000, 1000000, 2500000
    ),
    labels = scales::comma
  ) +
  ggtitle("Distribution of Local Population Sizes by Library Type")

# Quantiles of the resident population for each library type.
pop.by.FN <- ddply(
  libraries,
  .(isFN.Library = factor(isFN, labels = c("Other Libraries", "First Nations Libraries"))),
  function(x) quantile(x$Population..Resident., na.rm = TRUE)
)

# Violin plot of column 66 by library type on a log10 axis (labelled as the
# number of English titles in circulation); same scales::comma formatter.
ggplot(
  libraries,
  aes(
    x = factor(isFN, labels = c("Other Libraries", "First Nations Libraries")),
    y = libraries[, 66]
  )
) +
  geom_violin(fill = "darkgreen") +
  scale_x_discrete(name = "Library Type") +
  scale_y_log10(
    name = "# English Titles in Circulation\n(log scaled)",
    breaks = c(
      25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000,
      100000, 165000, 250000, 500000
    ),
    labels = scales::comma
  ) +
  ggtitle("Distribution of English Titles in Circulation by Library Type")

# Quantiles of circulating English-language titles for each library type.
raw.eng.titles.by.FN <- ddply(
  libraries,
  .(isFN.Library = factor(isFN, labels = c("Other Libraries", "First Nations Libraries"))),
  function(x) quantile(x$Titles.Held..Circulating...English.Language., na.rm = TRUE)
)
# Violin plot of column 189 by library type (labelled as English titles in
# circulation relative to local population size).
ggplot(
  libraries,
  aes(
    x = factor(isFN, labels = c("Other Libraries", "First Nations Libraries")),
    y = libraries[, 189]
  )
) +
  geom_violin(fill = "darkgreen") +
  scale_x_discrete(name = "Library Type") +
  scale_y_continuous(name = "# English Titles in Circulation / Local Pop. Size") +
  ggtitle("Distribution of English Titles in Circulation per Resident by Library Type")

# Quantiles of the proportionate English-titles column for each library type.
eng.titles.by.FN <- ddply(
  libraries,
  .(isFN.Library = factor(isFN, labels = c("Other Libraries", "First Nations Libraries"))),
  function(grp) quantile(grp$Titles.Held..Circulating...English.Language..P, na.rm = TRUE)
)

# Violin plot of column 252 by library type (labelled as annual program
# attendance relative to local population size).
ggplot(
  libraries,
  aes(
    x = factor(isFN, labels = c("Other Libraries", "First Nations Libraries")),
    y = libraries[, 252]
  )
) +
  geom_violin(fill = "darkgreen") +
  scale_x_discrete(name = "Library Type") +
  scale_y_continuous(name = "Annual Program Attendance / Local Pop. Size") +
  ggtitle("Distribution of Annual Program Attendance \nper Resident by Library Type")

# Quantiles of the proportionate program-attendance column for each library
# type.
prog.attendance.by.FN <- ddply(
  libraries,
  .(isFN.Library = factor(isFN, labels = c("Other Libraries", "First Nations Libraries"))),
  function(grp) quantile(grp$Annual.program.attendance.P, na.rm = TRUE)
)
# Now for the mapping!
# The postal-code file (zipcodeset, taken from geocoder.ca) is read without a
# header row, so the column names are assigned by hand.
postals <- read.csv("zipcodeset.txt", header = FALSE)
names(postals) <- c("Postal", "Lat", "Lon", "City", "Prov")

# Attach latitude/longitude to each library by postal code. all.x = TRUE keeps
# libraries that have no match in the postal-code file.
postal.coords <- postals[, c("Postal", "Lat", "Lon")]
libraries <- merge(
  libraries, postal.coords,
  by.x = "Postal.Code", by.y = "Postal",
  all.x = TRUE
)

# Only the First Nations libraries are mapped.
libraries.fn <- subset(libraries, isFN == 1)
# Base map of Ontario that every point layer below is drawn on.
ontario <- qmap("ontario", zoom = 5)

# Title styling shared by the three maps.
map.title.theme <- theme(plot.title = element_text(size = 20, face = "bold"))

# Map 1: First Nations libraries coloured by net profit (red = low,
# green = high).
ontario +
  geom_point(
    aes(x = Lon, y = Lat, colour = rev.minus.cost),
    data = libraries.fn, alpha = .7, size = 6
  ) +
  scale_colour_continuous(
    low = "red", high = "green",
    name = "Net Profit per \nResident Population"
  ) +
  ggtitle("Net Profit per Local Resident\nAmongst First Nations Libraries") +
  map.title.theme

# Map 2: the same, restricted to libraries whose net profit is negative.
ontario +
  geom_point(
    aes(x = Lon, y = Lat, colour = rev.minus.cost),
    data = subset(libraries.fn, rev.minus.cost < 0), alpha = .7, size = 6
  ) +
  scale_colour_continuous(
    low = "darkred", high = "pink",
    name = "Net Profit per \nResident Population"
  ) +
  ggtitle("Net Profit per Local Resident\nAmongst First Nations Libraries\n(Only those with a net loss)") +
  map.title.theme

# Map 3: First Nations libraries coloured by the ratio of total operating
# revenues to total operating expenditures.
ontario +
  geom_point(
    aes(x = Lon, y = Lat, colour = Total.Operating.Revenues/Total.Operating.Expenditures),
    data = libraries.fn, alpha = .7, size = 6
  ) +
  scale_colour_continuous(
    low = "red", high = "green",
    name = "Operating Revenue \nto Cost Ratio"
  ) +
  ggtitle("Operating Revenue to Cost Ratio\nAmongst First Nations Libraries") +
  map.title.theme
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment