Created
February 21, 2013 16:48
-
-
Save braintreeps/5006126 to your computer and use it in GitHub Desktop.
This is the R code to go along with the blog post "Mapping 35 Million Credit Cards On Top of Census Data With R".
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table); | |
library(maps); | |
library(maptools); | |
library(spatstat); | |
library(zipcode); | |
library(GISTools) | |
#load the zipcode dataset | |
data(zipcode); | |
########## READ AND PREPARE THE DATA ########### | |
#read in our data; the csv has the following format: a key, a company_name, a postal_code, and a country_name | |
zips0<-read.csv("mergedZips0.csv", header=TRUE, colClasses=c("numeric", "character", "character", "character")) | |
#remove missing zip codes (there are a very small number of these) | |
zips0<-zips0[!is.na(zips0$postal_code),] | |
#get rid of some internal cards | |
zips0<-zips0[!grepl("Braintree", zips0$company_name),] | |
#get rid of zip codes that have any letters in them | |
zips0<-zips0[!grepl("[A-Z|a-z]", zips0$postal_code),] | |
#for zips that are zip+4, just keep the first five digits | |
zipPlus4Indices<-which(grepl('^[0-9]{5}-', zips0$postal_code)); | |
zips0[zipPlus4Indices, "postal_code"]<-substr(zips0[zipPlus4Indices,"postal_code"], 1, 5) | |
#now get rid of anything that's left that's not a well-formed 5 digit zip | |
zips0<-zips0[grepl('^[0-9]{5}', zips0$postal_code),]; | |
#and finally, get rid of anyone who has a country that's not the US to avoid any weird matching issues | |
#like foreign zips that could match onto a valid US zip for whatever reason | |
zips0<-zips0[is.na(zips0$country_name) || tolower(zips0$country_name)=="united states of america",] | |
#merge in the lat/long information from the zipcode dataset; from now on | |
#the "postal_code" column will be called "zip" because of the merge | |
mergedZips<-merge(zipcode, zips0, by.x=c("zip"), by.y=c("postal_code")) | |
#do some aggregation (count all cards in a zip) using a data.table, because it's much faster | |
mergedZipsDT<-data.table(mergedZips) | |
mergedZipsDT[, zipCount:=.N, by=list(zip)] | |
#keep only one row for each zip code | |
dedupedMergedZips<-unique(mergedZipsDT[,list(zip,latitude,longitude,zipCount)]) | |
########## READ IN THE CENSUS ZCTA DATA ########### | |
#read in the zip to zcta mapping (crosswalk) file | |
zctaToZip<-read.csv("Zip_to_ZCTA_Crosswalk_2011_JSI.csv", header=TRUE, colClasses=c(rep("character", 5))) | |
#read in the "gazeteer" file that has population, latitude, and longitude for the zcta's | |
gaz<-read.table("Gaz_zcta_national.txt", header=TRUE, colClasses=c("character", rep("numeric", 8))) | |
#merge everything together | |
#the inner merge maps our card zip/count data onto the zcta's using the crosswalk file | |
#the outer merge maps zcta onto population and lat/long centroid using the gazeteer file | |
zctaWithZipsAndLatLongs<-merge(merge(as.data.frame(dedupedMergedZips), zctaToZip, by.x=c("zip"), by.y=c("ZIP")), gaz, by.x=c("ZCTA_USE"), by.y=c("GEOID")) | |
#since multiple zips can be mapped onto the same zcta, we have to do some aggregation again, this time over zcta | |
zctaWithZipsAndLatLongsDT<-data.table(zctaWithZipsAndLatLongs) | |
zctaWithZipsAndLatLongsDT[, totalZipCount:=sum(zipCount), by=list(ZCTA_USE)] | |
#again, keep only one row for each zcta | |
dedupedZctaWithZipsAndLatLongsDT<-unique(zctaWithZipsAndLatLongsDT[,list(ZCTA_USE,INTPTLAT,INTPTLONG,totalZipCount,POP10)]) | |
#make a dataframe that will capture the card density type info that we want to plot | |
cardDensity<-as.data.frame(dedupedZctaWithZipsAndLatLongsDT[,list(ZCTA_USE, INTPTLAT, INTPTLONG, totalZipCount, POP10)]) | |
colnames(cardDensity)<-c("zcta", "latitude", "longitude", "cardCount", "pop") | |
cardDensity$cardDensity<-(cardDensity$cardCount/cardDensity$pop) | |
#get rid of a couple of places where the population is zero, which can cause the cardDensity to blow up | |
cardDensity<-cardDensity[cardDensity$pop!=0,] | |
#require that there be fewer than one card per capita and | |
#only look at places that have 100 people or more to exclude weird outliers | |
#like the NSA, airports, Grand Central, etc. where nobody lives | |
cardDensity<-cardDensity[cardDensity$cardDensity<1. & cardDensity$pop>=100,] | |
########## POINTS PLOT ########### | |
longitudeLimits<-c(-130, -56) | |
latitudeLimits<-c(15, 60) | |
#add some random jitter | |
longitudeJitterSize<-diff(range(longitudeLimits))/5000. | |
latitudeJitterSize<-diff(range(latitudeLimits))/5000. | |
longitudeJitters<-runif(nrow(mergedZips), -longitudeJitterSize, longitudeJitterSize) | |
latitudeJitters<-runif(nrow(mergedZips), -latitudeJitterSize, latitudeJitterSize) | |
png(filename="pointsmap.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1); | |
map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits); | |
points(mergedZips$longitude+longitudeJitters, mergedZips$latitude+latitudeJitters, col="#FA9C58", pch=19, cex=0.001); | |
map("state", col="white", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE); | |
dev.off() | |
########## HEATMAP WITH WEIGHTS (MUCH, MUCH FASTER TO DO IT THIS WAY...) ########### | |
#make the 2D histogram; need to have the check=FALSE option set, otherwise we throw away points, which messes up the length of the vectors | |
#when we try use use weights below | |
points<-ppp(dedupedMergedZips[!is.na(dedupedMergedZips$longitude), longitude], dedupedMergedZips[!is.na(dedupedMergedZips$latitude), latitude], longitudeLimits, latitudeLimits, check=FALSE); | |
png(filename="heatmap_weighted.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1); | |
spatstat.options(npixel=c(1000,1000)); | |
densitymap<-density(points, sigma=0.15, weights=dedupedMergedZips[!is.na(dedupedMergedZips$longitude), zipCount]); | |
map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits); | |
my.palette <- colorRampPalette(c("black", "gray", "orange", "white"), bias=5, space="rgb") | |
image(densitymap, col=my.palette(40), add=TRUE); | |
map("state", col="white", lwd=4.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE); | |
dev.off() | |
########## CHOROPLETH MAP ########### | |
#read in our census ZCTA shapefiles; the shapes objects has various slots: | |
#shapes@polygons has the actual polygon boundarys | |
#shapes@data has a dataframe, one row for each polygon | |
#it's important to retain the alignment between these two | |
shapes<-readShapeSpatial("tl_2010_us_zcta510.shp") | |
#convert the zip from a factor to a character | |
shapes@data$ZCTA_CHAR<-levels(shapes@data$ZCTA5CE10)[shapes@data$ZCTA5CE10] | |
#store all of our original row numbers, since we'll need to match them up later | |
shapes@data$original_rownum<-rownames(shapes@data) | |
#merge in our cardDensity data frame | |
#********************************************************************************* | |
#NOTE: MERGE SORTS THE ROWS BY DEFAULT, AND WE REALLY, REALLY DON'T WANT THAT!!!!! | |
#(TRUST ME, I WASTED A LOT OF TIME ON THIS) | |
#ALSO, WHEN ALL.X IS SET TO TRUE, IT PUTS THE ROWS WITH NO MATCH AT THE END, UNLIKE | |
#A REAL LEFT JOIN OR RIGHT JOIN IN SQL; SO WE HAVE TO SAVE THE ORIGINAL ROW NUMBERS | |
#AND RE-ORDER BY THEM SO THAT THINGS DON'T GET SCRAMBLED | |
#********************************************************************************* | |
merged<-merge(shapes@data, cardDensity, by.x=c("ZCTA_CHAR"), by.y=c("zcta"), all.x=TRUE, sort=FALSE); | |
merged<-merged[order(as.numeric(merged$original_rownum)),] | |
#********************************************************************************* | |
#force the row names of the data frame to be the same as the ID's of the polygons, | |
#because, according to | |
#http://rss.acs.unt.edu/Rdoc/library/sp/html/SpatialPolygonsDataFrame-class.html, | |
#things will get re-ordered otherwise and our plot will get scrambled | |
#this will allow you to see the row names of the polygons: | |
#ids<-sapply(shapes@polygons, function(x){x@ID}) | |
#this will allow you to see the row names of the dataframe: | |
#dataids<-rownames(shapes@data) | |
#they have to match: identical(ids, dataids) | |
#********************************************************************************* | |
rownames(merged)<-merged$original_rownum | |
shapes@data<-merged | |
#restrict ourselves to the continental US | |
statesToRemove<-c("AK", "HI", "PR") | |
shapes<-shapes[-which(shapes@data$ZCTA_CHAR %in% zctaToZip[zctaToZip$StateAbbr %in% statesToRemove,"ZCTA_USE"]),] | |
#make ourselves a function for plotting various quantities | |
plotFullChoropleth<-function(quantity){ | |
#make a color palette | |
my.palette <- colorRampPalette(c("black", "orange"), bias=1, space="Lab") | |
#pick our variable to color from | |
if(quantity=="POP_DENSITY"){ | |
#POPULATION DENSITY | |
shapes.plotvar<-(shapes@data$pop/(shapes@data$ALAND10+shapes@data$AWATER10)) | |
} else if(quantity=="RAW_POP"){ | |
#RAW POPULATION | |
shapes.plotvar<-shapes@data$pop | |
} else if(quantity=="TOT_AREA"){ | |
#TOTAL AREA | |
shapes.plotvar<-(shapes@data$ALAND10+shapes@data$AWATER10) | |
} else if(quantity=="RAW_CARDS"){ | |
#RAW CARD COUNT | |
shapes.plotvar<-shapes@data$cardCount | |
} else if(quantity=="CARDS_PER_CAP"){ | |
#PER CAPITA CARDS (TELLS US HOW THINGS SCALE WITH POPULATION DENSITY) | |
shapes.plotvar<-(shapes@data$cardCount/shapes@data$pop) | |
} else if(quantity=="CARD_DENSITY"){ | |
#CARD DENSITY | |
shapes.plotvar<-(shapes@data$cardCount/(shapes@data$ALAND10+shapes@data$AWATER10)) | |
} else { | |
stop("Dont' know that quantity!!") | |
} | |
#ignore any zcta's with outlier data | |
shapes.plotvar[is.na(shapes.plotvar)]<-0 | |
shapes.plotvar[!is.finite(shapes.plotvar)]<-0 | |
nShades<-80 | |
shades<-auto.shading(shapes.plotvar ,n=nShades, cols=my.palette(nShades+1)) | |
par.settings=list(axis.line=list(col=NA)) | |
choropleth(shapes, v=shapes.plotvar, shades, bg="#000000", border=NA, lwd=0.1) | |
map("state", col="white", lwd=1.0, add=TRUE); | |
} | |
png(filename="chloroplethmap.png", type="quartz", bg="white", width=10.*960, height=5.*960, pointsize=1); | |
split.screen(c(2,2)) | |
screen(1) | |
plotFullChoropleth("CARD_DENSITY") | |
screen(2) | |
plotFullChoropleth("POP_DENSITY") | |
screen(3) | |
plotFullChoropleth("CARDS_PER_CAP") | |
dev.off() | |
close.screen(all=TRUE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment