Created
September 24, 2012 13:09
-
-
Save cbare/3775873 to your computer and use it in GitHub Desktop.
An exercise in data manipulation from chapter 1 of Machine Learning for Hackers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Cbare's code from Machine Learning for Hackers | |
## Chapter 1 - cleaning up data on UFO sightings | |
## | |
## for data files and lot's more R code, see: | |
## https://github.com/johnmyleswhite/ML_for_Hackers | |
## | |
############################################################ | |
library(ggplot2) | |
library(scales) | |
# Read the raw UFO sightings data (from infochimps).
# The file is tab-separated with no header row; empty fields become NA.
ufo <- read.delim("data/ufo/ufo_awesome.tsv",
                  sep = "\t", stringsAsFactors = FALSE,
                  header = FALSE, na.strings = "")

# give the columns nice names
names(ufo) <- c("DateOccurred",
                "DateReported",
                "Location",
                "ShortDescription",
                "Duration",
                "LongDescription")

# A usable date field is an 8-character string (YYYYMMDD).
# nchar() returns NA for a missing string, and an NA inside a logical row
# index produces an all-NA row rather than dropping the row, so missing
# values are flagged as bad explicitly.
is.bad.date <- function(x) {
  is.na(x) | nchar(x) != 8
}

# peek at a few rows whose DateOccurred is malformed
head(ufo[is.bad.date(ufo$DateOccurred), ])

# there are bad date fields, so drop those rows
bad.rows <- is.bad.date(ufo$DateOccurred) | is.bad.date(ufo$DateReported)
ufo <- ufo[!bad.rows, ]

# convert strings to dates
ufo$DateOccurred <- as.Date(ufo$DateOccurred, format = "%Y%m%d")
ufo$DateReported <- as.Date(ufo$DateReported, format = "%Y%m%d")
# Derive city and state columns from Location, where possible.
# US locations look like "Seattle, WA": comma-separated pieces with the
# state abbreviation in the last position.
loc.split <- strsplit(ufo$Location, ",\\s*")

# The final piece of each split, upper-cased, is the candidate state.
ufo$state <- toupper(unlist(lapply(loc.split, tail, n = 1L)))

# Every piece before the last is glued back together as the city.
ufo$city <- vapply(
  loc.split,
  function(parts) paste(head(parts, -1L), collapse = ","),
  character(1),
  USE.NAMES = FALSE
)

# Keep only rows whose candidate state is one of the abbreviations in the
# built-in "state" data sets (state.abb).
in.us <- ufo$state %in% state.abb
ufo.us <- ufo[in.us, ]
# read a file with state populations and keep only the state name and the
# 2000 population column
state.pop <- read.csv("data/census.csv")[, c("State", "X2000")]
colnames(state.pop) <- c("name", "pop.2000")

# Let's use 2000 populations, and also tack on abbreviations
# so we can merge, later.
# match() returns NA for any name that is not in the built-in state.name
# vector, so such rows get an NA abbreviation. A per-name which() lookup
# would return a ragged list for them and break the indexing of state.abb.
state.pop$abbrev <- state.abb[match(state.pop$name, state.name)]
# Tally sightings per US state: one row per state code with its count.
sightings.by.state <- setNames(
  as.data.frame(table(ufo.us$state)),
  c("state", "sightings")
)

# Attach population figures by joining the state code in the counts to the
# abbreviation column of the population table.
sightings.by.state <- merge(sightings.by.state, state.pop,
                            by.x = "state", by.y = "abbrev")

# Add a copy of the full state name and the per-capita sighting rate
# (sightings divided by the 2000 population).
sightings.by.state$state.name <- sightings.by.state$name
sightings.by.state$sightings.per.cap <-
  sightings.by.state$sightings / sightings.by.state$pop.2000

# Sort so the states with the most sightings per capita come first.
per.cap.order <- order(sightings.by.state$sightings.per.cap,
                       decreasing = TRUE)
sightings.by.state <- sightings.by.state[per.cap.order, ]
# Make a base-graphics boxplot of per-capita sightings. The value returned
# by boxplot() is kept because its $out and $group components (outlier
# values and the box each belongs to) are used to label the outliers below.
boxdata <- boxplot(sightings.by.state$sightings.per.cap,
                   ylab = "sightings per capita",
                   col = "#66FF5580")
title("UFO sightings by state")

# Label every outlier with its state code.
# seq_along() makes the loop a no-op when there are no outliers, whereas
# 1:length(x) would iterate over c(1, 0) and call text() with empty labels.
for (i in seq_along(boxdata$out)) {
  # The outlier values are copied from the plotted vector, so an exact
  # comparison finds the row(s) each one came from.
  is.this.outlier <- sightings.by.state$sightings.per.cap == boxdata$out[i]
  text(boxdata$group[i],
       boxdata$out[i],
       sightings.by.state$state[which(is.this.outlier)],
       pos = 4, cex = 0.5, col = "#990000CC")
}
# play with ggplot a bit to see if we can't make a nicer plot
#
# All states are drawn in a single box by mapping x to the constant
# factor(0). A scale_x_discrete(breaks=NA) that was previously passed as a
# stray third argument to ggplot() had no effect and is dropped; the x axis
# ticks and labels are hidden through theme() below instead.
p <- ggplot(data = sightings.by.state,
            aes(x = factor(0), y = sightings.per.cap))
p <- p + geom_boxplot(aes(group = factor(0), fill = "#66FF5580"),
                      alpha = 0.6, outlier.size = 0)

# guide = "none" suppresses the legend for the constant fill
# (guide = FALSE is deprecated).
p <- p + scale_fill_manual(values = alpha(c("#66FF5580"), .3), guide = "none")

# ggtitle() and theme(... = element_text()) replace the defunct
# opts(title = ) and opts(... = theme_text()) API.
p <- p + ggtitle("UFO sightings per capita for US States")
p <- p + ylab("Sightings per capita")
p <- p + xlab(NULL)
p <- p + theme(axis.title.y = element_text(size = 14, colour = "red"),
               axis.ticks = element_blank(),
               axis.text.x = element_blank())

# overlay one translucent point per state
p <- p + geom_point(aes(x = factor(0), y = sightings.per.cap),
                    data = sightings.by.state,
                    colour = alpha("blue", 0.33), size = 3)

# Label rows 1 to 15 of sightings.by.state with their state code; rows 1 and
# 2 get larger, more opaque text than rows 3 to 15.
p <- p + geom_text(data = sightings.by.state[3:15, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2, size = 3, colour = alpha("red", 0.6))
p <- p + geom_text(data = sightings.by.state[1, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2, size = 5, colour = alpha("red", 0.9))
p <- p + geom_text(data = sightings.by.state[2, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2.5, size = 4, colour = alpha("red", 0.9))
print(p)

# NOTE: the output filename (including its "captia" spelling) is kept
# unchanged so anything that already refers to the saved file keeps working.
ggsave(plot = p, filename = "UFO sightings per captia.png",
       width = 5, height = 6)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Fixed a bunch of late-night coding errors. I think this works with a clean environment, if you're working directory is set to chapter 1 of the code for Machine Learning for Hackers.