Skip to content

Instantly share code, notes, and snippets.

@HarlanH
Created September 26, 2011 12:44
Show Gist options
  • Save HarlanH/1242145 to your computer and use it in GitHub Desktop.
Save HarlanH/1242145 to your computer and use it in GitHub Desktop.
Data Science DC Titles Visualization
# Data Science DC Titles Visualization
# How this works: a main loop calls a parameterized visualization function
# every `loop.time` seconds. Each call downloads the source spreadsheet afresh
# and generates a visual.
# Aspects of this code are borrowed from Drew Conway:
# https://raw.github.com/drewconway/ZIA/master/R/better_word_cloud/better_word_cloud.R
library(plyr)
library(ggplot2)
library(tm)
# Keep strings as character vectors (not factors) when data frames are built.
options(stringsAsFactors = FALSE)

# Pause between successive plots, in seconds (passed to Sys.sleep).
loop.time <- 15

# CSV export of the Google spreadsheet that plot.function downloads on each call.
source.data.url <- 'https://docs.google.com/spreadsheet/pub?hl=en_US&hl=en_US&key=0AnaXKp9bt6OXdEhYWmFocmgwU1RBa01qX0ttZ0JZaVE&single=true&gid=0&output=csv'
# Compute evenly spaced vertical offsets, symmetric around zero, for a group
# of `spaces` labels that share the same x position.
#
# Args:
#   spaces: number of labels to place.
#
# Returns: a numeric vector of offsets; 0 when there is at most one label.
optimal.spacing <- function(spaces) {
  # Nothing to spread out: a lone label sits on the centre line.
  if (!(spaces > 1)) {
    return(0)
  }

  step <- 1 / spaces
  if (spaces %% 2 > 0) {
    # Odd count: the middle label sits at 0, the others one step apart.
    half.width <- step * floor(spaces / 2)
    seq(-half.width, half.width, step)
  } else {
    # Even count: no label at 0; neighbours are two steps apart.
    half.width <- step * (spaces - 1)
    seq(-half.width, half.width, step * 2)
  }
}
# Download the source spreadsheet and draw a comparative word cloud of the
# words in the 'Title' column.
#
# Each word is placed on the x axis by the difference between its count in
# responses whose `column` value matches `col.value` and its count in all
# other responses. Text size reflects the word's overall frequency.
#
# Args:
#   column:    name of the spreadsheet column used to split the responses.
#   col.value: pattern (passed to grepl) matched against that column.
#   title:     plot title.
#
# Returns: the value of print(wc); the function is called for its plot
#   side effect.
#
# NOTE(review): the ggplot2 calls below (opts(), theme_blank(),
# scale_size(to=), legend=FALSE) and the tm Corpus construction follow the
# package APIs this script was written against; confirm them against the
# installed package versions before running.
plot.function <- function(column, col.value, title) {
  temporaryFile <- tempfile()
  # This function is called repeatedly from an endless loop, so make sure the
  # downloaded file is removed on every exit path.
  on.exit(unlink(temporaryFile), add = TRUE)

  status <- download.file(url=source.data.url, destfile=temporaryFile, method="curl")
  if (status != 0) {
    stop("download of the source spreadsheet failed (status ", status, ")",
         call. = FALSE)
  }

  dat <- read.csv(temporaryFile)
  names(dat) <- c('Timestamp', 'Title', 'DataScientist', 'Sector', 'Education', 'Training')

  # Build a term-document matrix: one row per word, one column per response.
  titles.corpus <- Corpus(DataframeSource(subset(dat, select=c('Title'))))
  titles.matrix <- TermDocumentMatrix(titles.corpus, control=list(stopwords=stopwords(), removeNumbers=TRUE, removePunctuation=TRUE))
  # as.matrix() converts the counts without printing; inspect() is meant for
  # displaying the matrix, not for extracting it.
  titles.matrix.df <- as.data.frame(as.matrix(titles.matrix))

  yes.cols <- grepl(col.value, dat[[column]])
  # drop=FALSE keeps a data frame even when a single response is selected;
  # otherwise the subset collapses to a vector and rowSums() fails.
  words.yes <- rowSums(titles.matrix.df[, yes.cols, drop=FALSE])
  words.no <- rowSums(titles.matrix.df[, !yes.cols, drop=FALSE])
  words.diff <- data.frame(words=names(words.yes), freq=words.yes+words.no, count.diff=words.yes-words.no)

  # Words that share a count.diff would overlap, so spread each group
  # vertically. The group size is taken directly from the rows ddply hands
  # over, which avoids a name-based lookup into a precomputed table.
  words.df <- ddply(words.diff, .(count.diff), function(cw) {
    cbind(cw, ypos=optimal.spacing(nrow(cw)))
  })

  # Always keep some room on both sides of zero so the axis has three
  # distinct breaks.
  min.count <- min(-.1, min(words.df$count.diff))
  max.count <- max(.1, max(words.df$count.diff))

  wc <- ggplot(words.df, aes(count.diff, ypos, label=words, size=freq, colour=count.diff)) +
    geom_text() +
    scale_size(to=c(3,11), name='Word Frequency') +
    scale_colour_gradient2(low='darkred', mid='black', high='darkblue', midpoint=0, legend=FALSE) +
    scale_x_continuous('', breaks=c(min.count, 0, max.count),
                       labels=c('Less', 'Same', 'More')) +
    scale_y_continuous('', breaks=c(0), labels='') +
    coord_cartesian(xlim=c(min.count*1.2, max.count*1.2)) +
    theme_bw() +
    opts(panel.grid.major=theme_blank(),panel.grid.minor=theme_blank(),
         title=title)
  print(wc)
}
# One row per plot in the rotation: the spreadsheet column to split on, the
# pattern to match in that column, and the plot title.
plots <- data.frame(
  column = rep(c('DataScientist', 'Sector', 'Education', 'Training'),
               times = c(1, 3, 2, 4)),
  col.value = c(
    'Yes',
    'Private', 'Public', 'Academic',
    'Masters', 'Doctoral',
    'Statistics', 'Machine Learning', 'Sciences', 'Business'
  ),
  title = c(
    'Data Scientist = Yes',
    'Private Sector', 'Public Sector', 'Academia',
    'Masters Degree', 'PhD',
    'Statistics Training', 'ML Training', 'Science Training', 'Business Training'
  )
)
# Main loop: cycle through the rows of `plots` forever, drawing one plot and
# then sleeping for `loop.time` seconds.
row <- 1
repeat {
  # Report a failed refresh (e.g. a download error) and move on, so that one
  # bad iteration does not end the whole rotation. Interrupts are not caught.
  tryCatch(
    do.call(plot.function, as.list(plots[row, ])),
    error = function(e) {
      message("Skipping plot '", plots$title[row], "': ", conditionMessage(e))
    }
  )
  Sys.sleep(loop.time)
  # Advance to the next row, wrapping back to the first after the last.
  row <- row %% nrow(plots) + 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment