Created
March 10, 2015 05:31
-
-
Save kiyoto/93bb63a19c50036aa308 to your computer and use it in GitHub Desktop.
Script for Strata Hadoop 2015 Reviews Data Collection + Analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2)
library(dplyr)
library(scrapeR)

# Speaker listing page; talk detail pages are discovered from its links.
strata_speakers <- "http://strataconf.com/big-data-conference-ca-2015/public/schedule/speakers"
speaker_page <- scrape(url = strata_speakers, parse = TRUE, headers = TRUE)

# Collect every anchor target on the page. unlist()/as.character() normalise
# the result to a plain character vector even when no anchor is found.
href <- xpathSApply(speaker_page[[strata_speakers]], "//a/@href")
href <- unique(as.character(unlist(href)))

# Keep only the site-relative links that point at individual talk pages.
href <- href[grepl("^/big-data-conference-ca-2015/public/schedule/detail/\\d+", href)]

# paste0() treats a zero-length vector as "", which would silently turn
# "no links found" into one bogus URL, so fail loudly instead.
if (length(href) == 0) {
  stop("No talk detail links found on ", strata_speakers, call. = FALSE)
}

# Turn the relative links into absolute URLs.
href <- paste0("http://strataconf.com", href)
# Extract the average score and the review count from the rating query result.
#
# `rating_nodes` is whatever the XPath query for the rating <div> returned.
# It is treated as "no reviews" when it is NULL, empty, or when its first
# element does not contain a "<score>, <count>" pair.
#
# Returns list(avg_point = <numeric or NA>, num_reviews = <numeric>).
parse_talk_rating <- function(rating_nodes) {
  unreviewed <- list(avg_point = NA_real_, num_reviews = 0)

  # length() covers both NULL and an empty list/vector; is.null() alone
  # misses the empty-result case and lets `[[1]]` fail.
  if (length(rating_nodes) == 0) {
    return(unreviewed)
  }

  rating_text <- rating_nodes[[1]]
  score_and_count <- regmatches(
    rating_text,
    regexpr("[\\d\\.]+, \\d+", rating_text, perl = TRUE)
  )

  # regmatches() yields character(0) when the pattern is absent.
  if (length(score_and_count) == 0) {
    return(unreviewed)
  }

  parts <- as.numeric(strsplit(score_and_count, ", ")[[1]])
  list(avg_point = parts[[1]], num_reviews = parts[[2]])
}

# Helper method to scrape the data from an individual talk URL.
#
# url: absolute URL of one talk detail page.
#
# Returns a list with elements avg_point, num_reviews, time, date, title,
# location and category. avg_point is NA and num_reviews is 0 when no rating
# could be read from the page.
get_talk_metadata <- function(url) {
  html <- scrape(url = url, parse = TRUE, headers = TRUE)
  doc <- html[[url]]

  # First text value matched by `xpath`; errors if nothing matches.
  first_text <- function(xpath) {
    xpathSApply(doc, xpath, xmlValue)[[1]]
  }
  # Remove tabs, carriage returns and newlines.
  strip_breaks <- function(x) {
    gsub("[\\t\\r\\n]*", "", x, perl = TRUE)
  }

  session_time <- strip_breaks(first_text("//div[@class='session_time']/text()"))
  # NOTE(review): the exact session_time format is not visible here; the date
  # is taken as the second ", "-separated field and the time as everything
  # before the first "m".
  date <- strsplit(session_time, ", ")[[1]][[2]]
  time <- strsplit(session_time, "m")[[1]][[1]]

  talk_title <- first_text('//h1[@class="summary"]/text()')
  talk_location <- first_text('//span[@class="location"]/text()')
  category <- strip_breaks(
    first_text('//span[@class="en_session_topics category"]/text()')
  )

  rating <- parse_talk_rating(
    xpathSApply(doc, "//div[@class='en_grade_average_detail']/text()", xmlValue)
  )

  list(avg_point = rating$avg_point, num_reviews = rating$num_reviews,
       time = time, date = date, title = talk_title,
       location = talk_location, category = category)
}
# Get all the data: one metadata list per talk page.
talk_metadata <- lapply(href, function(link) {
  metadata <- get_talk_metadata(link)
  Sys.sleep(1)  # pause between consecutive requests
  metadata
})

# Pull one field out of every metadata list as a typed vector. vapply() fails
# loudly if a talk is missing the field, instead of silently producing
# columns of different lengths.
talk_field_chr <- function(field) {
  vapply(talk_metadata, function(m) as.character(m[[field]]), character(1))
}
talk_field_num <- function(field) {
  vapply(talk_metadata, function(m) as.numeric(m[[field]]), numeric(1))
}

# Data frame columns (also kept as standalone vectors for later use).
avg_points <- talk_field_num("avg_point")
num_reviews <- talk_field_num("num_reviews")
time <- talk_field_chr("time")
date <- talk_field_chr("date")
title <- talk_field_chr("title")
location <- talk_field_chr("location")
category <- talk_field_chr("category")

# stringsAsFactors is explicit so the column types do not depend on the
# R version's default.
strata2015_talks <- data.frame(avg_points = avg_points,
                               num_reviews = num_reviews,
                               time = time,
                               date = date,
                               title = title,
                               location = location,
                               category = category,
                               stringsAsFactors = FALSE)
# Data is ready!

# Scatter plot: number of reviews against average score, one point per talk.
# Every point uses the same fixed colour; the original colour aesthetic
# referenced a `clusters` object that is never defined in this script.
ggplot(strata2015_talks, aes(x = avg_points, y = num_reviews)) +
  geom_point(size = 4, color = "#b11113") +
  ggtitle("# of Reviews v. Average Points") +
  theme(plot.title = element_text(size = 24, vjust = 1.8)) +
  annotate("rect", xmin = 4, xmax = 5, ymin = 30, ymax = 45, alpha = 0.3) +
  annotate("text", label = "Many good reviews", x = 4.5, y = 40, size = 8) +
  annotate("rect", xmin = 2, xmax = 2.7, ymin = 30, ymax = 40, alpha = 0.3) +
  annotate("text", label = "Many bad reviews", x = 2.35, y = 37, size = 8)

# Only talks with a score take part in the averages.
rated_talks <- strata2015_talks %>%
  filter(!is.na(avg_points))

# Per-category average, weighted by each talk's number of reviews.
# avg_points is summarised first so that sum(num_reviews) and the weights
# both still see the per-talk review counts.
p <- rated_talks %>%
  group_by(category) %>%
  summarise(avg_points = stats::weighted.mean(avg_points, num_reviews),
            num_reviews = sum(num_reviews)) %>%
  arrange(desc(avg_points))

# Mark sponsored categories. This must come after `p` is built: the column
# lives on the per-category table used by the bar chart.
p$sponsored <- grepl("sponsor", tolower(as.character(p$category)))

# Overall review-weighted average across rated talks. Computing it from the
# filtered rows keeps NA scores out; with them the result would be NA and
# would turn every adjusted score into NA.
p$overall_avg_points <- with(
  rated_talks,
  stats::weighted.mean(avg_points, num_reviews)
)

# Shrink to the average to account for sample size
# c.f. http://stats.stackexchange.com/questions/15979/how-to-find-confidence-intervals-for-ratings/16053#16053
p$adjusted_avg_points <- with(
  p,
  avg_points * num_reviews / (num_reviews + 1) +
    overall_avg_points / (num_reviews + 1)
)

# Bar chart of the adjusted average per category, sponsored ones highlighted.
ggplot(p) +
  geom_bar(aes(x = reorder(category, adjusted_avg_points),
               y = adjusted_avg_points, fill = sponsored),
           stat = "identity") +
  coord_flip() +
  xlab("category") +
  theme(plot.title = element_text(size = 24, vjust = 1.8),
        axis.text = element_text(color = "#000000")) +
  ggtitle("Strata Hadoop 2015 Average Ratings per Category") +
  scale_fill_manual(values = c("#777777", "#B11113")) +
  geom_hline(yintercept = 3.75, color = "#005000", size = 2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment