Created
May 7, 2019 20:31
-
-
Save MattSandy/e5b02eeccc0a2ad296e002d180846719 to your computer and use it in GitHub Desktop.
IMDb scraping for Avengers: Endgame reviews
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest)
library(magrittr)
library(stringr)
library(lubridate)
library(data.table)
library(ggplot2)
library(patchwork)
library(tidyverse)
# Functions ---------------------------------------------------------------
#' Fetch one page of IMDb user reviews for title tt4154796.
#'
#' Requests the reviews AJAX endpoint (sorted by submission date, ascending)
#' for the given pagination key and parses the returned HTML fragment.
#'
#' @param paginationKey Character key appended to the request URL. The caller
#'   passes "" for the first request and afterwards the key returned by the
#'   previous call.
#' @return A list with two elements:
#'   `paginationKey`, the `data-key` attribute found on the page (the key for
#'   the next request), or `NA_character_` when the page carries no such
#'   attribute; and `df`, a data.frame with columns `Rating`, `Date`,
#'   `Author`, `Title`, `Text` and `ID`. `Rating` is -1 for reviews from which
#'   no numeric rating could be extracted.
get_reviews <- function(paginationKey) {
  url <- paste0(
    "https://www.imdb.com/title/tt4154796/reviews/_ajax?sort=submissionDate&dir=asc&ref_=undefined&paginationKey=",
    paginationKey
  )
  webpage <- read_html(url)
  review_nodes <- webpage %>% html_nodes(".imdb-user-review")

  # Get Ratings -------------------------------------------------------------
  # html_node() (singular) yields exactly one entry per review, with NA where
  # the rating span is absent, so the vector stays aligned with the reviews.
  review_rating <- review_nodes %>%
    html_node("span.rating-other-user-rating") %>%
    html_text() %>%
    str_match("([0-9]{1,2})\\/") %>%
    .[, 2] %>%
    as.numeric()
  # -1 is the sentinel for "no rating"; the caller drops ratings <= 0.
  review_rating[is.na(review_rating)] <- -1

  review_date <- webpage %>%
    html_nodes("div.display-name-date span.review-date") %>%
    html_text() %>%
    dmy()
  review_text <- webpage %>%
    html_nodes("div.content div.text") %>%
    html_text()
  review_author <- webpage %>%
    html_nodes("span.display-name-link a") %>%
    html_text()
  review_title <- webpage %>%
    html_nodes("div.lister-item-content a.title") %>%
    html_text()
  review_id <- review_nodes %>% html_attr("data-review-id")

  df <- data.frame(
    Rating = review_rating,
    Date = review_date,
    Author = review_author,
    Title = review_title,
    Text = review_text,
    ID = review_id,
    stringsAsFactors = FALSE
  )

  # Look the key up by attribute rather than by fixed child position, so a
  # page without it returns NA instead of raising an error.
  next_key <- webpage %>%
    html_node("[data-key]") %>%
    html_attr("data-key")

  list(paginationKey = next_key, df = df)
}
# Loop --------------------------------------------------------------------
# Walk the paginated review feed, feeding each page's key into the next
# request. Stop as soon as a page yields no usable key or a request fails, so
# the reviews collected so far are kept and the rest of the script still runs.
max_pages <- 1000
reviews <- vector("list", max_pages)
paginationKey <- ""
for (i in seq_len(max_pages)) {
  print(i)
  result <- tryCatch(
    get_reviews(paginationKey),
    error = function(e) {
      warning("Stopping at page ", i, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(result)) {
    break
  }
  reviews[[i]] <- result$df
  paginationKey <- result$paginationKey
  if (length(paginationKey) != 1 || is.na(paginationKey) ||
      !nzchar(paginationKey)) {
    break
  }
}

# rbindlist() skips the unused (NULL) slots of the preallocated list.
df <- as_tibble(rbindlist(reviews))
if (nrow(df) == 0) {
  stop("No reviews were retrieved.", call. = FALSE)
}

# Keep one row per review ID and drop reviews without a rating (sentinel -1).
df <- df %>%
  distinct(ID, .keep_all = TRUE) %>%
  filter(Rating > 0)

# Cumulative counts ---------------------------------------------------------
# One row per observed Date x observed Rating; `cumulative` is the number of
# reviews with that rating submitted on or before that date.
df.counts <- df %>%
  count(Date, Rating) %>%
  complete(Date, Rating, fill = list(n = 0L)) %>%
  arrange(Rating, Date) %>%
  group_by(Rating) %>%
  mutate(cumulative = cumsum(n)) %>%
  ungroup() %>%
  select(Date, Rating, cumulative)

# Label positions -----------------------------------------------------------
release_date <- as.Date("2019-04-26")

# p1: vertical midpoint of the rating-10 band on the release date, expressed
# as a share of all cumulative reviews (the areas are stacked with "fill").
on_release <- df.counts %>% filter(Date == release_date)
pos1.A <- sum(on_release$cumulative[on_release$Rating < 10])
pos1.B <- sum(on_release$cumulative[on_release$Rating == 10])
pos1 <- if (pos1.A + pos1.B > 0) {
  (pos1.A / (pos1.A + pos1.B)) + (pos1.B / (pos1.A + pos1.B) / 2)
} else {
  0.5 # no reviews up to the release date: centre the label
}

# p2: all reviews rated below 10 on the release date plus half of the 10s.
released <- df %>% filter(Date == release_date)
pos2 <- sum(released$Rating < 10) + sum(released$Rating == 10) / 2

# Fix the level order so both plots share the same legend/stacking order.
df.counts$Rating <- factor(df.counts$Rating, levels = 10:1)
df$Rating <- factor(df$Rating, levels = 10:1)

# Plots ---------------------------------------------------------------------
# annotate() draws the label once; geom_text() with constant aesthetics would
# draw one copy per data row.
p1 <- ggplot(df.counts, aes(fill = Rating)) +
  geom_area(aes(x = Date, y = cumulative), position = "fill", stat = "identity") +
  scale_fill_viridis_d(name = "Rating", direction = -1) +
  theme_minimal() +
  scale_y_continuous(
    breaks = c(0, .25, .5, .75, 1),
    labels = c("0%", "25%", "50%", "75%", "100%")
  ) +
  guides(fill = guide_legend(ncol = 1)) +
  scale_x_date(date_breaks = "2 days") +
  theme(legend.position = "right", legend.direction = "vertical") +
  ggtitle("Avengers: Endgame User Ratings by Date") +
  labs(subtitle = "Data Source: https://www.imdb.com/title/tt4154796/") +
  ylab("Cumulative Rating Percent by Date") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_vline(xintercept = release_date, linetype = "dashed", color = "red") +
  annotate("text", x = release_date, y = pos1, label = "Release Date\n",
           colour = "black", angle = 90)

p2 <- ggplot(df, aes(fill = Rating)) +
  geom_bar(aes(x = Date)) +
  scale_fill_viridis_d(name = "Rating", direction = -1) +
  theme_minimal() +
  guides(fill = guide_legend(ncol = 1)) +
  theme(legend.position = "none") +
  ggtitle("") +
  labs(subtitle = "") +
  ylab("Rating Count by Date") +
  xlab("") +
  scale_x_date(date_breaks = "2 days") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_vline(xintercept = release_date, linetype = "dashed", color = "red") +
  annotate("text", x = release_date, y = pos2, label = "Release Date\n",
           colour = "black", angle = 90)

# patchwork's `+` places the two plots side by side in a single image.
ggsave(filename = "plot.png", plot = p1 + p2, width = 11, units = "in")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment