Created
January 6, 2016 13:45
-
-
Save expersso/aa8e113978b9c3dc47da to your computer and use it in GitHub Desktop.
Movie budgets and box office success (1955-2015)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(lubridate) | |
library(stringr) | |
library(rvest) | |
library(xml2) | |
library(OECD) | |
# Get budget and box office data | |
resp <- read_html("http://www.the-numbers.com/movie/budgets/all") | |
budget <- resp %>% | |
xml_find_all("//*[@id='page_filling_chart']/center/table") %>% | |
html_table(fill = TRUE) %>% | |
.[[1]] %>% | |
.[, -1] %>% | |
filter(complete.cases(.)) %>% | |
setNames(make.names(tolower(names(.)))) %>% | |
gather(variable, value, -release.date, -movie) %>% | |
mutate(release.date = as.Date(release.date, "%m/%d/%Y"), | |
value = as.numeric(str_replace_all(value, "[$,]", ""))) %>% | |
tbl_df() | |
# Get inflation data | |
infl <- get_dataset("MEI_PRICES", list("CPALTT", "USA", "IXOB", "M")) %>% | |
setNames(tolower(names(.))) %>% | |
select(obstime, price_level = obsvalue) %>% | |
mutate(obstime = as.Date(paste0(obstime, "-01"))) | |
# Bind together data frames, inflation-adjust data, drop NAs, take logs | |
# Excludes movies released before 1955, which is only a handful | |
bp <- budget %>% | |
mutate(release.date = floor_date(release.date, "month")) %>% | |
left_join(infl, by = c("release.date" = "obstime")) %>% | |
mutate(price_level = price_level / 100, | |
adj_value = value / price_level) %>% | |
filter(value != 0) %>% | |
filter(!duplicated(.)) %>% | |
select(-price_level, -value) %>% | |
spread(variable, adj_value) %>% | |
mutate(production.budget = log10(production.budget), | |
worldwide.gross = log10(worldwide.gross)) %>% | |
filter(!is.na(production.budget), !is.na(worldwide.gross)) | |
# Regression model | |
m1 <- lm(worldwide.gross ~ production.budget + I(production.budget^2), data = bp) | |
# Create data frame with text labels for largest and smallest residuals | |
model <- fortify(m1) | |
labs <- bp[order(model$.resid, decreasing = TRUE), ] | |
labs <- labs %>% slice(c(1:5, (n()-5):n())) # Top and bottom 5 residuals | |
labs <- labs %>% | |
rbind(bp %>% arrange(desc(worldwide.gross)) %>% head(3)) # Add top 3 by gross | |
# Add release year to title | |
labs$title_year <- sprintf(" %s (%s) ", labs$movie, year(labs$release.date)) | |
# Final plot | |
p <- ggplot(bp, aes(x = production.budget, y = worldwide.gross)) + | |
geom_point(alpha = 0.25, color = "steelblue") + | |
geom_point(data = labs, color = "firebrick") + | |
geom_line(aes(y = .fitted), data = model, color = "firebrick", size = 0.8) + | |
geom_text(aes(label = title_year), | |
data = labs, | |
color = "firebrick", hjust = "outward", vjust = "outward", | |
size = 2.5, lineheight=0.75) + | |
theme_light(8) + | |
scale_x_continuous(labels = scales::math_format(10^.x)) + | |
scale_y_continuous(labels = scales::math_format(10^.x)) + | |
annotation_logticks(color = "grey60") + | |
labs(x = "\nProduction budget (2010 prices)", | |
y = "Worldwide gross (2010 prices)\n", | |
title = "Movie budgets and box office success (1955-2015)") |
Author
expersso
commented
Jan 6, 2016
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment