Skip to content

Instantly share code, notes, and snippets.

@expersso
Created January 6, 2016 13:45
Show Gist options
  • Save expersso/aa8e113978b9c3dc47da to your computer and use it in GitHub Desktop.
Save expersso/aa8e113978b9c3dc47da to your computer and use it in GitHub Desktop.
Movie budgets and box office success (1955-2015)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(stringr)
library(rvest)
library(xml2)
library(OECD)
# Get budget and box office data
resp <- read_html("http://www.the-numbers.com/movie/budgets/all")
budget <- resp %>%
xml_find_all("//*[@id='page_filling_chart']/center/table") %>%
html_table(fill = TRUE) %>%
.[[1]] %>%
.[, -1] %>%
filter(complete.cases(.)) %>%
setNames(make.names(tolower(names(.)))) %>%
gather(variable, value, -release.date, -movie) %>%
mutate(release.date = as.Date(release.date, "%m/%d/%Y"),
value = as.numeric(str_replace_all(value, "[$,]", ""))) %>%
tbl_df()
# Get inflation data
infl <- get_dataset("MEI_PRICES", list("CPALTT", "USA", "IXOB", "M")) %>%
setNames(tolower(names(.))) %>%
select(obstime, price_level = obsvalue) %>%
mutate(obstime = as.Date(paste0(obstime, "-01")))
# Bind together data frames, inflation-adjust data, drop NAs, take logs
# Excludes movies released before 1955, which is only a handful
bp <- budget %>%
mutate(release.date = floor_date(release.date, "month")) %>%
left_join(infl, by = c("release.date" = "obstime")) %>%
mutate(price_level = price_level / 100,
adj_value = value / price_level) %>%
filter(value != 0) %>%
filter(!duplicated(.)) %>%
select(-price_level, -value) %>%
spread(variable, adj_value) %>%
mutate(production.budget = log10(production.budget),
worldwide.gross = log10(worldwide.gross)) %>%
filter(!is.na(production.budget), !is.na(worldwide.gross))
# Regression model
m1 <- lm(worldwide.gross ~ production.budget + I(production.budget^2), data = bp)
# Create data frame with text labels for largest and smallest residuals
model <- fortify(m1)
labs <- bp[order(model$.resid, decreasing = TRUE), ]
labs <- labs %>% slice(c(1:5, (n()-5):n())) # Top and bottom 5 residuals
labs <- labs %>%
rbind(bp %>% arrange(desc(worldwide.gross)) %>% head(3)) # Add top 3 by gross
# Add release year to title
labs$title_year <- sprintf(" %s (%s) ", labs$movie, year(labs$release.date))
# Final plot
p <- ggplot(bp, aes(x = production.budget, y = worldwide.gross)) +
geom_point(alpha = 0.25, color = "steelblue") +
geom_point(data = labs, color = "firebrick") +
geom_line(aes(y = .fitted), data = model, color = "firebrick", size = 0.8) +
geom_text(aes(label = title_year),
data = labs,
color = "firebrick", hjust = "outward", vjust = "outward",
size = 2.5, lineheight=0.75) +
theme_light(8) +
scale_x_continuous(labels = scales::math_format(10^.x)) +
scale_y_continuous(labels = scales::math_format(10^.x)) +
annotation_logticks(color = "grey60") +
labs(x = "\nProduction budget (2010 prices)",
y = "Worldwide gross (2010 prices)\n",
title = "Movie budgets and box office success (1955-2015)")
@expersso
Copy link
Author

expersso commented Jan 6, 2016

movies

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment