Skip to content

Instantly share code, notes, and snippets.

@MattCowgill
Last active December 8, 2022 23:27
Show Gist options
  • Save MattCowgill/5f283eb410116c9da32e166639b947c6 to your computer and use it in GitHub Desktop.
Save MattCowgill/5f283eb410116c9da32e166639b947c6 to your computer and use it in GitHub Desktop.
A function to scrape the full text of Reserve Bank of Australia monetary policy decisions
library(rvest)
library(tidyverse)
#' Scrape RBA monetary policy decision media releases in a tidy tibble
#'
#' Reads the RBA monetary policy landing page, follows the link for each year
#' of interest-rate decisions, then downloads every media release linked from
#' those yearly pages and extracts its date, author and body text.
#'
#' @param min_year If `NULL` (the default), all releases will be scraped. If a
#' year is specified (eg. `2015`), only releases from that year onwards will be
#' scraped.
#' @return A tibble with one row per media release and the columns `date`
#' (a `Date`), `author` (character) and `text` (character, with whitespace
#' squished). A field that cannot be found on a release page is `NA` rather
#' than causing the release to be dropped.
#' @author Matt Cowgill
#' @examples
#' all_decisions <- scrape_monpol_decisions()
#' covid_decisions <- scrape_monpol_decisions(2020)
scrape_monpol_decisions <- function(min_year = NULL) {
  base_url <- "https://www.rba.gov.au"

  # Turn scraped href values into full URLs. `paste0()` treats a zero-length
  # argument as "", so pasting onto an empty vector would return the bare
  # base URL; handle the empty case (and missing/empty hrefs) explicitly.
  build_urls <- function(hrefs) {
    hrefs <- hrefs[!is.na(hrefs) & nzchar(hrefs)]
    if (length(hrefs) == 0) {
      return(character(0))
    }
    paste0(base_url, hrefs)
  }

  # Return the URLs of the individual media releases listed on a year page.
  get_page_links <- function(url) {
    read_html(url) |>
      html_elements(".list-articles a") |>
      html_attr("href") |>
      build_urls()
  }

  # Return the first successfully parsed day-month-year date on the page as a
  # length-one Date, trying the `time` element first and the article-info box
  # second. Returns `NA` when neither yields a parsable date.
  get_release_date <- function(page) {
    selectors <- c(
      "time",
      "#content > section > div > div.box-article-info.article-data > div:nth-child(2) > span.value"
    )
    for (selector in selectors) {
      raw_dates <- page |>
        html_elements(selector) |>
        html_text()
      parsed <- lubridate::dmy(raw_dates, quiet = TRUE)
      parsed <- parsed[!is.na(parsed)]
      if (length(parsed) > 0) {
        return(parsed[1])
      }
    }
    as.Date(NA)
  }

  # Pull the author's name out of a release title. Releases dated on or after
  # 2006-11-08 are parsed as "Statement by <name>, ..."; earlier ones as
  # "Statement by the Governor, Mr <name>: ...". When the date is unknown the
  # newer format is assumed.
  get_author <- function(title, release_date) {
    if (is.na(title)) {
      return(NA_character_)
    }
    newer_format <- is.na(release_date) ||
      release_date >= as.Date("2006-11-08")
    if (newer_format) {
      statement_by <- gsub(",.*", "", title)
      # Accept both capitalisations and consume the following whitespace so
      # no leading space is left on the name.
      author <- gsub("Statement [Bb]y\\s*", "", statement_by)
    } else {
      statement_by <- gsub(":.*", "", title)
      author <- gsub("Statement by the Governor, Mr ", "", statement_by)
    }
    str_squish(author)
  }

  # Download one media release and return a one-row tibble describing it.
  get_text_from_mr <- function(url) {
    page <- read_html(url)

    raw_text <- page |>
      html_elements("div.rss-mr-content") |>
      html_text2()
    if (length(raw_text) == 0) {
      # Fallback selector for pages without the `rss-mr-content` container.
      raw_text <- page |>
        html_elements(".article-data+ div") |>
        html_text2()
    }
    # Collapse to exactly one string so each release is exactly one row;
    # `str_squish()` also replaces line breaks with single spaces.
    text <- if (length(raw_text) == 0) {
      NA_character_
    } else {
      str_squish(paste(raw_text, collapse = " "))
    }

    release_date <- get_release_date(page)

    titles <- page |>
      html_elements("span.rss-mr-title") |>
      html_text()
    title <- if (length(titles) == 0) NA_character_ else titles[1]

    tibble(
      date = release_date,
      author = get_author(title, release_date),
      text = text
    )
  }

  # NOTE(review): this positional selector depends on the current layout of
  # the landing-page navigation and is the most likely thing to break.
  monpol_year_urls <- read_html(paste0(base_url, "/monetary-policy/")) |>
    html_elements("li:nth-child(5) li li a") |>
    html_attr("href") |>
    build_urls() |>
    sort(decreasing = TRUE)

  if (length(monpol_year_urls) == 0) {
    warning(
      "No yearly decision pages were found on the RBA monetary policy page; ",
      "the page layout may have changed.",
      call. = FALSE
    )
    return(tibble(
      date = as.Date(character(0)),
      author = character(0),
      text = character(0)
    ))
  }

  if (!is.null(min_year)) {
    years <- monpol_year_urls |>
      str_extract("(?<=/int-rate-decisions/)\\d{4}") |>
      as.numeric()
    # URLs without a recognisable year are excluded rather than producing
    # NA subscripts.
    monpol_year_urls <- monpol_year_urls[!is.na(years) & years >= min_year]
  }

  page_links <- monpol_year_urls |>
    map(get_page_links) |>
    unlist()

  map_dfr(page_links, get_text_from_mr)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment