Last active
          December 8, 2022 23:27 
        
      - 
      
- 
        Save MattCowgill/5f283eb410116c9da32e166639b947c6 to your computer and use it in GitHub Desktop. 
    A function to scrape the full text of Reserve Bank of Australia monetary policy decisions
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | library(rvest) | |
| library(tidyverse) | |
#' Scrape RBA monetary policy decision media releases in a tidy tibble
#'
#' Reads the RBA monetary policy landing page to find the yearly index pages
#' of interest rate decisions, collects the link to every media release listed
#' on those pages, and extracts the date, author and text of each release.
#'
#' @param min_year If `NULL` (the default), all releases will be scraped. If a
#' year is specified (eg. `2015`), only releases from that year onwards will be
#' scraped.
#' @return A tibble with one row per media release and three columns: `date`
#' (the release date parsed with `lubridate::dmy()`), `author` (the name taken
#' from the release title) and `text` (the whitespace-normalised body text).
#' A value that cannot be found on a page is returned as `NA` rather than
#' dropping the release.
#' @author Matt Cowgill
#' @examples
#' all_decisions <- scrape_monpol_decisions()
#' covid_decisions <- scrape_monpol_decisions(2020)
scrape_monpol_decisions <- function(min_year = NULL) {
  base_url <- "https://www.rba.gov.au"
  index_url <- paste0(base_url, "/monetary-policy/")

  # Turn scraped hrefs into absolute URLs. Missing or empty hrefs are dropped
  # first: `paste0()` recycles a zero-length vector to "", which would
  # otherwise yield a bogus link to the site root.
  to_absolute_urls <- function(hrefs) {
    hrefs <- hrefs[!is.na(hrefs) & nzchar(hrefs)]
    if (length(hrefs) == 0) {
      return(character(0))
    }
    paste0(base_url, hrefs)
  }

  # Links to every media release listed on one yearly index page.
  get_page_links <- function(url) {
    read_html(url) |>
      html_elements(".list-articles a") |>
      html_attr("href") |>
      to_absolute_urls()
  }

  # Date, author and text of a single media release, as a one-row tibble.
  get_text_from_mr <- function(url) {
    page <- read_html(url)

    # Body text: try the primary container, then the alternative selector.
    raw_text <- page |>
      html_elements("div.rss-mr-content") |>
      html_text2()
    if (length(raw_text) == 0) {
      raw_text <- page |>
        html_elements(".article-data+ div") |>
        html_text2()
    }
    # Collapse to exactly one string so each release stays a single row;
    # `str_squish()` also replaces line breaks with single spaces.
    text <- if (length(raw_text) == 0) {
      NA_character_
    } else {
      str_squish(paste(raw_text, collapse = " "))
    }

    # Release date: try the <time> element, then the article-info box.
    date_text <- page |>
      html_elements("time") |>
      html_text()
    if (length(date_text) == 0) {
      date_text <- page |>
        html_elements("#content > section > div > div.box-article-info.article-data > div:nth-child(2) > span.value") |>
        html_text()
    }
    # Use the first match only so `date` is always a scalar.
    date <- if (length(date_text) == 0) {
      as.Date(NA)
    } else {
      lubridate::dmy(date_text[[1]])
    }

    title <- page |>
      html_elements("span.rss-mr-title") |>
      html_text()
    title <- if (length(title) == 0) NA_character_ else title[[1]]

    # Titles before 2006-11-08 are parsed as
    # "Statement by the Governor, Mr <name>: ..."; later titles as
    # "Statement by <name>, ...". An unknown date uses the later format.
    old_format <- !is.na(date) && date < as.Date("2006-11-08")
    if (old_format) {
      statement_by <- gsub(":.*", "", title)
      author <- gsub("Statement by the Governor, Mr ", "", statement_by)
    } else {
      statement_by <- gsub(",.*", "", title)
      # Case-insensitive so that "Statement By <name>" is stripped cleanly too.
      author <- gsub("Statement by\\s*", "", statement_by, ignore.case = TRUE)
    }
    author <- trimws(author)

    tibble(
      date = date,
      author = author,
      text = text
    )
  }

  # Yearly index pages, most recent year first.
  monpol_year_urls <- read_html(index_url) |>
    html_elements("li:nth-child(5) li li a") |>
    html_attr("href") |>
    to_absolute_urls() |>
    sort(decreasing = TRUE)

  if (length(monpol_year_urls) == 0) {
    stop(
      "No yearly decision pages were found on ", index_url,
      "; the page layout may have changed.",
      call. = FALSE
    )
  }

  if (!is.null(min_year)) {
    # URLs without a recognisable year give NA and are excluded, instead of
    # turning into NA entries that `read_html()` would fail on.
    years <- monpol_year_urls |>
      str_extract("(?<=/int-rate-decisions/)\\d{4}") |>
      as.numeric()
    keep <- !is.na(years) & years >= as.numeric(min_year)
    monpol_year_urls <- monpol_year_urls[keep]
  }

  page_links <- monpol_year_urls |>
    map(get_page_links) |>
    unlist()

  if (length(page_links) == 0) {
    # Nothing to scrape: return an empty tibble with the usual columns.
    return(tibble(
      date = as.Date(character(0)),
      author = character(0),
      text = character(0)
    ))
  }

  page_links |>
    map(get_text_from_mr) |>
    bind_rows()
}
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment