Skip to content

Instantly share code, notes, and snippets.

@stephenturner
Created January 23, 2016 13:42
Show Gist options
  • Save stephenturner/18340ce39e18c292ada6 to your computer and use it in GitHub Desktop.
Save stephenturner/18340ce39e18c292ada6 to your computer and use it in GitHub Desktop.
library(rvest)
library(dplyr)
library(readr)
# How many pages do you want to harvest (currently over 10,000 available)
npages <- 5

# Fetch each results page and collect the parsed tables in a list, one tibble
# per page. seq_len() yields an empty sequence when npages is 0, whereas
# 1:npages would count down through pages 1 and 0.
d <- lapply(seq_len(npages), function(i) {
  # Report progress
  message("Harvesting page ", i, "/", npages)
  # The URL for the table
  url <- paste0(
    "https://www.brewtoad.com/recipes?page=", i,
    "&sort=created_at&view_as_table=true"
  )
  # Harvest the data: parse the page, extract its table(s), and coerce the
  # result to a single tibble (as_tibble() replaces the deprecated tbl_df())
  url %>%
    read_html() %>%
    html_table() %>%
    as.data.frame() %>%
    as_tibble()
})

# The loop index and URL are local to the function above, so only the page
# count is left behind in the workspace
rm(npages)
# Stack the per-page tables in the list into a single data frame
d <- bind_rows(d)
# Drop the Rating column: it is sparse, with most entries being zero
d <- select(d, -Rating)
# Strip the percent sign so that ABV can be stored as a numeric value
d <- mutate(d, ABV = as.numeric(gsub("%", "", ABV)))
# Discard duplicated rows
d <- distinct(d)
# Save the cleaned recipe table as a CSV file in the working directory
write_csv(d, "brewtoad.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment