Skip to content

Instantly share code, notes, and snippets.

@allaway
Created May 19, 2023 00:28
Show Gist options
  • Select an option

  • Save allaway/110287ca41e756e4a77e99fc161065cb to your computer and use it in GitHub Desktop.

Select an option

Save allaway/110287ca41e756e4a77e99fc161065cb to your computer and use it in GitHub Desktop.
Lazy script to scrape plasmid field data from Addgene
library(rvest)
library(dplyr)
library(stringr)
# Plasmid identifiers "Addgene_83189" down to "Addgene_83129": 61 consecutive
# numeric suffixes in descending order, each prefixed with "Addgene_".
plasmids <- paste0("Addgene_", 83189:83129)
# Scrape the labelled fields from each plasmid's Addgene page.
#
# `res` is a list with one element per entry of `plasmids` (named by that
# entry); each element is the result of `bind_rows()` on a named character
# vector of field contents. `res_2` row-binds all of them.
#
# `lapply()` is used rather than `sapply()`: `sapply()` tries to simplify its
# result, so whenever every page yielded the same number of fields `res` would
# stop being a plain list of per-plasmid tibbles. `lapply()` always returns a
# list; iterating over a self-named vector keeps the element names that
# `sapply()` would have assigned.
res <- lapply(stats::setNames(nm = plasmids), function(x) {
  # Page URLs use the bare numeric id, without the "Addgene_" prefix.
  plasmid_id <- stringr::str_remove(x, "Addgene_")
  url <- glue::glue("https://www.addgene.org/{plasmid_id}/")
  page <- rvest::read_html(url)

  # Text of every `.field-label` node; used below as the column names.
  fields <- page %>%
    html_nodes(".field-label") %>%
    html_text()

  # Text of every `.field` node with newlines, spaces and the label text
  # stripped out, de-duplicated, then named by label and turned into a tibble.
  # NOTE(review): the labels are joined with "|" and used as a regular
  # expression, so a label containing regex metacharacters (e.g. parentheses)
  # would not be matched literally -- confirm against the live pages.
  # NOTE(review): `setNames()` assumes the number of unique contents equals
  # the number of labels -- confirm against the live pages.
  page %>%
    html_nodes(".field") %>%
    html_text() %>%
    str_remove_all("\\n") %>%
    str_remove_all(" ") %>%
    str_remove_all(paste(fields, collapse = "|")) %>%
    unique() %>%
    setNames(fields) %>%
    bind_rows()
})

# Combine the per-plasmid results into one data frame.
res_2 <- bind_rows(res)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment