Skip to content

Instantly share code, notes, and snippets.

@benjaminguinaudeau
Last active September 1, 2020 10:21
Show Gist options
  • Save benjaminguinaudeau/2db5647ac4691559ed0f5af6d67166ce to your computer and use it in GitHub Desktop.
Save benjaminguinaudeau/2db5647ac4691559ed0f5af6d67166ce to your computer and use it in GitHub Desktop.
A function to get around the YouTube API: scrape the number of videos for each account (up to 20,000) without any API access.
# video_counts <- c("UCtahKSp0CdvDv8CMS7RXsiQ", "UC3XTzVzaHQEd30rQbuvCtTQ", "UC-i2qb4sL10OdR6PyMEU_5Q",
# "UCaXkIU1QidjPwiAYu6GcHjg", "UCn8zNIfYAQNdrFRrr8oibKw") %>%
# purrr::map_dfr(get_accessible_videos)
#' Count the videos of a YouTube channel by scraping, without the API
#'
#' Downloads the channel's `/videos` page and counts the `gridVideoRenderer`
#' entries embedded in its inline scripts. When more than 20 entries are
#' found, the function follows the first `/playlist?list=...` link embedded
#' in the page and reads the total from the `videoCountText` field of a watch
#' page opened on that playlist.
#'
#' NOTE(review): the scraping relies on the markup YouTube served when this
#' was written (2020); confirm the field names still exist before relying on
#' the numbers.
#'
#' @param channel_id A single YouTube channel id (e.g.
#'   `"UCtahKSp0CdvDv8CMS7RXsiQ"`).
#' @return A one-row tibble with columns `channel_id` and `n_videos`. If the
#'   playlist total cannot be read, `n_videos` falls back to the number of
#'   videos listed on the channel page and a warning is raised.
get_accessible_videos <- function(channel_id) {
  script_texts <- glue::glue(
    "https://www.youtube.com/channel/{channel_id}/videos"
  ) %>%
    xml2::read_html() %>%
    rvest::html_nodes("script") %>%
    rvest::html_text()

  # Pick the data script by content rather than by a fixed position: the
  # number and order of <script> tags on the page is not stable.
  grid_counts <- stringr::str_count(script_texts, "gridVideoRenderer")
  if (length(grid_counts) == 0 || max(grid_counts) == 0) {
    return(tibble::tibble(channel_id = channel_id, n_videos = 0L))
  }
  data <- script_texts[[which.max(grid_counts)]]
  n <- max(grid_counts)

  if (n > 20) {
    # The channel page lists only part of the videos; the playlist linked
    # from the page carries the total count.
    playlist_id <- data %>%
      stringr::str_extract('/playlist\\?.*?\"') %>%
      stringr::str_replace_all("\\\\u0026", "&") %>%
      # Stop at the next parameter or at the closing quote, so the id is
      # found even when `list` is the last query parameter.
      stringr::str_extract("(?<=list=)[^&\"]+")

    playlist_counts <- numeric(0)
    if (!is.na(playlist_id)) {
      playlist_counts <- paste0(
        "https://www.youtube.com/watch?v=rBu0BRTx2x8&list=",
        playlist_id,
        "&index=1"
      ) %>%
        xml2::read_html() %>%
        rvest::html_nodes("script") %>%
        as.character() %>%
        stringr::str_subset("videoCountText") %>%
        stringr::str_extract("videoCountText.{100}") %>%
        stringr::str_extract('(?<=\\"\\}\\,\\{"text"\\:")\\d+') %>%
        as.numeric()
      playlist_counts <- playlist_counts[!is.na(playlist_counts)]
    }

    if (length(playlist_counts) > 0) {
      # Several scripts may mention the field; keep one value so the result
      # always has exactly one row.
      n <- playlist_counts[[1]]
    } else {
      warning(
        "Could not read the playlist video count for channel ", channel_id,
        "; returning the number of videos listed on the channel page.",
        call. = FALSE
      )
    }
  }

  tibble::tibble(channel_id = channel_id, n_videos = n)
}
#
# parse_youtube_data <- function(yt_data){
# yt_data %>%
# str_remove('(.|\\s)*?(?=\\{"gridVideoRenderer")') %>%
# str_remove('(?<=\\]\\}\\})(\\]\\,"(trackingParams|continuations)")(.|\\s)+$') %>%
# str_split('\\{"gridVideoRenderer":') %>%
# .[[1]] %>%
# .[-1] %>%
# str_remove("\\}\\,?$") %>% #bashR::simule_map(30)
# imap_dfr(~{
# .x %>%
# jsonlite::fromJSON() %>%
# rlist::list.flatten() %>%
# imap_dfc(~{
# if(length(.x) > 1){
# a = list(.x)
# } else {
# a = .x
# }
#
# tibble(a) %>%
# set_names(.y)
# })
# })
#
# }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment