Skip to content

Instantly share code, notes, and snippets.

@kbmorales
Last active February 19, 2018 20:24
Show Gist options
  • Save kbmorales/930981e3bd5ffdfced46256f97e88d74 to your computer and use it in GitHub Desktop.
Save kbmorales/930981e3bd5ffdfced46256f97e88d74 to your computer and use it in GitHub Desktop.
Sample code block for Matt Kiang
# Purpose: scrape metadata from random videos (packages not included)
# NOTE(review): the calls below look like httr (HEAD, GET, status_code),
# rvest/xml2 (read_html, html_nodes, html_text, html_attr), stringr (str_*)
# and the magrittr pipe -- confirm they are attached before this runs.
# `trimpatdate` and `trimpatvk` are patterns defined outside this block.
URL <- "https://www.pornhub.com/gay/video/random"

# Initialize data list and vars for storage
maxvid <- 1000 # Number of usable videos to collect
randviddat.list <- vector("list", maxvid) # Preallocated, one slot per video
i <- 1 # Next slot of randviddat.list to fill
attempts <- 0 # Requests made so far, including skipped videos
maxattempts <- 10 * maxvid # Hard cap so a dead site cannot loop forever

# Swap an empty scrape result for NA: data.frame() errors when one of its
# columns has length zero, which would abort the whole run.
or_na <- function(x) {
  if (length(x) == 0) NA else x
}

# Iterator for scraping. A while loop is used because the sequence of
# `for (i in 1:maxvid)` is fixed on entry, so bumping maxvid inside the loop
# never bought back the iterations lost to bad or paywalled links. Here the
# slot index only advances after a video has actually been stored.
while (i <= maxvid && attempts < maxattempts) {
  attempts <- attempts + 1
  Sys.sleep(runif(1, 0, 1)) # Sleep for 0 to 1 seconds

  randurl <- HEAD(URL)$url # Pull web address of random video
  response <- GET(randurl)
  if (status_code(response) != 200) {
    next # Bad link: try another video without consuming a slot
  }

  # Parse the response that was just checked instead of downloading the
  # page a second time.
  webpage <- read_html(response)

  # Check if class of premiumLocked exists
  if (length(html_nodes(webpage, ".premiumLocked")) > 0) {
    next # Video is behind paywall: try another without consuming a slot
  }

  title <- html_nodes(webpage, ".inlineFree") %>%
    html_text() # Pull title
  views <- html_nodes(webpage, ".count") %>%
    html_text() %>%
    str_replace_all("[,]", "") %>%
    as.numeric() # Pull and format viewcount
  rating <- html_nodes(webpage, ".percent") %>%
    html_text() %>%
    str_replace("[%]", "") %>%
    as.numeric() # Pull and format rating value
  categories <- html_nodes(webpage, ".categoriesWrapper > a") %>%
    html_text() %>%
    str_c(collapse = ", ") # Pull official categories
  production <- html_nodes(webpage, ".production") %>%
    html_text() # Pull production type
  tags <- html_nodes(webpage, ".tagsWrapper > a") %>%
    html_text() %>%
    str_c(collapse = ", ") # Pull and format video tags
  added <- html_nodes(webpage, xpath = "/html/head/meta[9]") %>%
    html_attr(name = "content") %>%
    str_extract(pattern = trimpatdate) %>%
    str_replace("/", "") # Pull and format dates added
  viewkey <- html_nodes(webpage, xpath = "/html/head/link[4]") %>%
    html_attr(name = "href") %>%
    str_replace(pattern = trimpatvk, "") # Pull unique video identifier

  # Save video data as dataframe in indexed list
  randviddat.list[[i]] <- data.frame(
    title = or_na(title),
    views = or_na(views),
    rating = or_na(rating),
    categories = or_na(categories),
    production = or_na(production),
    tags = or_na(tags),
    added = or_na(added),
    viewkey = or_na(viewkey)
  )
  i <- i + 1
}

# Report a short harvest instead of silently leaving empty slots
if (i <= maxvid) {
  warning(
    "Collected ", i - 1, " of ", maxvid,
    " videos before reaching the attempt cap",
    call. = FALSE
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment