Last active
February 19, 2018 20:24
-
-
Save kbmorales/930981e3bd5ffdfced46256f97e88d74 to your computer and use it in GitHub Desktop.
Sample code block for Matt Kiang
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Purpose: scrape metadata from random videos (packages not included)
#
# Expects the caller to have attached the packages providing HEAD(), GET(),
# status_code(), read_html(), html_nodes(), html_text(), html_attr(), the
# str_*() helpers and %>%, and to have defined the regex patterns
# `trimpatdate` and `trimpatvk` before this section runs.
#
# Result: `randviddat.list`, a list of one-row data frames (one per video).
URL <- "https://www.pornhub.com/gay/video/random"

# Initialize data list and vars for storage
maxvid <- 1000                             # Number of videos to collect
randviddat.list <- vector("list", maxvid)  # Preallocated result slots
i <- 0                                     # Videos stored so far
attempts <- 0                              # Requests made so far
maxattempts <- 5 * maxvid                  # Cap so bad links cannot loop forever

# Reduce a scraped field to exactly one value. Indexing an empty vector
# with [1] yields a typed NA, so a selector that matches nothing no longer
# makes data.frame() fail with "differing number of rows", and a selector
# that matches several nodes no longer produces several rows per video.
first_or_na <- function(x) {
  x[1]
}

# Iterator for scraping.
# A while loop is used because reassigning `maxvid` inside
# `for (i in 1:maxvid)` does not extend the loop: skipped videos would
# leave NULL holes and fewer than `maxvid` records.
while (i < maxvid && attempts < maxattempts) {
  attempts <- attempts + 1
  Sys.sleep(runif(1, 0, 1)) # Sleep for 0 to 1 seconds

  # Pull web address of random video; skip this attempt on network errors
  randurl <- tryCatch(HEAD(URL)$url, error = function(e) NULL)
  if (is.null(randurl)) {
    next
  }

  resp <- tryCatch(GET(randurl), error = function(e) NULL)
  if (is.null(resp) || status_code(resp) != 200) {
    next # Try another video if bad link
  }

  # Parse the response already downloaded instead of fetching the page again
  webpage <- read_html(resp)

  # Skip the video if any element with class premiumLocked exists (paywall)
  if (length(html_nodes(webpage, ".premiumLocked")) > 0) {
    next
  }

  title <- html_nodes(webpage, ".inlineFree") %>%
    html_text() %>%
    first_or_na() # Pull title

  views <- html_nodes(webpage, ".count") %>%
    html_text() %>%
    str_replace_all("[,]", "") %>%
    as.numeric() %>%
    first_or_na() # Pull and format viewcount

  rating <- html_nodes(webpage, ".percent") %>%
    html_text() %>%
    str_replace("[%]", "") %>%
    as.numeric() %>%
    first_or_na() # Pull and format rating value

  categories <- html_nodes(webpage, ".categoriesWrapper > a") %>%
    html_text() %>%
    str_c(collapse = ", ") %>%
    first_or_na() # Pull official categories

  production <- html_nodes(webpage, ".production") %>%
    html_text() %>%
    first_or_na() # Pull production type

  tags <- html_nodes(webpage, ".tagsWrapper > a") %>%
    html_text() %>%
    str_c(collapse = ", ") %>%
    first_or_na() # Pull and format video tags

  # NOTE(review): positional XPath (meta[9], link[4]) depends on the exact
  # order of <head> elements -- confirm against the current page layout.
  added <- html_nodes(webpage, xpath = "/html/head/meta[9]") %>%
    html_attr(name = "content") %>%
    str_extract(pattern = trimpatdate) %>%
    str_replace("/", "") %>%
    first_or_na() # Pull and format dates added

  viewkey <- html_nodes(webpage, xpath = "/html/head/link[4]") %>%
    html_attr(name = "href") %>%
    str_replace(pattern = trimpatvk, "") %>%
    first_or_na() # Pull unique video identifier

  # Save video data as a one-row dataframe in the next free list slot
  i <- i + 1
  randviddat.list[[i]] <- data.frame(
    title, views, rating, categories, production, tags, added, viewkey,
    stringsAsFactors = FALSE
  )
}

# Drop unused slots if the attempt cap was hit before maxvid videos were stored
randviddat.list <- randviddat.list[seq_len(i)]
if (i < maxvid) {
  warning(
    "Collected ", i, " of ", maxvid, " videos after ", attempts, " attempts",
    call. = FALSE
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment