kbmorales · February 19, 2018 20:24
diff --git a/scrape_example.R b/scrape_example.R
 # Purpose: scrape metadata from random videos (packages not included)
 # NOTE(review): HEAD/GET/status_code, read_html/html_nodes/html_text/html_attr,
 # the str_* helpers and %>% are presumably provided by httr, rvest/xml2 and
 # stringr, and the patterns `trimpatdate` / `trimpatvk` must already be
 # defined -- none of that setup is visible here; confirm before running.
 URL <- "https://www.pornhub.com/gay/video/random"

 # Initialize data list and vars for storage
 randviddat.list <- list()
 i <- 1                                     # Next free slot in randviddat.list
 maxvid <- 1000                             # Number of videos to collect
 attempts <- 0                              # Requests made so far
 maxattempts <- 10 * maxvid                 # Cap so an outage cannot loop forever

 # Iterator for scraping
 # A while loop replaces `for (i in 1:maxvid)`: the for sequence is evaluated
 # once, so raising maxvid inside the body never added iterations and every
 # skipped video left a NULL hole instead of being replaced by another one.
 while (i <= maxvid && attempts < maxattempts) {
   attempts <- attempts + 1
   Sys.sleep(runif(1, 0, 1))                # Sleep for 0 to 1 seconds
   randurl <- HEAD(URL)[[1]]                # Pull web address of random video
   resp <- GET(randurl)
   if (status_code(resp) != 200) {
     next                                   # Bad link: try another video
   }
   webpage <- read_html(resp)               # Parse the page fetched just above
   # Skip if any node carries the premiumLocked class (video behind paywall)
   if (length(html_nodes(webpage, '.premiumLocked')) > 0) {
     next
   }
   title <- html_nodes(webpage, '.inlineFree') %>%
     html_text()                            # Pull title
   views <- html_nodes(webpage, '.count') %>%
     html_text() %>%
     str_replace_all("[,]", "") %>%
     as.numeric()                           # Pull and format viewcount
   rating <- html_nodes(webpage, '.percent') %>%
     html_text() %>%
     str_replace("[%]", "") %>%
     as.numeric()                           # Pull and format rating value
   categories <- html_nodes(webpage, '.categoriesWrapper > a') %>%
     html_text() %>%
     str_c(collapse = ", ")                 # Pull official categories
   production <- html_nodes(webpage, '.production') %>%
     html_text()                            # Pull production type
   tags <- html_nodes(webpage, '.tagsWrapper > a') %>%
     html_text() %>%
     str_c(collapse = ", ")                 # Pull and format video tags
   added <- html_nodes(webpage, xpath = '/html/head/meta[9]') %>%
     html_attr(name = "content") %>%
     str_extract(pattern = trimpatdate) %>%
     str_replace("/", "")                   # Pull and format dates added
   viewkey <- html_nodes(webpage, xpath = '/html/head/link[4]') %>%
     html_attr(name = "href") %>%
     str_replace(pattern = trimpatvk, "")   # Pull unique video identifier
   # Save video data as dataframe in indexed list
   # NOTE(review): data.frame() errors if a selector matched nothing and
   # recycles rows if one matched several nodes -- verify against real pages.
   randviddat.list[[i]] <-
     data.frame(title, views, rating, categories, production, tags, added, viewkey)
   i <- i + 1                               # Advance only after a successful save
 }
 if (i <= maxvid) {
   warning("Stopped after ", attempts, " attempts with only ", i - 1,
           " of ", maxvid, " videos scraped", call. = FALSE)
 }
	# Purpose: scrape metadata from random videos (packages not included)
	# NOTE(review): HEAD/GET/status_code, read_html/html_nodes/html_text/html_attr,
	# the str_* helpers and %>% are presumably provided by httr, rvest/xml2 and
	# stringr, and the patterns `trimpatdate` / `trimpatvk` must already be
	# defined -- none of that setup is visible here; confirm before running.
	URL <- "https://www.pornhub.com/gay/video/random"

	# Initialize data list and vars for storage
	randviddat.list <- list()
	i <- 1                                    # Next free slot in randviddat.list
	maxvid <- 1000                            # Number of videos to collect
	attempts <- 0                             # Requests made so far
	maxattempts <- 10 * maxvid                # Cap so an outage cannot loop forever

	# Iterator for scraping
	# A while loop replaces `for (i in 1:maxvid)`: the for sequence is evaluated
	# once, so raising maxvid inside the body never added iterations and every
	# skipped video left a NULL hole instead of being replaced by another one.
	while (i <= maxvid && attempts < maxattempts) {
	  attempts <- attempts + 1
	  Sys.sleep(runif(1, 0, 1))               # Sleep for 0 to 1 seconds
	  randurl <- HEAD(URL)[[1]]               # Pull web address of random video
	  resp <- GET(randurl)
	  if (status_code(resp) != 200) {
	    next                                  # Bad link: try another video
	  }
	  webpage <- read_html(resp)              # Parse the page fetched just above
	  # Skip if any node carries the premiumLocked class (video behind paywall)
	  if (length(html_nodes(webpage, '.premiumLocked')) > 0) {
	    next
	  }
	  title <- html_nodes(webpage, '.inlineFree') %>%
	    html_text()                           # Pull title
	  views <- html_nodes(webpage, '.count') %>%
	    html_text() %>%
	    str_replace_all("[,]", "") %>%
	    as.numeric()                          # Pull and format viewcount
	  rating <- html_nodes(webpage, '.percent') %>%
	    html_text() %>%
	    str_replace("[%]", "") %>%
	    as.numeric()                          # Pull and format rating value
	  categories <- html_nodes(webpage, '.categoriesWrapper > a') %>%
	    html_text() %>%
	    str_c(collapse = ", ")                # Pull official categories
	  production <- html_nodes(webpage, '.production') %>%
	    html_text()                           # Pull production type
	  tags <- html_nodes(webpage, '.tagsWrapper > a') %>%
	    html_text() %>%
	    str_c(collapse = ", ")                # Pull and format video tags
	  added <- html_nodes(webpage, xpath = '/html/head/meta[9]') %>%
	    html_attr(name = "content") %>%
	    str_extract(pattern = trimpatdate) %>%
	    str_replace("/", "")                  # Pull and format dates added
	  viewkey <- html_nodes(webpage, xpath = '/html/head/link[4]') %>%
	    html_attr(name = "href") %>%
	    str_replace(pattern = trimpatvk, "")  # Pull unique video identifier
	  # Save video data as dataframe in indexed list
	  # NOTE(review): data.frame() errors if a selector matched nothing and
	  # recycles rows if one matched several nodes -- verify against real pages.
	  randviddat.list[[i]] <-
	    data.frame(title, views, rating, categories, production, tags, added, viewkey)
	  i <- i + 1                              # Advance only after a successful save
	}
	if (i <= maxvid) {
	  warning("Stopped after ", attempts, " attempts with only ", i - 1,
	          " of ", maxvid, " videos scraped", call. = FALSE)
	}