DanNi0130’s gists

DanNi0130 / seleniumcommentlist

Created February 25, 2019 08:50

	# Capture the numeric ID from every <a href="#/item/<digits>" anchor in the page markup.
	# The backslashes before "<" and "/" are optional in Python's re syntax, but harmless.
	jsIDsearch = re.compile(r'\<a href="#\/item\/(\d+)"')
	jsthreadIDs = jsIDsearch.findall(str(jspagedata))

	# Build one comment-page URL per thread ID, in the order the IDs were found.
	jscommentlinks = [
		'https://vuejs.github.io/vue-hackernews/#/item/' + threadID
		for threadID in jsthreadIDs
	]

DanNi0130 / seleniumpagedata

Created February 25, 2019 08:49

# Parse the HTML currently held by the Selenium driver (its page_source) with
# BeautifulSoup's built-in 'html.parser' backend.
jspagedata = bs4.BeautifulSoup(driver.page_source, 'html.parser')

DanNi0130 / seleniumlaunch

Created February 25, 2019 08:48

	# Create a Chrome WebDriver instance that the script will control.
	driver = webdriver.Chrome()
	# Navigate that browser to page 1 of the news listing.
	driver.get("https://vuejs.github.io/vue-hackernews/#!/news/1")

DanNi0130 / seleniumimport

Created February 25, 2019 08:47

	from selenium import webdriver
	import requests, bs4, re, time

DanNi0130 / jsscraperequestsresults

Created February 25, 2019 08:46

	<html lang="en">
	<head>
	<meta charset="utf-8"/>
	<title>Vue.js HN Clone</title>
	<meta content="initial-scale=1, maximum-scale=1, user-scalable=no, minimal-ui" name="viewport"/>
	<link href="static/logo.png" rel="icon" type="image/x-icon"/>
	</head>
	<body>
	<div id="app"></div>
	<script src="static/build.js"></script>

DanNi0130 / jsscraperequests

Created February 25, 2019 08:44

	# Fetch the page with a plain HTTP GET (no browser involved).
	# NOTE(review): requests only downloads the raw response; it presumably will not
	# contain content the page builds with JavaScript - confirm against the output.
	jspagedata = requests.get("https://vuejs.github.io/vue-hackernews/#!/news/1")
	# Parse the response body text with BeautifulSoup's built-in 'html.parser' backend.
	jspagedataclean = bs4.BeautifulSoup(jspagedata.text, 'html.parser')

DanNi0130 / requestsscrapeloop

Created February 24, 2019 07:18

	results = []  # One row per thread: the story link(s) followed by the first commenter
	for commentlink in commentlinks:
		thread = requests.get(commentlink)  # Fetch each comment page
		cleanthread = bs4.BeautifulSoup(thread.text, 'html.parser')
		link, commenter = scrapethread(cleanthread)  # Scrape the link list and the commenter name
		# The link comes back as a list rather than a string, so the commenter is
		# wrapped in a list to concatenate the two into a single row.
		results.append(link + [commenter])
		# Wait 30 seconds between requests (presumably to avoid hammering the site).
		time.sleep(30)

DanNi0130 / requestsscrapethread

Created February 24, 2019 07:17

	def scrapethread(cleanthread):
		"""Extract the story link(s) and the first commenter from one thread page.

		Args:
			cleanthread: The thread page data. It is converted with str() before
				searching, so any object whose string form is the page HTML works.

		Returns:
			A tuple (singlethreadlink, firstcommenter). singlethreadlink is the
			list of href values found on <a class="storylink" ...> anchors
			(empty if none). firstcommenter is the second user?id= value on the
			page, or the string "No commenters" when fewer than two are present.
		"""
		pagetext = str(cleanthread)

		singlethreadlinksearch = re.compile(r'\<a class="storylink" href="(.+?)"\>')
		singlethreadlink = singlethreadlinksearch.findall(pagetext)

		commenterIDsearch = re.compile(r'user\?id=(.+?)"')
		commenterIDs = commenterIDsearch.findall(pagetext)

		try:
			# Index 1 is the second match; index 0 is presumably the story
			# submitter rather than a commenter - confirm against the page layout.
			firstcommenter = commenterIDs[1]
		except IndexError:
			# Fewer than two user IDs on the page means nobody has commented.
			firstcommenter = "No commenters"
		return singlethreadlink, firstcommenter

DanNi0130 / requestsscrapefirstcommenter

Created February 24, 2019 07:16

firstcommenter = commenterIDs[1] # Python lists are zero-indexed, so index 1 is the second match (raises IndexError if there are fewer than two)

DanNi0130 / requestsscrapecommenterregex

Created February 24, 2019 07:15

	# Capture whatever follows "user?id=" up to the next double quote (non-greedy match).
	commenterIDsearch = re.compile(r'user\?id=(.+?)"')
	# Search the string form of the page and collect every captured ID into a list.
	commenterIDs = commenterIDsearch.findall(str(cleanthread))