Skip to content

Instantly share code, notes, and snippets.

# Capture the numeric thread ID from every `<a href="#/item/<id>"` anchor in the
# rendered page. The backslashes before `<` and `/` are optional in Python's
# `re` syntax (neither character is special); the pattern is kept as written.
jsIDsearch = re.compile(r'\<a href="#\/item\/(\d+)"')
jsthreadIDs = jsIDsearch.findall(str(jspagedata))
# Turn each captured ID into the URL of its comment thread.
jscommentlinks = [
    'https://vuejs.github.io/vue-hackernews/#/item/' + thread_id
    for thread_id in jsthreadIDs
]
# Start a Chrome session through Selenium and load the listing page; the
# driver has to exist before its page source can be read.
driver = webdriver.Chrome()
driver.get("https://vuejs.github.io/vue-hackernews/#!/news/1")
# Parse the HTML currently held by the browser with BeautifulSoup.
jspagedata = bs4.BeautifulSoup(driver.page_source, 'html.parser')
# NOTE(review): nothing in the visible code closes the browser; call
# driver.quit() once all pages have been read.
from selenium import webdriver
import requests, bs4, re, time
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Vue.js HN Clone</title>
<meta content="initial-scale=1, maximum-scale=1, user-scalable=no, minimal-ui" name="viewport"/>
<link href="static/logo.png" rel="icon" type="image/x-icon"/>
</head>
<body>
<div id="app"></div>
<script src="static/build.js"></script>
# Fetch the listing page with a plain HTTP GET (no browser involved). The
# timeout stops a stalled connection from blocking the script indefinitely.
jspagedata = requests.get("https://vuejs.github.io/vue-hackernews/#!/news/1", timeout=10)
# Parse the response body with BeautifulSoup's built-in HTML parser.
jspagedataclean = bs4.BeautifulSoup(jspagedata.text, 'html.parser')
results = []  # We want our results to come back as a list
for commentlink in commentlinks:
    # Go to each link; the timeout keeps one stalled request from hanging the run.
    thread = requests.get(commentlink, timeout=10)
    cleanthread = bs4.BeautifulSoup(thread.text, 'html.parser')
    # Scrape the data and return them to these variables.
    link, commenter = scrapethread(cleanthread)
    # `link` comes back as a list rather than a string, so the commenter is
    # wrapped in a list to concatenate the two into one row.
    results.append(link + [commenter])
    # Wait 30 seconds before requesting the next thread.
    time.sleep(30)
def scrapethread(cleanthread):
    """Extract the story link(s) and the first commenter from one thread page.

    Args:
        cleanthread: The thread page; it is converted with ``str()`` before
            searching, so a parsed BeautifulSoup document or a plain HTML
            string both work.

    Returns:
        A ``(links, first_commenter)`` tuple. ``links`` is the list of every
        ``href`` found on an ``<a class="storylink">`` anchor (possibly
        empty). ``first_commenter`` is the second ``user?id=`` value on the
        page, or ``"No commenters"`` when fewer than two are present.
    """
    singlethreadlinksearch = re.compile(r'\<a class="storylink" href="(.+?)"\>')
    singlethreadlink = singlethreadlinksearch.findall(str(cleanthread))

    commenterIDsearch = re.compile(r'user\?id=(.+?)"')
    commenterIDs = commenterIDsearch.findall(str(cleanthread))

    try:
        # Index 1 skips the first user link on the page (presumably the
        # story's submitter -- confirm against the page layout).
        firstcommenter = commenterIDs[1]
    except IndexError:
        # Fewer than two user links: there is no commenter to report.
        firstcommenter = "No commenters"

    return singlethreadlink, firstcommenter
# Collect every user ID that appears in a `user?id=...` link on the thread
# page; the list must be built before it can be indexed.
commenterIDsearch = re.compile(r'user\?id=(.+?)"')
commenterIDs = commenterIDsearch.findall(str(cleanthread))
# Remember that Python lists start with 0, so [1] is the second match. Fall
# back to a placeholder when the page has fewer than two user links instead of
# raising IndexError.
firstcommenter = commenterIDs[1] if len(commenterIDs) > 1 else "No commenters"