Skip to content

Instantly share code, notes, and snippets.

# Fetch the page with a plain HTTP GET and parse whatever markup the server returns.
# NOTE(review): requests does not execute JavaScript, so for a client-rendered page
# the parsed result presumably contains only the static shell (e.g. an empty
# <div id="app">) rather than the story list -- confirm against the actual response.
jspagedata = requests.get("https://vuejs.github.io/vue-hackernews/#!/news/1")
jspagedataclean = bs4.BeautifulSoup(jspagedata.text, 'html.parser')
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Vue.js HN Clone</title>
<meta content="initial-scale=1, maximum-scale=1, user-scalable=no, minimal-ui" name="viewport"/>
<link href="static/logo.png" rel="icon" type="image/x-icon"/>
</head>
<body>
<div id="app"></div>
<script src="static/build.js"></script>
from selenium import webdriver
import requests, bs4, re, time
# Load the page in a real Chrome instance so that page_source reflects the DOM
# after the page's scripts have run, then hand that markup to BeautifulSoup.
# The driver is deliberately left open here; NOTE(review): it is presumably
# reused later to visit each comment link -- call driver.quit() once done.
driver = webdriver.Chrome()
driver.get("https://vuejs.github.io/vue-hackernews/#!/news/1")
jspagedata = bs4.BeautifulSoup(driver.page_source, 'html.parser')

# Capture the numeric ID from every anchor of the form <a href="#/item/<digits>">.
# The backslashes before "<" and "/" are redundant in Python's re (neither
# character is special) but harmless, so the pattern is kept as written.
jsIDsearch = re.compile(r'\<a href="#\/item\/(\d+)"')
jsthreadIDs = jsIDsearch.findall(str(jspagedata))

# One comment-page URL per captured ID, in the order the IDs were found
# (duplicates in jsthreadIDs are kept as-is).
jscommentlinks = [
    'https://vuejs.github.io/vue-hackernews/#/item/' + thread_id
    for thread_id in jsthreadIDs
]
# Compiled once at import time instead of on every call; pattern text is unchanged.
_JS_THREAD_LINK_RE = re.compile(r'\<a class="title" href="(.+?)"')
_JS_COMMENTER_ID_RE = re.compile(r'#\/user\/(.+?)"')


def jsscrapethread(jscleanthread):
    """Extract the story link(s) and the first commenter from a thread page.

    Args:
        jscleanthread: The parsed thread page (or any object whose ``str()``
            is the page markup); it is converted with ``str()`` before the
            regular expressions are applied.

    Returns:
        A ``(links, first_commenter)`` tuple where ``links`` is the list of
        ``href`` values of every ``<a class="title" ...>`` anchor found, and
        ``first_commenter`` is the second ``#/user/<name>`` match in the
        markup, or the string ``"No Commenters"`` when fewer than two user
        references are present.
    """
    markup = str(jscleanthread)
    jssinglethreadlink = _JS_THREAD_LINK_RE.findall(markup)
    jscommenterIDs = _JS_COMMENTER_ID_RE.findall(markup)

    # Index 1 rather than 0: the first user reference is presumably the
    # submitter of the thread, not a commenter -- TODO confirm against the page.
    try:
        jsfirstcommenter = jscommenterIDs[1]
    except IndexError:
        # Only "not enough matches" is an expected condition; anything else
        # should surface instead of being silently swallowed.
        jsfirstcommenter = "No Commenters"

    return jssinglethreadlink, jsfirstcommenter
>> pagedata
<Response [200]>
threadIDs = {list} ['19279396', '19279396', '19279396', '19279396', '19277809', '19277809', '19277809', '19277809', '19279003', '19279003', '19279003', '19279003', '19278075', '19278075', '19278075', '19278075', '19273955', '19273955', '19273955', '19273955', '19278555', '19
000 = {str} '19279396'
001 = {str} '19279396'
002 = {str} '19279396'
003 = {str} '19279396'
004 = {str} '19277809'
005 = {str} '19277809'
006 = {str} '19277809'
007 = {str} '19277809'
008 = {str} '19279003'
threadIDs = {list} ['19279396', '19277809', '19279003', '19278075', '19273955', '19278555', '19278936', '19274941', '19277846', '19277653', '19274406', '19278891', '19278302', '19276113', '19277263', '19276977', '19277978', '19275755', '19276751', '19277910', '19275738', '19
00 = {str} '19279396'
01 = {str} '19277809'
02 = {str} '19279003'
03 = {str} '19278075'
04 = {str} '19273955'
05 = {str} '19278555'
06 = {str} '19278936'
07 = {str} '19274941'
08 = {str} '19277846'