Skip to content

Instantly share code, notes, and snippets.

# Pattern that captures the href value of every <a class="storylink" href="..."> tag.
# (.+?) is non-greedy, so each capture ends at the first following '">'.
# NOTE(review): < and > are not regex metacharacters in Python's re module, so the
# backslashes in front of them are harmless but not required.
singlethreadlinksearch = re.compile(r'\<a class="storylink" href="(.+?)"\>')
# findall searches text, so cleanthread is converted with str() first; the result is a list of the captured href strings.
singlethreadlink = singlethreadlinksearch.findall(str(cleanthread))
# Fetch the page addressed by the first entry of commentlinks and parse its body.
# NOTE(review): assumes commentlinks is a non-empty sequence of URLs - confirm where it is built.
thread = requests.get(commentlinks[0])
# 'html.parser' selects the HTML parser from Python's standard library.
cleanthread = bs4.BeautifulSoup(thread.text, 'html.parser')
# Narrower pattern: capture only the digits that sit between "vote?id=" and "&amp".
# The ? is escaped because an unescaped ? is a regex quantifier, not a literal question mark.
IDsearch = re.compile(r'vote\?id=(\d+)&amp')
# Search the string form of cleanpagedata; the result is a list of the captured digit strings.
threadIDs = IDsearch.findall(str(cleanpagedata))
# Capture the run of digits that immediately follows every occurrence of "id=".
IDsearch = re.compile(r'id=(\d+)')
# findall searches text, so cleanpagedata is converted with str() first; the result is a list of the captured digit strings.
threadIDs = IDsearch.findall(str(cleanpagedata))
# Parse the text of pagedata with the standard-library HTML parser ('html.parser').
# NOTE(review): pagedata is presumably a requests response obtained earlier - confirm.
cleanpagedata = bs4.BeautifulSoup(pagedata.text, 'html.parser')
<html op="news"><head><meta content="origin" name="referrer"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><link href="news.css?Swdnfjd2lvQXPAqH2Hs6" rel="stylesheet" type="text/css"/>
<link href="favicon.ico" rel="shortcut icon"/>
<link href="rss" rel="alternate" title="RSS" type="application/rss+xml"/>
<title>Hacker News</title></head><body><center><table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
<tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.gif" style="border:1px white solid;" width="18"/></a></td>
<td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">
b'<html op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?Swdnfjd2lvQXPAqH2Hs6">\n <link rel="shortcut icon" href="favicon.ico">\n <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">\n <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">\n <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>\n <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>\n <a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments
import requests, bs4, re, time