# Gist: scrape Hacker News comment threads for story links and first commenters.
# Requires: re, time, requests, bs4 (imported earlier in the original gist).
# Extract the numeric thread IDs from the front-page HTML.
# A broad pattern like r'id=(\d+)' also matches unrelated ids on the page, so
# we anchor on the per-story vote link ('vote?id=NNN&amp...') instead.
# Note the \ before the ? : '?' is a regex metacharacter and must be escaped
# to match a literal question mark.
IDsearch = re.compile(r'vote\?id=(\d+)&amp')
# BeautifulSoup output must be converted to a string before regex searching.
threadIDs = IDsearch.findall(str(cleanpagedata))
# Fetch the first comment-thread page and pull out its story link plus the
# username of the first commenter.
thread = requests.get(commentlinks[0])
cleanthread = bs4.BeautifulSoup(thread.text, 'html.parser')
threadhtml = str(cleanthread)  # regex works on strings, not soup objects
# < and > are escaped so they are matched literally, not as metacharacters.
singlethreadlinksearch = re.compile(r'\<a class="storylink" href="(.+?)"\>')
singlethreadlink = singlethreadlinksearch.findall(threadhtml)
commenterIDsearch = re.compile(r'user\?id=(.+?)"')
commenterIDs = commenterIDsearch.findall(threadhtml)
# Index 1, not 0 -- presumably the first user link belongs to the submitter
# rather than a commenter (TODO confirm against the page layout).
firstcommenter = commenterIDs[1]
def scrapethread(cleanthread):
    """Scrape one Hacker News comment-thread page.

    Parameters
    ----------
    cleanthread : bs4.BeautifulSoup (or anything str() renders to HTML)
        Parsed thread page; it is converted to a string for regex searching.

    Returns
    -------
    tuple
        (list of story links found on the page, username of the first
        commenter -- or the string "No commenters" when the page has fewer
        than two user links).
    """
    threadhtml = str(cleanthread)  # regex needs a plain string, not a soup object
    # \ escapes < and > (and ?) so they match literally instead of acting as
    # regex metacharacters.
    singlethreadlinksearch = re.compile(r'\<a class="storylink" href="(.+?)"\>')
    singlethreadlink = singlethreadlinksearch.findall(threadhtml)
    commenterIDsearch = re.compile(r'user\?id=(.+?)"')
    commenterIDs = commenterIDsearch.findall(threadhtml)
    try:
        # Index 1 skips the first user link -- presumably the submitter's own
        # profile link (TODO confirm against the page layout).
        firstcommenter = commenterIDs[1]
    except IndexError:
        # Narrowed from a bare except: only a missing index means "no
        # commenters"; anything else should propagate.
        firstcommenter = "No commenters"
    return singlethreadlink, firstcommenter
# Scrape every comment thread and collect one [link..., commenter] row each.
results = []
# Iterate the links directly instead of the range(len(...)) anti-pattern --
# the index i was only ever used to look up commentlinks[i].
for url in commentlinks:
    thread = requests.get(url)  # fetch each thread page
    cleanthread = bs4.BeautifulSoup(thread.text, 'html.parser')
    link, commenter = scrapethread(cleanthread)
    # scrapethread returns the link as a (possibly empty) list rather than a
    # string, so concatenate it with the commenter to build a flat row.
    results.append(link + [commenter])
    time.sleep(30)  # throttle requests so we don't hammer the site