# Gist: scrape Hacker News comment threads for story links and first commenters.
# Requires: re, time, requests, bs4 (imported earlier in the original gist).
# Extract the numeric thread IDs from the front-page HTML.
# A broad pattern like r'id=(\d+)' also matches unrelated ids on the page, so
# we anchor on the per-story vote link ('vote?id=NNN&amp...') instead.
# Note the \ before the ? : '?' is a regex metacharacter and must be escaped
# to match a literal question mark.
IDsearch = re.compile(r'vote\?id=(\d+)&amp')
# BeautifulSoup output must be converted to a string before regex searching.
threadIDs = IDsearch.findall(str(cleanpagedata))
# Fetch the first comment-thread page and pull out its story link plus the
# username of the first commenter.
thread = requests.get(commentlinks[0])
cleanthread = bs4.BeautifulSoup(thread.text, 'html.parser')
threadhtml = str(cleanthread)  # regex works on strings, not soup objects
# < and > are escaped so they are matched literally, not as metacharacters.
singlethreadlinksearch = re.compile(r'\<a class="storylink" href="(.+?)"\>')
singlethreadlink = singlethreadlinksearch.findall(threadhtml)
commenterIDsearch = re.compile(r'user\?id=(.+?)"')
commenterIDs = commenterIDsearch.findall(threadhtml)
# Index 1, not 0 -- presumably the first user link belongs to the submitter
# rather than a commenter (TODO confirm against the page layout).
firstcommenter = commenterIDs[1]
def scrapethread(cleanthread):
    """Scrape one Hacker News comment-thread page.

    Parameters
    ----------
    cleanthread : bs4.BeautifulSoup (or anything str() renders to HTML)
        Parsed thread page; it is converted to a string for regex searching.

    Returns
    -------
    tuple
        (list of story links found on the page, username of the first
        commenter -- or the string "No commenters" when the page has fewer
        than two user links).
    """
    threadhtml = str(cleanthread)  # regex needs a plain string, not a soup object
    # \ escapes < and > (and ?) so they match literally instead of acting as
    # regex metacharacters.
    singlethreadlinksearch = re.compile(r'\<a class="storylink" href="(.+?)"\>')
    singlethreadlink = singlethreadlinksearch.findall(threadhtml)
    commenterIDsearch = re.compile(r'user\?id=(.+?)"')
    commenterIDs = commenterIDsearch.findall(threadhtml)
    try:
        # Index 1 skips the first user link -- presumably the submitter's own
        # profile link (TODO confirm against the page layout).
        firstcommenter = commenterIDs[1]
    except IndexError:
        # Narrowed from a bare except: only a missing index means "no
        # commenters"; anything else should propagate.
        firstcommenter = "No commenters"
    return singlethreadlink, firstcommenter
# Scrape every comment thread and collect one [link..., commenter] row each.
results = []
# Iterate the links directly instead of the range(len(...)) anti-pattern --
# the index i was only ever used to look up commentlinks[i].
for url in commentlinks:
    thread = requests.get(url)  # fetch each thread page
    cleanthread = bs4.BeautifulSoup(thread.text, 'html.parser')
    link, commenter = scrapethread(cleanthread)
    # scrapethread returns the link as a (possibly empty) list rather than a
    # string, so concatenate it with the commenter to build a flat row.
    results.append(link + [commenter])
    time.sleep(30)  # throttle requests so we don't hammer the site