leekiernan · March 9, 2018 15:48
diff --git a/scraper.py b/scraper.py
 # 404: https://www.whsmith.co.uk/dept/toys-and-games-toys-model-railway-14x00010
 # 200: https://www.whsmith.co.uk/dept/books-biography-and-true-stories-humour-02x02589

 from bs4 import BeautifulSoup
 import requests
 import re
 import csv
 import sys
 import os

 FILE_TO_OPEN = './urls.txt'

 # Rough width of one URL line in bytes; used only to guesstimate how many
 # lines the urls file holds, for the progress bar.
 APPROX_LINE_WIDTH = 93

 # Captures the number in "level_<n>" when "selected" follows it.
 # Raw string so \d is not read as a string escape; compiled once, outside the loop.
 SELECTED_LEVEL_RE = re.compile(r'level_(\d+).*?selected')


 def _selected_level(soup):
 	"""Return the selected hierarchy level of a parsed page as a string, or None.

 	The last ``ul.hierarchy_level`` is checked first. On some pages the last
 	level is not the selected one, e.g.
 	https://www.whsmith.co.uk/dept/books-computing-and-technology-enterprise-software-02x01112
 	https://www.whsmith.co.uk/dept/entertainment-music-jazz-06x00021
 	so the second-to-last list is tried as a fallback.

 	Pages known to match neither (None is returned):
 	https://www.whsmith.co.uk/dept/books-childrens-02x00001
 	https://www.whsmith.co.uk/dept/books-cookery-food-and-drink-02x00012
 	https://www.whsmith.co.uk/books
 	"""
 	hierarchy_lists = soup.find_all("ul", class_="hierarchy_level")
 	# Slices ([-1:] = last item, [-2:-1] = second to last) instead of indexes,
 	# so a page with fewer lists yields an empty slice rather than an error.
 	for candidate in (hierarchy_lists[-1:], hierarchy_lists[-2:-1]):
 		found = SELECTED_LEVEL_RE.search(str(candidate))
 		if found:
 			return found.group(1)
 	return None


 def _show_progress(done, expected):
 	"""Redraw the one-line console progress bar.

 	``expected`` is only an estimate, so the bar is capped at 100% while the
 	counter itself may overshoot:
 	[====================] 100.00% - 210/200
 	"""
 	# Scale to 0-100 before capping; the bar is 20 characters, one per 5%.
 	percent = min(100.0, 100.0 * done / expected)

 	sys.stdout.write('\r')
 	sys.stdout.write("[{}] {}% - {}/{} ".format(
 		('=' * int(percent / 5)).ljust(20),
 		"{0:.{1}f}".format(percent, 2),
 		done,
 		expected)
 	)
 	sys.stdout.flush()


 # with open() handles closing files correctly. Creates the csv in write mode;
 # newline='' is what the csv module asks for so it controls line endings itself.
 with open('levels.csv', 'w', newline='') as csvfile:
 	# Set up csv writer - write header row.
 	fieldnames = ['URL', 'level']
 	writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 	writer.writeheader()

 	# Read in the urls file - read only.
 	with open(FILE_TO_OPEN, 'r') as urls:
 		# Guesstimate lines from size divided by rough line width. Never less
 		# than 1, so a very short file cannot cause a division by zero.
 		lines_in_file = max(1, int(os.path.getsize(FILE_TO_OPEN) / APPROX_LINE_WIDTH))
 		total_requests = 0

 		# Read one line at a time.
 		for line in urls:
 			url = line.strip()  # Drop the newline and any surrounding whitespace.
 			if not url:
 				# Blank line: nothing to request.
 				continue

 			req = requests.get(url)
 			total_requests += 1

 			if req.status_code == 404:
 				# If 404 we shouldn't try to read the HTML; record it as level -1.
 				writer.writerow({'URL': url, 'level': '-1'})
 			else:
 				soup = BeautifulSoup(req.text, 'html.parser')
 				level = _selected_level(soup)
 				if level is None:
 					# Nothing written to the csv. Print to console for investigation.
 					print("Problem @ {0}".format(url))
 				else:
 					# Write the stripped url, not the raw line, so the cell
 					# does not contain a trailing newline.
 					writer.writerow({'URL': url, 'level': level})

 			# Every request counts towards progress, whatever its outcome.
 			_show_progress(total_requests, lines_in_file)
	# 404: https://www.whsmith.co.uk/dept/toys-and-games-toys-model-railway-14x00010
	# 200: https://www.whsmith.co.uk/dept/books-biography-and-true-stories-humour-02x02589

	from bs4 import BeautifulSoup
	import requests
	import re
	import csv
	import sys
	import os

	FILE_TO_OPEN = './urls.txt'

	# Rough width of one URL line in bytes; used only to guesstimate how many
	# lines the urls file holds, for the progress bar.
	APPROX_LINE_WIDTH = 93

	# Captures the number in "level_<n>" when "selected" follows it.
	# Raw string so \d is not read as a string escape; compiled once, outside the loop.
	SELECTED_LEVEL_RE = re.compile(r'level_(\d+).*?selected')


	def _selected_level(soup):
		"""Return the selected hierarchy level of a parsed page as a string, or None.

		The last ``ul.hierarchy_level`` is checked first. On some pages the last
		level is not the selected one, e.g.
		https://www.whsmith.co.uk/dept/books-computing-and-technology-enterprise-software-02x01112
		https://www.whsmith.co.uk/dept/entertainment-music-jazz-06x00021
		so the second-to-last list is tried as a fallback.

		Pages known to match neither (None is returned):
		https://www.whsmith.co.uk/dept/books-childrens-02x00001
		https://www.whsmith.co.uk/dept/books-cookery-food-and-drink-02x00012
		https://www.whsmith.co.uk/books
		"""
		hierarchy_lists = soup.find_all("ul", class_="hierarchy_level")
		# Slices ([-1:] = last item, [-2:-1] = second to last) instead of indexes,
		# so a page with fewer lists yields an empty slice rather than an error.
		for candidate in (hierarchy_lists[-1:], hierarchy_lists[-2:-1]):
			found = SELECTED_LEVEL_RE.search(str(candidate))
			if found:
				return found.group(1)
		return None


	def _show_progress(done, expected):
		"""Redraw the one-line console progress bar.

		``expected`` is only an estimate, so the bar is capped at 100% while the
		counter itself may overshoot:
		[====================] 100.00% - 210/200
		"""
		# Scale to 0-100 before capping; the bar is 20 characters, one per 5%.
		percent = min(100.0, 100.0 * done / expected)

		sys.stdout.write('\r')
		sys.stdout.write("[{}] {}% - {}/{} ".format(
			('=' * int(percent / 5)).ljust(20),
			"{0:.{1}f}".format(percent, 2),
			done,
			expected)
		)
		sys.stdout.flush()


	# with open() handles closing files correctly. Creates the csv in write mode;
	# newline='' is what the csv module asks for so it controls line endings itself.
	with open('levels.csv', 'w', newline='') as csvfile:
		# Set up csv writer - write header row.
		fieldnames = ['URL', 'level']
		writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
		writer.writeheader()

		# Read in the urls file - read only.
		with open(FILE_TO_OPEN, 'r') as urls:
			# Guesstimate lines from size divided by rough line width. Never less
			# than 1, so a very short file cannot cause a division by zero.
			lines_in_file = max(1, int(os.path.getsize(FILE_TO_OPEN) / APPROX_LINE_WIDTH))
			total_requests = 0

			# Read one line at a time.
			for line in urls:
				url = line.strip()  # Drop the newline and any surrounding whitespace.
				if not url:
					# Blank line: nothing to request.
					continue

				req = requests.get(url)
				total_requests += 1

				if req.status_code == 404:
					# If 404 we shouldn't try to read the HTML; record it as level -1.
					writer.writerow({'URL': url, 'level': '-1'})
				else:
					soup = BeautifulSoup(req.text, 'html.parser')
					level = _selected_level(soup)
					if level is None:
						# Nothing written to the csv. Print to console for investigation.
						print("Problem @ {0}".format(url))
					else:
						# Write the stripped url, not the raw line, so the cell
						# does not contain a trailing newline.
						writer.writerow({'URL': url, 'level': level})

				# Every request counts towards progress, whatever its outcome.
				_show_progress(total_requests, lines_in_file)