Created
March 9, 2018 15:48
-
-
Save leekiernan/bb8f738c8cb609e693d1e63713482d3b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 404: https://www.whsmith.co.uk/dept/toys-and-games-toys-model-railway-14x00010
# 200: https://www.whsmith.co.uk/dept/books-biography-and-true-stories-humour-02x02589
#
# Reads URLs from FILE_TO_OPEN (one per line), fetches each page and writes the
# number of its selected "hierarchy_level" list to levels.csv.
#   * A 404 response is recorded with level -1.
#   * Pages where no selected level can be found, or whose request fails, are
#     reported on the console and left out of the CSV.
import csv
import os
import re
import sys

import requests
from bs4 import BeautifulSoup

FILE_TO_OPEN = './urls.txt'

# Seconds to wait for a response, so one stalled server cannot hang the run.
REQUEST_TIMEOUT = 30

# Captures the digits after "level_" when "selected" follows on the same line
# of markup ('.' does not match newlines). Compiled once, used for every page.
LEVEL_PATTERN = re.compile(r'level_(\d+).*?selected')


def _selected_level(soup):
    """Return the selected hierarchy level of a parsed page, or None.

    The last ``ul.hierarchy_level`` element is searched first. On some pages
    the last list has nothing selected, e.g.
    https://www.whsmith.co.uk/dept/books-computing-and-technology-enterprise-software-02x01112
    https://www.whsmith.co.uk/dept/entertainment-music-jazz-06x00021
    so the second-to-last list is tried as a fallback.

    Pages known to match neither list:
    https://www.whsmith.co.uk/dept/books-childrens-02x00001
    https://www.whsmith.co.uk/dept/books-cookery-food-and-drink-02x00012
    https://www.whsmith.co.uk/books
    """
    hierarchy = soup.find_all("ul", class_="hierarchy_level")
    # Slices never raise on short lists: [-1:] is the last item (or empty),
    # [-2:-1] is the second-to-last item (or empty).
    for candidates in (hierarchy[-1:], hierarchy[-2:-1]):
        match = LEVEL_PATTERN.search(str(candidates))
        if match is not None:
            return match.group(1)
    return None


def _level_for(url):
    """Fetch *url* and return its level as a string.

    Returns '-1' for a 404 response (the HTML is not parsed), the captured
    level digits when a selected level is found, or None when it is not.
    Raises requests.RequestException when the request itself fails.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code == 404:
        return '-1'
    return _selected_level(BeautifulSoup(response.text, 'html.parser'))


def _show_progress(done, total):
    """Redraw the progress bar, e.g. ``[==========          ] 50.00% - 5/10``."""
    # Scale to a real percentage; each '=' in the 20-character bar is 5%.
    percent = min(100.0, 100.0 * done / total) if total else 100.0
    sys.stdout.write('\r')
    sys.stdout.write("[{}] {}% - {}/{} ".format(
        ('=' * int(percent / 5)).ljust(20),
        "{0:.2f}".format(percent),
        done,
        total,
    ))
    sys.stdout.flush()


# Read the URL list up front: this gives an exact total for the progress bar
# and means a missing urls file fails before levels.csv is truncated.
# Blank lines are skipped.
with open(FILE_TO_OPEN, 'r') as urls:
    pending = [line.strip() for line in urls if line.strip()]

# newline='' lets the csv module control line endings itself.
with open('levels.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['URL', 'level'])
    writer.writeheader()

    for done, url in enumerate(pending, start=1):
        try:
            level = _level_for(url)
        except requests.RequestException as error:
            # One unreachable URL should not abort the whole run.
            print("Problem @ {0} ({1})".format(url, error))
        else:
            if level is None:
                # No selected level found; print to console for investigation.
                print("Problem @ {0}".format(url))
            else:
                writer.writerow({'URL': url, 'level': level})

        # Progress advances for every URL, including 404s and problems.
        _show_progress(done, len(pending))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment