Skip to content

Instantly share code, notes, and snippets.

@leekiernan
Created March 9, 2018 15:48
Show Gist options
  • Save leekiernan/bb8f738c8cb609e693d1e63713482d3b to your computer and use it in GitHub Desktop.
Save leekiernan/bb8f738c8cb609e693d1e63713482d3b to your computer and use it in GitHub Desktop.
# 404: https://www.whsmith.co.uk/dept/toys-and-games-toys-model-railway-14x00010
# 200: https://www.whsmith.co.uk/dept/books-biography-and-true-stories-humour-02x02589
from bs4 import BeautifulSoup
import requests
import re
import csv
import sys
import os
FILE_TO_OPEN = './urls.txt'
OUTPUT_CSV = 'levels.csv'
# Seconds to wait on a single request before giving up on that URL.
REQUEST_TIMEOUT = 30
# Captures the number of the hierarchy level that is marked as selected,
# e.g. 'level_3 ... selected' -> '3'. Compiled once, outside the loop.
SELECTED_LEVEL = re.compile(r'level_(\d+).*?selected')


def find_selected_level(hierarchy_lists):
    """Return the selected level number found in the trailing hierarchy lists.

    The last list is inspected first. On some pages the last hierarchy
    level is not the selected one, e.g.
    https://www.whsmith.co.uk/dept/books-computing-and-technology-enterprise-software-02x01112
    https://www.whsmith.co.uk/dept/entertainment-music-jazz-06x00021
    so the second from last list is tried as a fallback. Earlier lists are
    never considered.

    Args:
        hierarchy_lists: sequence of objects whose ``str()`` is their markup
            (the ``ul.hierarchy_level`` tags of a page, in document order).

    Returns:
        The captured level number as a string, or ``None`` when neither of
        the last two lists contains a selected level.
    """
    # [-2:] = at most the last two items; reversed so the last comes first.
    for candidate in reversed(hierarchy_lists[-2:]):
        capture = SELECTED_LEVEL.search(str(candidate))
        if capture is not None:
            return capture.group(1)
    return None


def format_progress(done, total):
    """Return a (rough) progress line such as ``[==========          ] 50.00% - 5/10 ``.

    Args:
        done: number of URLs processed so far.
        total: number of URLs expected overall.

    Returns:
        A 20-character bar, the percentage capped at 100 and the raw counts.
        A ``total`` of zero is reported as 100% instead of dividing by zero.
    """
    percent = min(100.0, 100.0 * done / total) if total else 100.0
    # One '=' per 5%, padded out to a fixed width of 20.
    bar = ('=' * int(percent / 5)).ljust(20)
    return "[{}] {:.2f}% - {}/{} ".format(bar, percent, done, total)


def lookup_level(url):
    """Fetch ``url`` and return its selected hierarchy level.

    Returns:
        ``'-1'`` when the page answers 404 (the HTML is not parsed in that
        case), the level number as a string when one is found, or ``None``
        when the page has no selected level in its last two hierarchy lists.

    Raises:
        requests.RequestException: when the request itself fails or times out.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # If 404 we shouldn't try to read the HTML.
    if response.status_code == 404:
        return '-1'
    soup = BeautifulSoup(response.text, 'html.parser')
    return find_selected_level(soup.find_all("ul", class_="hierarchy_level"))


def main():
    """Write the selected hierarchy level of every URL in FILE_TO_OPEN to a CSV."""
    # Read the urls up front: gives an exact total for the progress bar and
    # lets blank lines be dropped before any request is made.
    with open(FILE_TO_OPEN, 'r') as url_file:
        urls = [line.strip() for line in url_file if line.strip()]

    # newline='' lets the csv module control line endings itself.
    with open(OUTPUT_CSV, 'w', newline='') as csvfile:
        # Set up csv writer - write header row.
        writer = csv.DictWriter(csvfile, fieldnames=['URL', 'level'])
        writer.writeheader()

        for done, url in enumerate(urls, start=1):
            try:
                level = lookup_level(url)
            except requests.RequestException as error:
                # A single unreachable URL should not abort the whole run.
                print("Problem @ {0}: {1}".format(url, error))
            else:
                if level is None:
                    # Nothing written for these. Print to console for investigation;
                    # https://www.whsmith.co.uk/dept/books-childrens-02x00001
                    # https://www.whsmith.co.uk/dept/books-cookery-food-and-drink-02x00012
                    # https://www.whsmith.co.uk/books
                    print("Problem @ {0}".format(url))
                else:
                    writer.writerow({'URL': url, 'level': level})

            # Progress is updated for every URL, whatever its outcome.
            # [====================] 100.00% - 200/200
            sys.stdout.write('\r')
            sys.stdout.write(format_progress(done, len(urls)))
            sys.stdout.flush()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment