Created
May 17, 2016 15:31
-
-
Save Xiol/6835eebf21e1c5bd54b831c6dc014923 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Given a load of HTML pages from the BBC Food website, this script will | |
# scrape the recipes from the HTML and output them in structured JSON format. | |
import argparse | |
import sys | |
import os | |
import json | |
import base64 | |
from bs4 import BeautifulSoup | |
def log(msg):
    """Write *msg* to stderr, terminated by a newline."""
    print(msg, file=sys.stderr)
def fatal(msg):
    """Log *msg* to stderr, then terminate the process with exit status 1."""
    log(msg)
    raise SystemExit(1)
class FoodParser:
    """Extracts a structured recipe from a saved BBC Food HTML page.

    Each ``get_*`` accessor degrades gracefully when the corresponding
    element is absent from the page: most return ``None``,
    ``get_recommendations`` returns ``-1``, and the list accessors
    return an empty list.
    """

    def __init__(self, path):
        """Parse the HTML file at *path*.

        Raises OSError/IOError if the file cannot be read.
        """
        log("Parsing {}".format(path))
        self.soup = self.soup_from_file(path)
        self.path = path

    def parse(self):
        """Return the full recipe as a JSON-serializable dict."""
        return {
            "title": self.get_title(),
            "author": self.get_author(),
            "from": self.get_from(),
            "prep_time": self.get_prep_time(),
            "cook_time": self.get_cooking_time(),
            "servings": self.get_servings(),
            "recommendations": self.get_recommendations(),
            "description": self.get_description(),
            "ingredients": self.get_ingredients(),
            "method": self.get_method(),
            "tips": self.get_tips(),
            "dietary": self.get_dietary(),
            "url": self.get_original_url(),
            "image": self.get_image(),
        }

    def get_title(self):
        """Return the recipe title, or None if not present."""
        try:
            return self.soup.find("h1", class_='content-title__text').text
        except AttributeError:
            return None

    def get_prep_time(self):
        """Return the preparation-time text, or None if not present."""
        try:
            return self.soup.find("p", class_='recipe-metadata__prep-time').text
        except AttributeError:
            return None

    def get_cooking_time(self):
        """Return the cooking-time text, or None if not present."""
        try:
            return self.soup.find("p", class_='recipe-metadata__cook-time').text
        except AttributeError:
            return None

    def get_servings(self):
        """Return the serving count as an int when it parses as
        "Serves N", otherwise the raw text, or None if not present."""
        try:
            servings = self.soup.find("p", class_="recipe-metadata__serving").text
        except AttributeError:
            return None
        if servings.startswith("Serves"):
            try:
                servings = int(servings.split(" ")[1])
            except ValueError:
                pass  # keep the raw text when the count is not numeric
        return servings

    def get_recommendations(self):
        """Return the recommendation count as an int, the raw text when
        it is not numeric, or -1 if the element is missing."""
        tag = self.soup.find("p", class_="recipe-metadata__recommendations")
        if tag is None:
            return -1
        text = tag.text
        try:
            return int(text.split(" ")[0])
        except ValueError:
            # Bug fix: int() raises ValueError, not AttributeError — the
            # original handler never fired and non-numeric text crashed.
            return text

    def get_author(self):
        """Return the chef's name, or "Unknown" if not present."""
        try:
            return self.soup.find(class_="chef").find(class_="chef__name").find(class_="chef__preposition", string="By").find_next_sibling("a").text
        except AttributeError:
            return "Unknown"

    def get_from(self):
        """Return the programme the recipe is from, or None if not present."""
        try:
            return self.soup.find(class_="chef").find(class_="chef__programme-name").find(class_="chef__preposition", string="From").find_next_sibling("a").text
        except AttributeError:
            return None

    def get_description(self):
        """Return the recipe description, or None if not present."""
        try:
            return self.soup.find("p", class_="recipe-description__text").text.strip()
        except AttributeError:
            return None

    def get_ingredients(self):
        """Return the list of ingredient strings (empty if none found)."""
        listing = self.soup.find("ul", class_="recipe-ingredients__list")
        if listing is None:
            # Robustness fix: previously a missing list crashed with an
            # uncaught AttributeError on .find_all().
            return []
        return [item.text.strip() for item in listing.find_all("li")]

    def get_method(self):
        """Return the list of method-step strings (empty if none found)."""
        listing = self.soup.find("ol", class_="recipe-method__list")
        if listing is None:
            # Robustness fix: previously a missing list crashed with an
            # uncaught AttributeError on .find_all().
            return []
        method = []
        for step in listing.find_all("li"):
            para = step.find("p")
            # Fall back to the <li> text when a step has no <p> wrapper.
            method.append((para if para is not None else step).text.strip())
        return method

    def get_tips(self):
        """Return the recipe tips, or None if not present."""
        try:
            return self.soup.find("p", class_="recipe-tips__text").text.strip()
        except AttributeError:
            return None

    def get_dietary(self):
        """Return the dietary information, or None if not present."""
        try:
            return self.soup.find("div", class_="recipe-metadata__dietary").text.strip()
        except AttributeError:
            return None

    def get_original_url(self):
        """Reconstruct the original bbc.co.uk URL from the file name.

        May not work all the time, but was fine for all the ones tested.
        """
        fname = os.path.basename(self.path)
        fname = fname.replace('.html', '')
        return "http://www.bbc.co.uk/food/recipes/{}".format(fname)

    def get_image(self):
        """Return the recipe image as base64 text, or None if missing
        or unreadable."""
        # KeyError covers <img> without a src attribute; TypeError covers
        # find("img") returning None — the original only caught
        # AttributeError and crashed on those pages.
        try:
            imgsrc = self.soup.find("div", class_="recipe-media")
            if not imgsrc:
                return None
            fname = imgsrc.find("img")['src']
        except (AttributeError, KeyError, TypeError):
            return None
        fpath = os.path.join(os.path.dirname(self.path), fname)
        try:
            with open(fpath, 'rb') as fh:
                img = fh.read()
        except (OSError, IOError):
            log("Could not open image at path: {}".format(fpath))
            return None
        return base64.b64encode(img).decode('utf-8')

    def soup_from_file(self, path):
        """Read *path* as bytes and return a parsed BeautifulSoup tree."""
        with open(path, 'rb') as fh:
            fc = fh.read()
        return BeautifulSoup(fc, 'html.parser')
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Parses recipes from the BBC Food website")
    parser.add_argument("-o", "--parse-one", type=str, help="Parse single HTML file")
    args = parser.parse_args()
    if args.parse_one:
        # Only the file open/parse can raise OSError/IOError; keep the
        # try body narrow so unrelated errors are not misreported.
        try:
            fp = FoodParser(args.parse_one)
        except (OSError, IOError) as e:
            fatal("Failed to open file: {}".format(e))
        print(json.dumps(fp.parse()))
    else:
        # Bug fix: the script previously exited silently (status 0) when
        # no file was supplied; show usage and signal the error instead.
        parser.print_help(sys.stderr)
        sys.exit(2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment