Skip to content

Instantly share code, notes, and snippets.

@dave-shawley
Created May 11, 2020 11:15
Show Gist options
  • Save dave-shawley/a46146d954d65763ba0a9cd4c669045b to your computer and use it in GitHub Desktop.
Save dave-shawley/a46146d954d65763ba0a9cd4c669045b to your computer and use it in GitHub Desktop.
Quick & dirty scraper for recipes from Milk Street
#!/usr/bin/env python
#
# Requirements: requests, beautifulsoup4
#
import argparse
import dataclasses
import html
import typing

import bs4
import requests
@dataclasses.dataclass
class Ingredient:
    """One row of a recipe's ingredient list.

    Both fields hold text exactly as it was extracted from the page; no
    parsing or normalisation of the amount is attempted.
    """

    # Amount text, taken from the element with class ``ingredient__quantity``.
    quantity: str
    # Ingredient description, taken from the ``ingredient__label`` element.
    label: str
@dataclasses.dataclass
class Recipe:
    """A scraped recipe: title, source URL, ingredients and directions.

    ``ingredients`` and ``instructions`` default to fresh empty lists (via
    ``default_factory``) so that instances never share a mutable default.
    """

    # Title text of the recipe.
    title: str
    # URL the recipe was fetched from.
    url: str
    # Ingredient rows, in the order they were appended.
    ingredients: typing.List[Ingredient] = dataclasses.field(
        default_factory=list)
    # Direction texts, one entry per step, in the order they were appended.
    instructions: typing.List[str] = dataclasses.field(default_factory=list)
# Seconds to wait for the recipe page before giving up; without an explicit
# timeout ``requests.get`` may block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30


def _require(node, description):
    """Return *node*, raising ``ValueError`` if the lookup found nothing.

    BeautifulSoup's ``find`` returns ``None`` on a miss; failing here gives
    a readable message instead of an ``AttributeError`` further down.
    """
    if node is None:
        raise ValueError(f'could not find {description} in the page')
    return node


def _text_of(node):
    """Return ``node.text``, or an empty string when *node* is ``None``."""
    return '' if node is None else node.text


def scrape_recipe(markup, url):
    """Build a ``Recipe`` from the HTML text of a recipe page.

    :param markup: HTML source of the page.
    :param url: address the page was fetched from; stored on the recipe.
    :returns: a ``Recipe`` with title, ingredients and instructions filled
        in from the elements selected by CSS class below.
    :raises ValueError: if the title, content container, ingredient list
        or direction list cannot be found.
    """
    soup = bs4.BeautifulSoup(markup, features='html.parser')
    content = _require(soup.find(class_='recipe__text__content'),
                       'the recipe content')
    heading = _require(soup.find('h1', class_='recipe-header__title'),
                       'the recipe title')
    recipe = Recipe(title=heading.text, url=url)

    ingredient_list = _require(
        content.find('ul', class_='ingredients-list'), 'the ingredient list')
    for item in ingredient_list.find_all('li', class_='ingredient'):
        # An entry missing its quantity or label element yields an empty
        # string for that field rather than aborting the whole scrape.
        recipe.ingredients.append(
            Ingredient(_text_of(item.find(class_='ingredient__quantity')),
                       _text_of(item.find(class_='ingredient__label'))))

    direction_list = _require(
        content.find('ol', class_='recipe__directions__list'),
        'the direction list')
    for step in direction_list.find_all('div',
                                        class_='recipe__direction__text'):
        recipe.instructions.append(step.text)
    return recipe


def render_html(recipe):
    """Return a minimal standalone HTML document for *recipe*.

    Every piece of scraped text, and the source URL, is passed through
    ``html.escape`` so characters such as ``&`` and ``<`` cannot break or
    inject markup.  For text without special characters the output is
    the same markup the script has always produced.

    :param recipe: object with ``title``, ``url``, ``ingredients`` (items
        having ``quantity`` and ``label``) and ``instructions`` attributes.
    :returns: the document as a single string.
    """
    esc = html.escape
    title = esc(recipe.title)
    parts = [
        '<html><head><title>', title,
        '</title><meta charset=utf-8></head><body>',
        '<h1>', title, '</h1>',
        '<h2>Ingredients</h2><table>',
    ]
    for ingredient in recipe.ingredients:
        parts += ['<tr><td>', esc(ingredient.quantity),
                  '</td><td>', esc(ingredient.label), '</td></tr>']
    parts.append('</table><h2>Directions</h2><ol>')
    for instruction in recipe.instructions:
        parts += ['<li>', esc(instruction), '</li>']
    parts += ['</ol><p><i>Extracted from ', esc(recipe.url),
              '</i></p></body></html>']
    return ''.join(parts)


def main(argv=None):
    """Fetch the recipe at URL and write it to OUTPUT as simple HTML.

    :param argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises ValueError: if the page lacks the expected recipe elements.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('url', metavar='URL', type=str)
    # The document declares charset=utf-8, so the file must be written as
    # UTF-8 regardless of the platform's locale encoding.
    parser.add_argument('output', metavar='OUTPUT',
                        type=argparse.FileType('w', encoding='utf-8'))
    opts = parser.parse_args(argv)

    rsp = requests.get(opts.url, timeout=REQUEST_TIMEOUT)
    rsp.raise_for_status()
    recipe = scrape_recipe(rsp.text, opts.url)

    # Close (and therefore flush) the output as soon as it is written.
    with opts.output as output:
        output.write(render_html(recipe))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment