Extract recipes for cook4me express
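A small scraper for the cook4me express recipes published on club.t-fal.co.jp: it walks the category listing pages, collects the recipe IDs, fetches each detail page, extracts the title, preparation and cooking times, calories, genre, difficulty, ingredients, instructions and image URL, and writes the whole collection to recipes.json.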
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.request import urlopen
import re
import json
from pprint import pprint

from bs4 import BeautifulSoup

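# URL templates for the cook4me express category pages and the per-recipe detail
# pages; LIST_PAGES holds the range() arguments for the listing pages to crawl
# (pages 1 through 10)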
BASE_URL = 'http://www.club.t-fal.co.jp'
LIST_URL = BASE_URL + '/recipe/category/c4m-express/{:d}/'
DETAIL_URL = BASE_URL + '/recipe/detail/{:d}/'
LIST_PAGES = (1, 11)

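# fetch a page and parse it with BeautifulSoup using the lxml parser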
def get(url):
    return BeautifulSoup(urlopen(url).read().decode('utf-8'), 'lxml')

def extract_detail(soup, id_):
    # returns a dict with the following fields:
    #   id: int
    #   title: string
    #   cook_duration: int (in minutes)
    #   calorie: int (in kcal)
    #   genre: string
    #   difficulty: string
    #   prep_duration: int (in minutes)
    #   comment: string
    #   ingredients: list of (name: string * amount: string)
    #   yield: int
    #   instructions: list of string
    #   img_url: string
    print('parsing detail id={}'.format(id_))
    r = soup.select_one('div[itemtype="http://data-vocabulary.org/Recipe"]')
    title = r.select_one('h2[itemprop="name"]').text
    # assuming one image per recipe
    img_url = BASE_URL + r.select_one('div#recipe_photo img').attrs['src']
    # comment & prep_duration
    s = r.select_one('p[itemprop="summary"]').text
    # e.g. s == '根菜をたっぷりとれるヘルシーおかず。\n\n【準備時間:15分】'
    # (a one-line description followed by "【preparation time: 15 min】")
    m = re.search(r'【準備時間:(\d+)分】', s)
    prep_duration = int(m.group(1)) if m is not None else None
    comment = s.split('【')[0].strip()
    # cooking time ends with a trailing '分' (minutes); drop it before int()
    cook_duration = int(r.select_one('div#recipe_content ul li.r_time time').text.strip()[:-1])
    # calories follow a ':' and end with 'kcal'; strip both
    calorie = int(r.select_one('div#recipe_content ul li.cal').text.split(':')[-1][:-4])
    genre = r.select_one('div#recipe_content ul li.genre').text.split(':')[-1].strip()
    difficulty = r.select_one('div#recipe_content ul li.level').text.split(':')[-1].strip()
    instructions = [elem.text.strip() for elem in r.select('div#recipe_howto ul li')]
    m = re.match(r'\((\d+)人分', r.select_one('div[itemprop="ingredient"] span[itemprop="yield"]').text)
    yield_ = int(m.group(1)) if m is not None else None
    ing_elem = r.select_one('div[itemprop="ingredient"] dl')
    # special treatment for only_dt (where amount is omitted)
    for elem in ing_elem.select('dt.only_dt'):
        if ')' in elem.text:  # fix for ID: 1525
            name, amt = elem.text.split(')')
            elem.string = name + ')'
        else:
            name, amt = elem.text, ''
        dd = soup.new_tag('dd')
        dd.string = amt
        elem.insert_after(dd)
    ingredients = [{'name': name.text.strip(), 'amount': amt.text.strip()} for name, amt in zip(ing_elem.select('dt'), ing_elem.select('dd'))]
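    # split annotations out of each ingredient name: a '-<mark>' suffix is stored
    # under 'marking', and a single '（…）' group is stored under 'detail'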
    for ingredient in ingredients:
        name = ingredient['name']
        marking = None
        if '-' in name:
            name, marking = name.split('-')
            ingredient['name'] = name.strip()
            ingredient['marking'] = marking.strip()
        if name.count('(') == 1:
            name, detail = name.split('(')
            if not detail.endswith(')'):
                # unexpected format; drop into the debugger
                print(ingredient['name'])
                import pdb; pdb.set_trace()
            detail = detail[:-1]  # remove closing paren
            ingredient['name'] = name.strip()
            ingredient['detail'] = detail.strip()
        elif '(' in name:
            # more than one opening paren; drop into the debugger
            print(ingredient['name'])
            import pdb; pdb.set_trace()
    return {
        'id': id_,
        'title': title,
        'cook_duration': cook_duration,
        'prep_duration': prep_duration,
        'img_url': img_url,
        'comment': comment,
        'calorie': calorie,
        'genre': genre,
        'difficulty': difficulty,
        'instructions': instructions,
        'yield': yield_,
        'ingredients': ingredients
    }

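# each recipe on a listing page is linked as /recipe/detail/<id>/; collect the
# numeric IDs from those links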
def extract_ids(soup):
    res = []
    for elem in soup.select('div.recipe_item p.text a'):
        # elem := '<a href="/recipe/detail/1469/">野菜の肉巻き</a>'
        res.append(int(elem.attrs['href'].split('/')[-2]))
    return res

def obtain_detail(id_):
    soup = get(DETAIL_URL.format(id_))
    return extract_detail(soup, id_)

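# crawl every listing page and return all recipe IDs found, sorted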
def obtain_all_ids():
    res = []
    for page in range(*LIST_PAGES):
        soup = get(LIST_URL.format(page))
        res += extract_ids(soup)
    return sorted(res)

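# scrape every recipe and dump the whole collection to recipes.json
# (ensure_ascii=False keeps the Japanese text readable in the output)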
if __name__ == '__main__':
    recipes = []
    ids = obtain_all_ids()
    for id_ in ids:
        detail = obtain_detail(id_)
        pprint(detail)
        recipes.append(detail)
    with open('recipes.json', 'w', encoding='utf-8') as f:
        json.dump(recipes, f, ensure_ascii=False)