Extract recipes for cook4me express
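A small scraper for the cook4me express recipes published on club.t-fal.co.jp: it walks the category listing pages, collects the recipe IDs, fetches each detail page, extracts the title, preparation and cooking times, calories, genre, difficulty, ingredients, instructions and image URL, and writes the whole collection to recipes.json.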
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.request import urlopen
import re
import json
from pprint import pprint

from bs4 import BeautifulSoup

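# URL templates for the cook4me express category pages and the per-recipe detail
# pages; LIST_PAGES holds the range() arguments for the listing pages to crawl
# (pages 1 through 10)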
BASE_URL = 'http://www.club.t-fal.co.jp'
LIST_URL = BASE_URL + '/recipe/category/c4m-express/{:d}/'
DETAIL_URL = BASE_URL + '/recipe/detail/{:d}/'
LIST_PAGES = (1, 11)

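# fetch a page and parse it with BeautifulSoup using the lxml parser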
def get(url):
    return BeautifulSoup(urlopen(url).read().decode('utf-8'), 'lxml')

def extract_detail(soup, id_):
    # returns a dict with the following fields:
    #   id: int
    #   title: string
    #   cook_duration: int (in minutes)
    #   calorie: int (in kcal)
    #   genre: string
    #   difficulty: string
    #   prep_duration: int (in minutes)
    #   comment: string
    #   ingredients: list of (name: string * amount: string)
    #   yield: int
    #   instructions: list of string
    #   img_url: string
    print('parsing detail id={}'.format(id_))
    r = soup.select_one('div[itemtype="http://data-vocabulary.org/Recipe"]')
    title = r.select_one('h2[itemprop="name"]').text
    # assuming one image per recipe
    img_url = BASE_URL + r.select_one('div#recipe_photo img').attrs['src']
    # comment & prep_duration
    s = r.select_one('p[itemprop="summary"]').text
    # e.g. s == '根菜をたっぷりとれるヘルシーおかず。\n\n【準備時間:15分】'
    # (a one-line description followed by "【preparation time: 15 min】")
    m = re.search(r'【準備時間:(\d+)分】', s)
    prep_duration = int(m.group(1)) if m is not None else None
    comment = s.split('【')[0].strip()
    # cooking time ends with a trailing '分' (minutes); drop it before int()
    cook_duration = int(r.select_one('div#recipe_content ul li.r_time time').text.strip()[:-1])
    # calories follow a ':' and end with 'kcal'; strip both
    calorie = int(r.select_one('div#recipe_content ul li.cal').text.split(':')[-1][:-4])
    genre = r.select_one('div#recipe_content ul li.genre').text.split(':')[-1].strip()
    difficulty = r.select_one('div#recipe_content ul li.level').text.split(':')[-1].strip()
    instructions = [elem.text.strip() for elem in r.select('div#recipe_howto ul li')]
    m = re.match(r'\((\d+)人分', r.select_one('div[itemprop="ingredient"] span[itemprop="yield"]').text)
    yield_ = int(m.group(1)) if m is not None else None
    ing_elem = r.select_one('div[itemprop="ingredient"] dl')
    # special treatment for only_dt (where amount is omitted)
    for elem in ing_elem.select('dt.only_dt'):
        if ')' in elem.text:  # fix for ID: 1525
            name, amt = elem.text.split(')')
            elem.string = name + ')'
        else:
            name, amt = elem.text, ''
        dd = soup.new_tag('dd')
        dd.string = amt
        elem.insert_after(dd)
    ingredients = [{'name': name.text.strip(), 'amount': amt.text.strip()} for name, amt in zip(ing_elem.select('dt'), ing_elem.select('dd'))]
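    # split annotations out of each ingredient name: a '-<mark>' suffix is stored
    # under 'marking', and a single '（…）' group is stored under 'detail'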
    for ingredient in ingredients:
        name = ingredient['name']
        marking = None
        if '-' in name:
            name, marking = name.split('-')
            ingredient['name'] = name.strip()
            ingredient['marking'] = marking.strip()
        if name.count('(') == 1:
            name, detail = name.split('(')
            if not detail.endswith(')'):
                # unexpected format; drop into the debugger
                print(ingredient['name'])
                import pdb; pdb.set_trace()
            detail = detail[:-1]  # remove closing paren
            ingredient['name'] = name.strip()
            ingredient['detail'] = detail.strip()
        elif '(' in name:
            # more than one opening paren; drop into the debugger
            print(ingredient['name'])
            import pdb; pdb.set_trace()
    return {
        'id': id_,
        'title': title,
        'cook_duration': cook_duration,
        'prep_duration': prep_duration,
        'img_url': img_url,
        'comment': comment,
        'calorie': calorie,
        'genre': genre,
        'difficulty': difficulty,
        'instructions': instructions,
        'yield': yield_,
        'ingredients': ingredients
    }

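# each recipe on a listing page is linked as /recipe/detail/<id>/; collect the
# numeric IDs from those links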
def extract_ids(soup):
    res = []
    for elem in soup.select('div.recipe_item p.text a'):
        # elem := '<a href="/recipe/detail/1469/">野菜の肉巻き</a>'
        res.append(int(elem.attrs['href'].split('/')[-2]))
    return res

def obtain_detail(id_):
    soup = get(DETAIL_URL.format(id_))
    return extract_detail(soup, id_)

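# crawl every listing page and return all recipe IDs found, sorted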
def obtain_all_ids():
    res = []
    for page in range(*LIST_PAGES):
        soup = get(LIST_URL.format(page))
        res += extract_ids(soup)
    return sorted(res)

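# scrape every recipe and dump the whole collection to recipes.json
# (ensure_ascii=False keeps the Japanese text readable in the output)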
if __name__ == '__main__':
    recipes = []
    ids = obtain_all_ids()
    for id_ in ids:
        detail = obtain_detail(id_)
        pprint(detail)
        recipes.append(detail)
    with open('recipes.json', 'w', encoding='utf-8') as f:
        json.dump(recipes, f, ensure_ascii=False)