Last active
December 10, 2015 13:48
-
-
Save tvwerkhoven/4443142 to your computer and use it in GitHub Desktop.
Produce energy label for Albert Heijn recipes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"Framboos": [["C", "Spanje"], ["E", "Mexico"]], "Nectarine": [["B", "Chili"]], "Limoen": [["A", "Brazili\u00eb"]], "Avocado": [["B", "Isra\u00ebl"], ["C", "Chili"], ["C", "Spanje"]], "Wortelen (breekpeen/waspeen)": [["A", "Nederland"]], "Carambola": [["D", "Isra\u00ebl"], ["E", "Maleisi\u00eb"]], "Radijs (los)": [["B", "Nederland"]], "Sperzieboon": [["B", "Marokko"], ["B", "Spanje"], ["C", "Senegal"], ["D", "Egypte"], ["E", "Kenia"]], "Kokosnoot": [["C", "Ivoorkust"], ["C", "Sri Lanka"], ["D", "Algerije"]], "Broccoli": [["B", "Frankrijk"], ["B", "Spanje"]], "Witte kool": [["A", "Nederland"]], "Lychee": [["C", "Madagaskar"], ["C", "Zuid-Afrika"]], "Rabarber": [["C", "Nederland"]], "Asperge (groen)": [["E", "Peru"]], "Courgette": [["A", "Spanje"]], "Rode biet": [["A", "Nederland"]], "Andijvie (gewoon)": [["A", "Spanje"], ["E", "Nederland"]], "Spitskool": [["A", "Nederland"], ["A", "Spanje"], ["B", "Portugal"]], "Ananas": [["B", "Costa Rica"], ["B", "Ecuador"]], "Kersen": [["E", "Chili"]], "Artisjok": [["B", "Spanje"]], "Dadel (vers)": [["C", "Isra\u00ebl"], ["D", "Tunesi\u00eb"]], "Rucola": [["A", "Itali\u00eb"]], "Rode bes": [["D", "Nederland"]], "Tuinboon": [["B", "Groot-Brittanni\u00eb"]], "Grapefruit": [["A", "Verenigde Staten"], ["B", "China"]], "Kiwi": [["B", "Itali\u00eb"]], "Veldsla": [["A", "Itali\u00eb"], ["A", "Nederland"]], "Oesterzwam": [["C", "Nederland"]], "Asperge (wit)": [["E", "Peru"]], "Aardappelen": [["A", "Belgi\u00eb"], ["A", "Duitsland"], ["A", "Frankrijk"], ["A", "Nederland"]], "Radijs": [["B", "Isra\u00ebl"]], "Venkelknol": [["A", "Itali\u00eb"], ["A", "Spanje"]], "Knolselderij": [["A", "Nederland"]], "Winterpostelein": [["A", "Nederland"]], "Bleekselderij": [["A", "Spanje"]], "Pastinaak": [["A", "Nederland"]], "Ui": [["A", "Belgi\u00eb"], ["A", "Nederland"], ["A", "Polen"]], "Trostomaat": [["B", "Isra\u00ebl"], ["B", "Spanje"]], "Sharonfruit (Kaki)": [["C", "Isra\u00ebl"]], "Cherrytomaat": [["B", "Isra\u00ebl"], ["B", "Marokko"], ["B", 
"Senegal"], ["B", "Spanje"]], "Mango": [["C", "Peru"]], "Dadel (gekonfijt/gedroogd)": [["C", "Californi\u00eb"], ["C", "Iran"], ["C", "Isra\u00ebl"], ["D", "Algerije"], ["D", "Tunesi\u00eb"]], "Rammenas": [["A", "Nederland"]], "Banaan": [["A", "Colombia"], ["A", "Costa Rica"], ["A", "Ecuador"]], "Aardbeien": [["C", "Spanje"], ["D", "Egypte"], ["D", "Isra\u00ebl"]], "Cherrytrostomaat": [["B", "Isra\u00ebl"], ["B", "Itali\u00eb"], ["B", "Spanje"]], "Bloemkool": [["A", "Nederland"], ["B", "Frankrijk"], ["B", "Spanje"]], "Rode kool": [["A", "Nederland"]], "IJsbergsla": [["A", "Spanje"]], "Aubergine": [["B", "Spanje"]], "Romatomaat": [["B", "Spanje"]], "Appel": [["A", "Frankrijk"], ["A", "Nederland"]], "Abrikoos": [["B", "Zuid-Afrika"]], "Pruim": [["B", "Zuid-Afrika"]], "Aardpeer": [["A", "Nederland"], ["B", "China"], ["B", "Costa Rica"], ["B", "Ghana"]], "Boerenkool": [["A", "Nederland"], ["A", "Spanje"]], "Passievrucht": [["E", "Colombia"], ["E", "Kenia"], ["E", "Maleisi\u00eb"], ["E", "Zimbabwe"]], "Blauwe bes": [["E", "Chili"], ["E", "Nederland"], ["E", "Nieuw-Zeeland"]], "Paprika": [["A", "Spanje"], ["B", "Isra\u00ebl"]], "Snijboon": [["B", "Marokko"], ["B", "Spanje"], ["C", "Senegal"], ["D", "Egypte"], ["E", "Kenia"]], "Mandarijn": [["A", "Marokko"], ["B", "Spanje"]], "Zoete Aardappel": [["A", "Verenigde Staten"], ["B", "China"]], "Pompoen": [["A", "Frankrijk"], ["A", "Spanje"], ["B", "Nederland"], ["B", "Portugal"]], "Vleestomaat": [["B", "Spanje"]], "Peultjes": [["B", "Marokko"], ["D", "Egypte"], ["E", "Guatamala"], ["E", "Kenia"]], "Doperwt (vers)": [["E", "Guatamala"], ["E", "Kenia"]], "Koolraap": [["A", "Nederland"]], "Prei": [["A", "Belgi\u00eb"], ["A", "Nederland"]], "Mais": [["B", "Verenigde Staten"]], "Ronde tomaat": [["B", "Marokko"], ["B", "Spanje"]], "Spruiten": [["A", "Belgi\u00eb"], ["A", "Duitsland"], ["A", "Nederland"]], "Miniromatomaat": [["B", "Spanje"], ["C", "Nederland"]], "Perzik": [["B", "Chili"], ["B", "Zuid-Afrika"]], "Peer (stoofpeer)": 
[["A", "Nederland"]], "Schorseneren": [["A", "Nederland"]], "Granaatappel": [["B", "Itali\u00eb"], ["B", "Spanje"]], "Peer (handpeer)": [["A", "Belgi\u00eb"], ["A", "Nederland"], ["A", "Zuid-Afrika"]], "Meloen": [["B", "Brazili\u00eb"], ["B", "Honduras"]], "Bosui": [["A", "Frankrijk"]], "Druif": [["B", "Zuid-Afrika"]], "Witlof": [["A", "Belgi\u00eb"], ["A", "Itali\u00eb"], ["A", "Nederland"], ["A", "Spanje"]], "Zeekraal": [["D", "Isra\u00ebl"], ["E", "Mexico"]], "Koolrabi": [["A", "Itali\u00eb"]], "Chinese kool": [["A", "Nederland"], ["A", "Oostenrijk"]], "Savooie kool": [["A", "Nederland"]], "Wortelen (bospeen)": [["A", "Spanje"]], "Papaja": [["E", "Brazili\u00eb"], ["E", "Ecuador"]], "Wortelen (winterpeen)": [["A", "Nederland"]], "Rettich": [["A", "Itali\u00eb"], ["A", "Nederland"]], "Citroen": [["B", "Spanje"]], "Radicchio": [["B", "Itali\u00eb"]], "Radijs (bos)": [["A", "Nederland"]], "Andijvie (krul)": [["A", "Frankrijk"]], "Vijg": [["E", "Brazili\u00eb"]], "Raapjes": [["A", "Frankrijk"], ["A", "Itali\u00eb"]], "Braam": [["E", "Mexico"]], "Komkommer": [["A", "Spanje"]], "Kropsla": [["A", "Spanje"], ["D", "Nederland"]], "Sinaasappel": [["A", "Spanje"], ["B", "Marokko"]], "Spinazie": [["A", "Spanje"]], "Champignon": [["B", "Nederland"], ["D", "Belgi\u00eb"], ["D", "Polen"]], "Babymais": [["E", "Thailand"]]} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
# -*- coding: utf-8 -*- | |
""" | |
@file recept_label.py -- produce energy label for Albert Heijn recipes | |
@author Tim van Werkhoven | |
@date 20130103 | |
@copyright Copyright (c) 2013 Tim van Werkhoven <[email protected]> | |
This file is licensed under the Creative Commons Attribution-Share Alike | |
license versions 3.0 or higher, see | |
http://creativecommons.org/licenses/by-sa/3.0/ | |
""" | |
############################################################################# | |
### PREAMBLE | |
############################################################################# | |
import urllib2, urllib | |
from cookielib import CookieJar | |
from bs4 import BeautifulSoup | |
import re | |
import difflib | |
import bz2 | |
import json, os | |
def unit_per_kg(inunit):
    """
    Return the factor that converts a quantity given in **inunit** to
    kilograms.

    Volume units get the same factor as the matching mass unit ('l' like
    'kg', 'ml' like 'g'), i.e. one litre is counted as one kilogram.

    @param inunit Unit symbol, one of 'kg', 'g', 'l', 'ml'. Case and
           surrounding whitespace are ignored.
    @return Conversion factor as float, or None if the unit is unknown or
            **inunit** is not a string.
    """
    kg_per_unit = {
        'kg': 1.0,
        'g': 1e-3,
        'l': 1.0,
        'ml': 1e-3,
    }
    try:
        unit = inunit.strip().lower()
    except AttributeError:
        # Not a string (e.g. None): there is no unit to look up
        return None
    return kg_per_unit.get(unit)
### Get recipe, find ingredients
INURL="http://www.ah.nl/allerhande/recepten/882692/boerenkool-venkelstamppot-met-kip?latestAllerhande=on"
# NOTE(review): no parser is passed to BeautifulSoup, so bs4 uses whichever
# parser it finds installed -- consider pinning one once the intended parser
# is confirmed.
recipe_fd = urllib2.urlopen(INURL)
try:
    ah_recipe = BeautifulSoup(recipe_fd.read())
finally:
    # Release the HTTP connection even if reading or parsing fails
    recipe_fd.close()
# List of (full ingredient text, search term) tuples. get_text() is used
# instead of .string because .string is None for a tag with several children.
ing_list = [(ing.get_text().strip(), ing['data-search-term'])
            for ing in ah_recipe('span', {'class': 'ingredient'})]

### Compute normalized ingredient quantities
# For each ingredient, find a number followed by one of the units kg, g, ml
# or l:
#  - the number may have a decimal part, written with ',' or '.' ("1,5 kg")
#  - the lookbehind keeps the match from starting in the middle of a number
#    or right after a fraction slash ("1/2 l" is left unparsed instead of
#    being read as 2 l)
#  - the trailing \b makes sure the unit is a whole word, so that
#    "2 grote uien" is not read as 2 g
qty_re = re.compile(r'(?<![0-9.,/])([0-9]+(?:[.,][0-9]+)?) ?(kg|g|ml|l)\b')

# Loop over ingredients, find quantity, convert to kg.
# ing_list2 holds (full ingredient text, search term, quantity in kg or None)
ing_list2 = []
for ing, ingname in ing_list:
    qty = qty_re.search(ing)
    factor = unit_per_kg(qty.group(2)) if qty else None
    if factor is None:
        # No quantity found, or a unit without conversion factor
        print("not parsed: %s" % ing)
        ing_list2.append((ing, ingname, None))
        continue
    print("%s %s" % (ing, qty.groups()))
    # float() only understands '.', so normalize a decimal comma first
    normqty = float(qty.group(1).replace(',', '.')) * factor
    ing_list2.append((ing, ingname, normqty))
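### Get milieucentraal data from web -- DOES NOT WORK (cookies?)
# NOTE: the urlencode() attempt below cannot reproduce the captured POST data:
# a dict literal keeps only the last of the repeated 'Energieklasse' keys, and
# it sends 'next' where the captured request has 'finish'.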
# cj = CookieJar() | |
# opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPRedirectHandler()) | |
# milieu_url = 'http://groentefruit.milieucentraal.nl/Groente-en-Fruit-Kalender/groente-en-fruit-kalender/zoeken/formulier' | |
# milieu_parms = urllib.urlencode({'questionid': "5010cd30", 'Maand': 'januari', 'Energieklasse': 'A', 'Energieklasse': 'B', 'Energieklasse': 'C', 'Energieklasse': 'D', 'Energieklasse': 'E', 'next' : u'Toon+resultaat+%C2%BB'}) | |
# milieu_parms = "focusfield=&scrollx=&scrolly=&dummyFieldForIEEnterSubmitSupport=&questionid=5010cd30&Producttype=&Maand=januari&Product=&Energieklasse=A&Energieklasse=B&Energieklasse=C&Energieklasse=D&Energieklasse=E&finish=Toon+resultaat+%C2%BB" | |
# #POSTDATA=focusfield=&scrollx=&scrolly=&dummyFieldForIEEnterSubmitSupport=&questionid=5010cd30&Producttype=&Maand=januari&Product=&Energieklasse=A&Energieklasse=B&Energieklasse=C&Energieklasse=D&Energieklasse=E&finish=Toon+resultaat+%C2%BB | |
# milieu_open = opener.open(milieu_url, milieu_parms) | |
# milieu_data = BeautifulSoup(milieu_open.read()) | |
### Get milieucentraal data
# milieu_labels maps a product name to a list of [energy label, country]
# pairs. The parsed result is cached as JSON so the HTML only has to be
# parsed once.
JSONFILE = './data/milieudata_januari.json'
if os.path.isfile(JSONFILE):
    # Load data from previously parsed file
    print("Loading previously parsed JSON file")
    with open(JSONFILE, 'r') as fd:
        milieu_labels = json.load(fd)
else:
    # Get milieucentraal data from local file
    print("Parsing local data file")
    with bz2.BZ2File("./data/milieudata_januari.html.bz2") as fd:
        milieu_data = BeautifulSoup(fd.read())

    ### Format energy labels
    # Loop over products, extract name, energy labels and countries
    milieu_labels = {}
    for prod in milieu_data('td', {'class': 'product'}):
        prod_name = prod.contents[0]
        # Find energy labels for each country where they are produced.
        # Pairs are stored as lists (not tuples) so that freshly parsed data
        # has the same shape as data loaded back from the JSON cache.
        milieu_labels[prod_name] = [
            [c.a['class'][-1], c.a.contents[-1]]
            for c in prod.nextSibling('div', {'class': 'tooltiparent'})]
        # The above code is equivalent to:
        #   prod_country = prod.nextSibling('div', {'class': 'tooltiparent'})
        #   this_c = prod_country[0].a.contents[-1]
        #   this_e1 = prod_country[0].a['class'][-1]
        # Energy labels are also here:
        #   prod_energy = prod.nextSibling('td', {'class' : 'energy'})
        #   this_e0 = prod_energy[0].div['class'][-1]
        # but this is less robust, there are multiple countries per label

    # Store labels as JSON
    with open(JSONFILE, 'w') as fd:
        json.dump(milieu_labels, fd)
### Match ingredient names to energy labels product names
# difflib compares case-sensitively, which hurts short names most: 'ui'
# against 'Ui' only scores 0.5, below the default cutoff of 0.6 used by
# get_close_matches(). Match on lower-cased names instead and map the hit
# back to the original product name.
prod_by_lower = {name.lower(): name for name in milieu_labels}
lower_names = list(prod_by_lower)

for ingfull, ing, norm_qty in ing_list2:
    # Candidates come back best match first; an empty list means no match
    bestprod = difflib.get_close_matches(ing.lower(), lower_names)
    if bestprod:
        prod_name = prod_by_lower[bestprod[0]]
        print("%s %s %s %s %s" % (ingfull, ing, norm_qty, prod_name,
                                  milieu_labels[prod_name]))
    else:
        print("%s %s %s no energy data" % (ingfull, ing, norm_qty))

# EOF
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment