Created
August 2, 2018 22:42
-
-
Save atucom/5148e9a88a13ee732cb64422c74a9d22 to your computer and use it in GitHub Desktop.
Parse HTML files for their ingredient lists (customized for homechef's page markup)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from collections import Counter, defaultdict

import lxml
from lxml import html
def getIngredients(htmlFile):
    """Extract the raw ingredient texts from an HTML document.

    ``htmlFile`` is the HTML markup itself (the file's contents), not a
    path. The result is a list holding the text content of every direct
    child of the ``<ul>`` whose ``class`` attribute exactly equals the
    ingredient-list class string used in the XPath query below.

    If lxml raises ``ParserError`` while parsing, the string ``"NOPE"`` is
    returned instead of a list.
    """
    try:
        document = html.fromstring(htmlFile)
    except lxml.etree.ParserError:
        return "NOPE"

    # The class attribute is matched as one exact string, so all class
    # names must be present in this order.
    nodes = document.xpath('//ul[@class="list--unstyled group position--relative text--center--bpDown2"]/*')
    texts = []
    for node in nodes:
        texts.append(node.text_content())
    return texts
def cleanIngredientList(ingredientList):
    """Pick the ingredient-name lines out of raw ingredient text blocks.

    Each entry of ``ingredientList`` is split on newlines, then:

    * if the entry contains ``"Info"``, line index 8 is collected;
    * if the entry is longer than 2 characters, line index 3 is collected.

    Both rules are evaluated independently, so an entry containing "Info"
    can contribute two values.
    NOTE(review): the second test may have been meant as ``elif`` -- confirm
    against real page markup before changing it.

    Entries that do not have enough lines for a rule are skipped for that
    rule instead of raising ``IndexError``.

    Returns the collected lines as a list, in input order.
    """
    cleaned = []
    for entry in ingredientList:
        lines = entry.split('\n')
        # Bounds-check before indexing: short or oddly formatted entries
        # must not abort the whole run.
        if "Info" in entry and len(lines) > 8:
            cleaned.append(lines[8])
        if len(entry) > 2 and len(lines) > 3:
            cleaned.append(lines[3])
    return cleaned
def sortAndCount(ingredientList, top=100):
    """Count occurrences and return the most frequent entries.

    Returns a list of ``(ingredient, count)`` tuples sorted by descending
    count and truncated to the first ``top`` entries (default 100).
    Entries with equal counts keep the order in which they first appeared
    in ``ingredientList``, because the sort is stable.
    """
    counts = Counter(ingredientList)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top]
def main():
    """Print ingredient frequencies for the files in the current directory.

    Every regular file in the working directory is read as text, run
    through ``getIngredients`` and ``cleanIngredientList``, and the
    combined results are tallied by ``sortAndCount``. One line is printed
    per ingredient in the form ``<count> <ingredient>``, most frequent
    first.
    """
    ingredients = []
    for name in os.listdir():
        # listdir() also yields sub-directories, which cannot be opened
        # as text files; skip anything that is not a regular file.
        if not os.path.isfile(name):
            continue
        with open(name) as f:
            htmlFile = f.read()
        # Extend the flat list directly rather than collecting nested
        # lists and flattening them afterwards.
        ingredients.extend(cleanIngredientList(getIngredients(htmlFile)))

    for ingredient, count in sortAndCount(ingredients):
        print(count, ingredient)
# Run the report only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment