atucom · August 2, 2018 22:42
diff --git a/parsehtml.py b/parsehtml.py
 from lxml import html
 import lxml
 import os
 from collections import defaultdict

 def getIngredients(htmlFile):
     """Extract the raw text of every ingredient entry from an HTML document.

     ``htmlFile`` is the markup itself, as handed to ``html.fromstring``.
     The result is a list with the text content of each child element of
     any ``<ul>`` whose ``class`` attribute equals the ingredient-list class
     string below.  When lxml raises ``ParserError`` for the input, the
     sentinel string "NOPE" is returned instead of a list.
     """
     ingredient_xpath = '//ul[@class="list--unstyled group position--relative text--center--bpDown2"]/*'
     try:
         document = html.fromstring(htmlFile)
     except lxml.etree.ParserError:
         return "NOPE"
     texts = []
     for node in document.xpath(ingredient_xpath):
         texts.append(node.text_content())
     return texts

 def cleanIngredientList(ingredientList):
     """Pick the ingredient-name line out of each raw ingredient text.

     Each entry is split on newlines.  An entry containing "Info"
     contributes its line 8, and every entry longer than two characters
     contributes its line 3 (both 0-based).  An entry with too few lines for
     the requested index is skipped for that index instead of raising
     IndexError.

     Returns a new list of the selected lines, in input order.
     """
     cleaned = []
     for entry in ingredientList:
         lines = entry.split('\n')
         # NOTE(review): the two checks are independent, so an entry that
         # contains "Info" yields two values (line 8 and then line 3).  If
         # only one was intended the second test should be an ``elif`` --
         # confirm against real pages before changing it.
         if "Info" in entry and len(lines) > 8:
             cleaned.append(lines[8])
         if len(entry) > 2 and len(lines) > 3:
             cleaned.append(lines[3])
     return cleaned

 def sortAndCount(ingredientList,top=100):
     """Count occurrences and return the most frequent items first.

     Returns a list of ``(item, count)`` tuples ordered by descending count
     and truncated to the first ``top`` entries (100 by default).  Items
     with equal counts keep the order in which they were first seen.
     """
     tally = {}
     for item in ingredientList:
         tally[item] = tally.get(item, 0) + 1
     ranked = list(tally.items())
     ranked.sort(key=lambda pair: pair[1], reverse=True)
     return ranked[:top]

 def main():
     """Print ingredient frequencies for the files in the working directory.

     Every regular file in the current working directory is read as HTML,
     its ingredients are extracted with getIngredients and cleaned with
     cleanIngredientList, and the ranking produced by sortAndCount is
     printed one entry per line as "<count> <ingredient>".
     """
     ingredients = []
     for name in os.listdir():
         # os.listdir() also returns sub-directories, and open() on a
         # directory raises, so only regular files are read.
         if not os.path.isfile(name):
             continue
         with open(name) as f:
             htmlFile = f.read()
         # extend() keeps one flat list without the quadratic copying that
         # flattening via sum(list_of_lists, []) incurs.
         ingredients.extend(cleanIngredientList(getIngredients(htmlFile)))
     for ingredient, count in sortAndCount(ingredients):
         print("{} {}".format(count, ingredient))

 # Call main() only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()
	from lxml import html
	import lxml
	import os
	from collections import defaultdict

	def getIngredients(htmlFile):
		"""Extract the raw text of every ingredient entry from an HTML document.

		``htmlFile`` is the markup itself, as handed to ``html.fromstring``.
		The result is a list with the text content of each child element of
		any ``<ul>`` whose ``class`` attribute equals the ingredient-list
		class string below.  When lxml raises ``ParserError`` for the input,
		the sentinel string "NOPE" is returned instead of a list.
		"""
		try:
			tree = html.fromstring(htmlFile)
		except lxml.etree.ParserError:
			return "NOPE"
		return [ingredient.text_content() for ingredient in tree.xpath('//ul[@class="list--unstyled group position--relative text--center--bpDown2"]/*')]

	def cleanIngredientList(ingredientList):
		"""Pick the ingredient-name line out of each raw ingredient text.

		Each entry is split on newlines.  An entry containing "Info"
		contributes its line 8, and every entry longer than two characters
		contributes its line 3 (both 0-based).  An entry with too few lines
		for the requested index is skipped for that index instead of raising
		IndexError.

		Returns a new list of the selected lines, in input order.
		"""
		cleaned = []
		for i in ingredientList:
			parts = i.split('\n')
			# NOTE(review): the two checks are independent, so an entry that
			# contains "Info" yields two values (line 8 and then line 3).
			# If only one was intended the second test should be an
			# ``elif`` -- confirm against real pages before changing it.
			if "Info" in i and len(parts) > 8:
				cleaned.append(parts[8])
			if len(i) > 2 and len(parts) > 3:
				cleaned.append(parts[3])
		return cleaned

	def sortAndCount(ingredientList,top=100):
		"""Count occurrences and return the most frequent items first.

		Returns a list of ``(item, count)`` tuples ordered by descending
		count and truncated to the first ``top`` entries (100 by default).
		Items with equal counts keep the order in which they were first seen.
		"""
		counts = defaultdict(int)
		for x in ingredientList:
			counts[x] += 1
		return sorted(counts.items(), reverse=True, key=lambda tup: tup[1])[:top]

	def main():
		"""Print ingredient frequencies for the files in the working directory.

		Every regular file in the current working directory is read as HTML,
		its ingredients are extracted with getIngredients and cleaned with
		cleanIngredientList, and the ranking produced by sortAndCount is
		printed one entry per line as "<count> <ingredient>".
		"""
		ingredients = []
		for file in os.listdir():
			# os.listdir() also returns sub-directories, and open() on a
			# directory raises, so only regular files are read.
			if not os.path.isfile(file):
				continue
			with open(file) as f:
				htmlFile = f.read()
			# extend() keeps one flat list without the quadratic copying
			# that flattening via sum(list_of_lists, []) incurs.
			ingredients.extend(cleanIngredientList(getIngredients(htmlFile)))
		counted = sortAndCount(ingredients)
		for i in counted:
			print(str(i[1]) + " " + str(i[0]))

	# Call main() only when this file is executed as a script, not on import.
	if __name__ == '__main__':
		main()