Created October 12, 2012 01:55
AllRecipes to Markdown
# Python script to scrape a recipe from AllRecipes.com into a Markdown-ish file.
# Takes an AllRecipes.com URL as an argument.
# I connected this to Alfred App so I simply have to paste a URL into Alfred.
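#
# Example usage (assuming the script is saved as allrecipes_to_markdown.py; the URL is illustrative):
#   python allrecipes_to_markdown.py http://allrecipes.com/Recipe/example-recipe/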

import urllib2
import sys, os
from BeautifulSoup import BeautifulSoup
from HTMLParser import HTMLParser

# Directory the recipe text files are written to.
recipeDirection = os.path.expanduser("~/Dropbox/Text Notes/")

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of an HTML fragment with all tags removed."""
    s = MLStripper()
    s.feed(html)
    return s.get_data()
# Stripping code from http://stackoverflow.com/a/925630/982745
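# For example (illustrative): strip_tags("<span>1 cup flour</span>") returns "1 cup flour".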

if len(sys.argv) <= 1:
    print "Please include a recipe URL as an argument."
    exit()

url = sys.argv[1]
soup = BeautifulSoup(urllib2.urlopen(url).read())

# Redirect stdout so every print statement below writes into the recipe file,
# which is named after the recipe's <h1> title.
saveout = sys.stdout
title = recipeDirection + strip_tags(str(soup.find("h1", {"id": "itemTitle"}))) + ".txt"
fsock = open(title, 'w')
sys.stdout = fsock

# The recipe file starts with its source URL.
print url

# Number of servings
soup2 = soup.find("div", {"class": "servings"}).find("span", {"id": "lblYield"})
print strip_tags(str(soup2))

# Prep/cook/ready-in times; the page labels them PREP, COOK, and READY IN.
times = [strip_tags(str(X).replace("\r", "")) for X in soup.find("div", {"id": "divRecipeTimesContainer"}).findAll("li")]
for x in times:
    print ' '.join(x.split()).replace("PREP", "Prep Time:").replace("COOK", "Cook Time:").replace("READY IN", "Ready in:")
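# e.g. a raw "PREP 20 mins" entry comes out as "Prep Time: 20 mins" (illustrative value).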
print "\nIngredients:\n" | |
ingredients = soup.find("ul", {"class": "ingredient-wrap"}).findAll("li") | |
for i in ingredients: | |
print "* " + ' '.join(strip_tags(str(i)).split()) | |
print "\nDirections:\n" | |
soup2 = soup.find("div", {"class": "directions"}).find("ol") | |
directions = soup2('li') | |
for idx, direction in enumerate(directions): | |
print str(idx+1) + ". " + strip_tags(direction.__str__()).lstrip().rstrip() |
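
# Put stdout back and close the recipe file.
sys.stdout = saveout
fsock.close()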