@moriyoshi
Created April 10, 2012 08:48
# Scrape every post from a Blogspot blog by walking its archive widget,
# then dump {publication timestamp: plain-text body} as JSON to stdout.
import sys
import json
import lxml.html as html

BLOG_URL = 'http://torufurukawa.blogspot.jp/'

def fetch(url):
    """Fetch and parse the page at url, retrying up to three times on failure."""
    print >>sys.stderr, 'fetching %s...' % url
    retry_count = 0
    while True:
        try:
            return html.parse(url)
        except Exception:
            if retry_count >= 3:
                raise
            retry_count += 1

def scrape(url):
    docs = {}
    posts = {}

    def fetch_archives(url):
        # Recursively follow the links in the BlogArchive1 widget,
        # caching each fetched document in docs.
        if url in docs:
            return
        doc = fetch(url)
        docs[url] = doc
        archives = [n.get('href') for n in doc.xpath("//*[@id='BlogArchive1']//ul[@class='hierarchy']//ul[@class='hierarchy']/li/a[not(starts-with(@href, 'javascript:'))]")]
        for archive in archives:
            fetch_archives(archive)

    def extract_posts(doc):
        # Pull each post's publication timestamp and plain-text body out of the Blog widget.
        blog_widget = doc.xpath("//*[contains(@class, 'widget') and substring-after(@class, ' ')='Blog']")
        posts_per_day = blog_widget[0].findall("*//*[@class='date-outer']//*[@class='post-outer']/*")
        for post in posts_per_day:
            date_published_node = post.find("*[@class='post-footer']//*[@class='post-timestamp']//abbr[@itemprop='datePublished']")
            body_node = post.xpath("*[contains(@class,'post-body')]")[0]
            date_published = date_published_node.get('title') if date_published_node is not None else None
            body = body_node.text_content() if body_node is not None else None
            posts[date_published] = body

    fetch_archives(url)
    for doc in docs.itervalues():
        extract_posts(doc)
    return posts

sys.stdout.write(json.dumps(scrape(BLOG_URL), indent=True, ensure_ascii=False).encode('utf-8'))