moriyoshi · April 10, 2012 08:48
diff --git a/fetch_blog.py b/fetch_blog.py
 import urlparse
 import lxml.html as html
 import sys
 import json

 # Front page of the blog; handed to scrape() at the bottom of the script
 # as the page the crawl starts from.
 BLOG_URL = 'http://torufurukawa.blogspot.jp/'

 def fetch(url, max_retries=3):
    """Download and parse the HTML document at ``url``.

    A progress line is written to stderr once, then the document is
    parsed.  A failed attempt is retried; once ``max_retries`` retries
    have failed as well, the exception of the last attempt propagates.

    :param url: location handed to ``html.parse``.
    :param max_retries: retries allowed after the first failed attempt
        (default 3, i.e. at most four attempts in total).
    :returns: whatever ``html.parse`` returns for ``url``.
    :raises Exception: the error raised by the final failed attempt.
    """
    print >>sys.stderr, 'fetcing %s...' % url
    retry_count = 0
    while True:
        try:
            return html.parse(url)
        except Exception:
            # Exception, not a bare ``except``: KeyboardInterrupt and
            # SystemExit must abort the crawl at once instead of being
            # swallowed and retried.
            if retry_count >= max_retries:
                raise
        retry_count += 1


 def scrape(url):
    """Crawl the blog starting at ``url`` and collect the text of its posts.

    The start page and every archive page linked from its ``BlogArchive1``
    widget (followed recursively) are fetched once each; afterwards the
    posts found on all fetched pages are extracted.

    :param url: page the crawl starts from.
    :returns: dict mapping the ``title`` attribute of each post's
        ``datePublished`` node (``None`` when the post has no such node)
        to the text content of that post's body.
    """
    docs = {}
    posts = {}

    def fetch_archives(url):
        # ``docs`` doubles as the visited set, so a page is fetched only
        # once even when archive pages link to one another.
        if url in docs:
            return
        doc = fetch(url)
        docs[url] = doc
        links = doc.xpath("//*[@id='BlogArchive1']//ul[@class='hierarchy']//ul[@class='hierarchy']/li/a[not(starts-with(@href, 'javascript:'))]")
        for link in links:
            href = link.get('href')
            if not href:
                # An <a> without href also satisfies the XPath predicate;
                # there is nothing to follow.
                continue
            # Resolve relative links against the page they were found on;
            # absolute links pass through unchanged.
            fetch_archives(urlparse.urljoin(url, href))

    def extract_posts(doc):
        blog_widget = doc.xpath("//*[contains(@class, 'widget') and substring-after(@class, ' ')='Blog']")
        if not blog_widget:
            # Page without a Blog widget: nothing to extract.
            return
        posts_per_day = blog_widget[0].findall("*//*[@class='date-outer']//*[@class='post-outer']/*")
        for post in posts_per_day:
            body_nodes = post.xpath("*[contains(@class,'post-body')]")
            if not body_nodes:
                # Children of post-outer that carry no post body are
                # skipped; indexing the empty result would raise IndexError.
                continue
            date_published_node = post.find("*[@class='post-footer']//*[@class='post-timestamp']//abbr[@itemprop='datePublished']")
            date_published = date_published_node.get('title') if date_published_node is not None else None
            posts[date_published] = body_nodes[0].text_content()

    fetch_archives(url)

    for doc in docs.itervalues():
        extract_posts(doc)

    return posts

 # Scrape the whole blog, serialise the posts as JSON without escaping
 # non-ASCII characters, and emit the result on stdout as UTF-8 bytes.
 _posts_json = json.dumps(scrape(BLOG_URL), indent=True, ensure_ascii=False)
 sys.stdout.write(_posts_json.encode('utf-8'))
	import urlparse
	import lxml.html as html
	import sys
	import json

	# Front page of the blog; handed to scrape() at the bottom of the script
	# as the page the crawl starts from.
	BLOG_URL = 'http://torufurukawa.blogspot.jp/'

	def fetch(url, max_retries=3):
	    """Download and parse the HTML document at ``url``.

	    A progress line is written to stderr once, then the document is
	    parsed.  A failed attempt is retried; once ``max_retries`` retries
	    have failed as well, the exception of the last attempt propagates.

	    :param url: location handed to ``html.parse``.
	    :param max_retries: retries allowed after the first failed attempt
	        (default 3, i.e. at most four attempts in total).
	    :returns: whatever ``html.parse`` returns for ``url``.
	    :raises Exception: the error raised by the final failed attempt.
	    """
	    print >>sys.stderr, 'fetcing %s...' % url
	    retry_count = 0
	    while True:
	        try:
	            return html.parse(url)
	        except Exception:
	            # Exception, not a bare ``except``: KeyboardInterrupt and
	            # SystemExit must abort the crawl at once instead of being
	            # swallowed and retried.
	            if retry_count >= max_retries:
	                raise
	        retry_count += 1


	def scrape(url):
	    """Crawl the blog starting at ``url`` and collect the text of its posts.

	    The start page and every archive page linked from its ``BlogArchive1``
	    widget (followed recursively) are fetched once each; afterwards the
	    posts found on all fetched pages are extracted.

	    :param url: page the crawl starts from.
	    :returns: dict mapping the ``title`` attribute of each post's
	        ``datePublished`` node (``None`` when the post has no such node)
	        to the text content of that post's body.
	    """
	    docs = {}
	    posts = {}

	    def fetch_archives(url):
	        # ``docs`` doubles as the visited set, so a page is fetched only
	        # once even when archive pages link to one another.
	        if url in docs:
	            return
	        doc = fetch(url)
	        docs[url] = doc
	        links = doc.xpath("//*[@id='BlogArchive1']//ul[@class='hierarchy']//ul[@class='hierarchy']/li/a[not(starts-with(@href, 'javascript:'))]")
	        for link in links:
	            href = link.get('href')
	            if not href:
	                # An <a> without href also satisfies the XPath predicate;
	                # there is nothing to follow.
	                continue
	            # Resolve relative links against the page they were found on;
	            # absolute links pass through unchanged.
	            fetch_archives(urlparse.urljoin(url, href))

	    def extract_posts(doc):
	        blog_widget = doc.xpath("//*[contains(@class, 'widget') and substring-after(@class, ' ')='Blog']")
	        if not blog_widget:
	            # Page without a Blog widget: nothing to extract.
	            return
	        # The "*" element wildcards are required: a path step that is only
	        # a predicate ("//[@class=...]") is not a valid path expression.
	        posts_per_day = blog_widget[0].findall("*//*[@class='date-outer']//*[@class='post-outer']/*")
	        for post in posts_per_day:
	            body_nodes = post.xpath("*[contains(@class,'post-body')]")
	            if not body_nodes:
	                # Children of post-outer that carry no post body are
	                # skipped; indexing the empty result would raise IndexError.
	                continue
	            date_published_node = post.find("*[@class='post-footer']//*[@class='post-timestamp']//abbr[@itemprop='datePublished']")
	            date_published = date_published_node.get('title') if date_published_node is not None else None
	            posts[date_published] = body_nodes[0].text_content()

	    fetch_archives(url)

	    for doc in docs.itervalues():
	        extract_posts(doc)

	    return posts

	# Scrape the whole blog, serialise the posts as JSON without escaping
	# non-ASCII characters, and emit the result on stdout as UTF-8 bytes.
	_posts_json = json.dumps(scrape(BLOG_URL), indent=True, ensure_ascii=False)
	sys.stdout.write(_posts_json.encode('utf-8'))