Created
April 10, 2012 08:48
-
-
Save moriyoshi/2349448 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urlparse | |
import lxml.html as html | |
import sys | |
import json | |
# Root page of the Blogger site that the script scrapes when it is run.
BLOG_URL = "http://torufurukawa.blogspot.jp/"
def fetch(url, max_retries=3):
    """Download and parse the HTML document at *url*, retrying on failure.

    A progress line is written to standard error before the first attempt.

    Args:
        url: Location handed to ``lxml.html.parse``.
        max_retries: Number of additional attempts made after the first one
            fails. With the default of 3 the document is requested at most
            four times.

    Returns:
        The parsed document returned by ``lxml.html.parse``.

    Raises:
        Exception: Whatever the final attempt raised, once all retries have
            been used up.
    """
    sys.stderr.write('fetching %s...\n' % url)
    retry_count = 0
    while True:
        try:
            return html.parse(url)
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must abort the script immediately instead of being retried.
        except Exception:
            if retry_count >= max_retries:
                raise
            retry_count += 1
def scrape(url):
    """Crawl a Blogger site starting at *url* and collect the text of its posts.

    The start page and every page linked from its ``BlogArchive1`` widget
    (followed transitively) are fetched once each; posts are then extracted
    from all fetched pages.

    Args:
        url: Address of the first page to fetch.

    Returns:
        dict: Maps the ``title`` attribute of each post's ``datePublished``
        ``<abbr>`` node (``None`` when the post has no such node) to the text
        content of the post body (``None`` when the post has no body node).
        A later post with the same key replaces an earlier one.
    """
    archive_links_xpath = "//*[@id='BlogArchive1']//ul[@class='hierarchy']//ul[@class='hierarchy']/li/a[not(starts-with(@href, 'javascript:'))]"
    blog_widget_xpath = "//*[contains(@class, 'widget') and substring-after(@class, ' ')='Blog']"
    post_nodes_path = "*//*[@class='date-outer']//*[@class='post-outer']/*"
    date_published_path = "*[@class='post-footer']//*[@class='post-timestamp']//abbr[@itemprop='datePublished']"
    post_body_xpath = "*[contains(@class,'post-body')]"

    docs = {}   # parsed documents, keyed by the URL they were fetched from
    posts = {}  # publication timestamp -> post body text

    def fetch_archives(start_url):
        # Explicit worklist instead of recursion, so the depth of the crawl is
        # not limited by the interpreter's recursion limit. Links are pushed
        # in reverse so pages are still visited in document order, depth first.
        pending = [start_url]
        while pending:
            current = pending.pop()
            if current in docs:
                continue
            doc = fetch(current)
            docs[current] = doc
            links = []
            for anchor in doc.xpath(archive_links_xpath):
                href = anchor.get('href')
                # An <a> without an href also passes the XPath filter; there
                # is nothing to follow in that case.
                if href:
                    # Resolve relative links against the page they came from.
                    links.append(urlparse.urljoin(current, href))
            pending.extend(reversed(links))

    def extract_posts(doc):
        blog_widget = doc.xpath(blog_widget_xpath)
        if not blog_widget:
            # No Blog widget on this page, hence no posts to extract.
            return
        for post in blog_widget[0].findall(post_nodes_path):
            date_published_node = post.find(date_published_path)
            body_nodes = post.xpath(post_body_xpath)
            if date_published_node is None and not body_nodes:
                # This child of 'post-outer' carries neither a timestamp nor
                # a body, so there is nothing to record for it.
                continue
            date_published = date_published_node.get('title') if date_published_node is not None else None
            body = body_nodes[0].text_content() if body_nodes else None
            posts[date_published] = body

    fetch_archives(url)
    for doc in docs.values():
        extract_posts(doc)
    return posts
# Scrape the configured blog and emit the result on standard output as
# indented, UTF-8 encoded JSON (non-ASCII characters are written verbatim
# rather than as \uXXXX escapes).
_scraped_posts = scrape(BLOG_URL)
_posts_json = json.dumps(_scraped_posts, indent=True, ensure_ascii=False)
sys.stdout.write(_posts_json.encode('utf-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment