Created
October 2, 2012 05:51
-
-
Save thedjpetersen/3816489 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
import urllib2 | |
# URL template for a Blogspot front page; ``{0}`` is filled with the blog's
# subdomain via ``str.format`` (see ``Blog.__init__``).
base_url = "http://{0}.blogspot.com"
def grab_url(url):
    """Download *url* and return its body parsed into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The document root produced by ``lxml.html.fromstring``.
    """
    # Function-local import so this block does not depend on an edit to the
    # file's import header.
    import contextlib

    # urllib2 response objects are not context managers on Python 2, so
    # closing() is used to release the connection even when read() raises.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        body = response.read()
    return html.fromstring(body)
def get_html(tree):
    """Serialize an lxml element back into HTML markup.

    Args:
        tree: Element to serialize.

    Returns:
        Whatever ``lxml.html.tostring`` produces for *tree*.
    """
    serialized = html.tostring(tree)
    return serialized
class Blog(object):
    """Scraper that collects the posts of a single Blogspot blog.

    Attributes:
        blog_name: Blog subdomain, exactly as passed to the constructor.
        blog_url: Front-page URL built from ``base_url`` and ``blog_name``.
        posts: List of post dicts accumulated by ``get_posts``.
    """

    def __init__(self, blog_name):
        """Remember the blog name and derive its front-page URL.

        Args:
            blog_name: Subdomain substituted into ``base_url``.
        """
        self.blog_name = blog_name
        self.blog_url = base_url.format(blog_name)
        # Per-instance list: a class-level ``posts = []`` would be one list
        # shared by every Blog object, mixing posts from different blogs.
        self.posts = []

    def parse_post(self, post):
        """Build a dict describing one post from its listing-page element.

        Args:
            post: Element supporting ``cssselect``; ``get_posts`` passes each
                ``div.date-outer`` match of a listing page.

        Returns:
            A dict with the keys ``human_date``, ``date``, ``url``, ``title``
            and ``html`` (the last one is filled by ``get_post_content``).

        Raises:
            IndexError: If the date header, the published timestamp or the
                permalink element is missing from *post*.
        """
        post_data = {
            'human_date': post.cssselect('h2.date-header')[0].text_content(),
            'date': post.cssselect('abbr.published')[0].get('title'),
            'url': post.cssselect('a.timestamp-link')[0].get('href'),
        }

        titles = post.cssselect('h3.post-title')
        if titles:
            post_data['title'] = titles[0].text_content().replace("\n", "")
        else:
            # Posts without a title element get this placeholder.
            post_data['title'] = 'Thoughts'

        self.get_post_content(post_data)
        # Progress/debug output kept from the original; the parenthesised
        # single-argument form prints the same text on Python 2 and 3.
        print(post_data)
        return post_data

    def get_post_content(self, post):
        """Fetch the post's own page and store its body markup in the dict.

        Args:
            post: Post dict containing a ``url`` key; it is modified in place
                by adding an ``html`` key.

        Raises:
            IndexError: If the fetched page has no ``div.post-body`` element.
        """
        document = grab_url(post['url'])
        post['html'] = get_html(document.cssselect('div.post-body')[0])

    def get_posts(self, link=None):
        """Collect posts from a listing page and every older page after it.

        Args:
            link: Listing page to start from; the blog front page is used
                when it is omitted or empty.

        Returns:
            ``self.posts``. Parsed posts are appended to it, so the list
            keeps growing across repeated calls.
        """
        url = link if link else self.blog_url
        # Follow the "older posts" pager with a loop instead of recursion so
        # that blogs with many pages cannot exhaust the call stack.
        while url:
            document = grab_url(url)
            for post in document.cssselect('div.date-outer'):
                self.posts.append(self.parse_post(post))
            older_link = document.cssselect('a.blog-pager-older-link')
            # No pager link, or one without an href, ends the crawl.
            url = older_link[0].get('href') if older_link else None
        return self.posts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A small wrapper for grabbing Blogger posts; it still needs to be made asynchronous.