Created
May 5, 2011 00:04
-
-
Save jmoiron/956284 to your computer and use it in GitHub Desktop.
silly stats on amit's blog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print silly word-count stats for the posts on Amit's blog."""
import contextlib
import re
import urllib2

from lxml import html
# Page that main() downloads and reports on.
url = (
    'http://omlettesoft.com/newjournal.php3'
    '?topic=On+the+Waterfront&who=Lord+Omlette'
)
class Blog(object):
    """Aggregate statistics over every entry found in a parsed page.

    ``dom`` must provide ``cssselect``; each ``div.entry`` element it
    yields is wrapped in a ``Post`` and the per-post counters are summed
    into the ``total_*`` attributes.
    """

    def __init__(self, dom):
        self.dom = dom
        self.entries = dom.cssselect('div.entry')
        self.posts = [Post(entry) for entry in self.entries]

        # (attribute set on the blog, attribute read from each post)
        summed = (
            ('total_count', 'total'),
            ('total_om_count', 'om_count'),
            ('total_quote_count', 'quote_count'),
            ('total_jerm_count', 'jerm_count'),
            ('total_jing_count', 'jing_count'),
        )
        for blog_attr, post_attr in summed:
            running = 0
            for post in self.posts:
                running += getattr(post, post_attr)
            setattr(self, blog_attr, running)
class Post(object):
    """Word and mention statistics for a single blog entry.

    ``dom`` is the entry element; it must provide ``cssselect``.  After
    construction the instance exposes:

    * ``total``       -- words in the entry's first ``div.post``
    * ``quote_count`` -- words inside ``blockquote`` elements of that post
    * ``om_count``    -- ``total`` minus ``quote_count``
    * ``jerm_count``  -- links whose text is "jerm9x" (case-insensitive)
    * ``jing_count``  -- links whose text is "jing" (case-insensitive)
    """

    # A whitespace-separated token counts as a word when it contains at
    # least one \w character (letter, digit or underscore).  Compiled once
    # for the class instead of on every word_count() call.
    _WORD = re.compile(r'\w+')

    def __init__(self, dom):
        self.dom = dom
        self.process()

    def word_count(self, elements):
        """Take a naive word count of a list of elements.

        Each element's ``text_content()`` is split on whitespace and the
        tokens containing a word character are counted.  Returns 0 for an
        empty list.
        """
        # Counting with a generator avoids len(filter(...)), which only
        # works where filter() returns a list.
        return sum(
            1
            for element in elements
            for token in element.text_content().split()
            if self._WORD.search(token)
        )

    def process(self):
        """Compute all counters for this entry from ``self.dom``."""
        self.jerm_count, self.jing_count = 0, 0

        bodies = self.dom.cssselect('div.post')
        if not bodies:
            # An entry without a post body contributes nothing, rather
            # than aborting the whole run with an IndexError.
            self.total = self.quote_count = self.om_count = 0
            return

        post = bodies[0]
        self.total = self.word_count([post])
        # NOTE(review): if blockquotes can be nested, the inner ones are
        # presumably matched as well and their words counted twice --
        # confirm against the real markup.
        self.quote_count = self.word_count(post.cssselect('blockquote'))
        self.om_count = self.total - self.quote_count

        for anchor in post.cssselect('a'):
            text = anchor.text_content().strip().lower()
            if text == 'jerm9x':
                self.jerm_count += 1
            elif text == 'jing':
                self.jing_count += 1
def load_dom(url):
    """Load a url and return a dom element for that url.

    The page is downloaded with ``urllib2`` and the raw body is handed to
    ``lxml.html.document_fromstring``.  Errors raised by ``urlopen`` or by
    the parser propagate to the caller.
    """
    # The bytes are fetched here rather than letting lxml open the URL
    # itself (the original author did not trust lxml for that).
    # closing() releases the connection even if read() raises.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        body = response.read()
    return html.document_fromstring(body)
def main():
    """Download the page at ``url`` and print its statistics to stdout.

    Prints the number of posts and words, the split between quoted and
    original words, and the number of linked mentions of jerm9x and jing.
    """
    # No options are defined, so the result is not needed; the call is
    # kept so the option parser still processes the command line.
    parse_args()

    blog = Blog(load_dom(url))

    # A page with no posts (or no words) has total_count == 0; report 0%
    # quotations instead of failing with ZeroDivisionError.
    if blog.total_count:
        quote_pct = 100 * float(blog.total_quote_count) / float(blog.total_count)
    else:
        quote_pct = 0.0

    # Each print takes one parenthesised expression, so the lines behave
    # the same whether print is a statement or a function.
    print(" ** stats for %s ** " % url)
    print("  %d posts with %d total words" % (len(blog.posts), blog.total_count))
    print("  %d words in blockquote, %d original words (%0.2f%% quotations)" % (
        blog.total_quote_count, blog.total_om_count, quote_pct
    ))
    print("  %d linked mentions of jerm9x, %d of jing" % (
        blog.total_jerm_count, blog.total_jing_count
    ))
def parse_args():
    """Parse the command line and return optparse's ``(options, args)``.

    No custom options are defined; the parser is created with a version
    string of 1.0 and a usage line consisting of just the program name.
    """
    import optparse

    cli = optparse.OptionParser(usage='%prog', version='1.0')
    parsed = cli.parse_args()
    return parsed
# Produce the report only when run as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment