# -*- coding: utf-8 -*-
import datetime as dt
import dateutil.parser as dparser
import urllib2

from BeautifulSoup import BeautifulSoup

print "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
print "<articles>"

# Loop through each prediction and bet URL.
for i in range(1, 571):
    url = "http://www.longbets.org/" + str(i)
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)

    # We check the title tag to see if we should continue down the page.
    pageTitle = soup.html.head.title.renderContents().strip()
    if pageTitle == "Long Bets - Bet Not Found":
        pass
    else:
        print "  <article>"
        print "    <url>%s</url>" % (url)
        print "    <comments>"

        # Now we dive into each comment on the page and grab what we need.
        for comment in soup.findAll('div', 'post'):
            # The last 'post' on every page is not really a comment. Exclude it.
            if not comment.h3.string.strip() == "Comments are temporarily closed.":
                name = comment.div.a.renderContents().strip()
                user_url = "http://longbets.org" + comment.div.a['href']
                title = comment.h3.renderContents().strip()
                message = title + " " + comment.p.renderContents().strip()
                date_str = str(comment.div.contents[5].string)
                date = dparser.parse(date_str)
                date = date.strftime('%a, %d %b %Y %H:%M:%S')
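                # e.g. dparser.parse("August 12, 2010 12:56 AM") returns a datetime,
                # and the strftime above turns it into 'Thu, 12 Aug 2010 00:56:00'
                # (illustrative input; the exact date text on the page may differ).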
print " <comment>" | |
print " <name>%s</name>" % (name) | |
print " <email></email>" | |
print " <url>" + user_url + "</url>" | |
print " <ip_address></ip_address>" | |
print " <message>%s</message>" % (message) | |
print " <date>%s -0000</date>" % (date) | |
print " <points>1</points>" | |
print " </comment>" | |
print " </comments>" | |
print " </article>" | |
print "</articles>" |