Chris Essig csessig86

csessig86 / Timeline.py part 1

Created February 20, 2012 20:19

Timeline.py part 1

	import urllib2
	from BeautifulSoup import BeautifulSoup
	import datetime
	import re

	# Capture the current date and time once, at script start.
	now = datetime.datetime.now()

	# Create a CSV where we'll save our data. See further docs:
	# http://propublica.github.com/timeline-setter/#csv
	# Mode 'w' truncates any existing timeline.csv.
	# NOTE(review): no matching f.close() is visible in this excerpt - confirm the handle is closed later.
	f = open('timeline.csv', 'w')

csessig86 / Turbine_JS

Created February 28, 2012 16:18

csessig86 / gist:1959531

Created March 2, 2012 16:38

Timeline.py part 2

	# Collect every <div class="story-block"> element from the parsed page.
	# `soup` is not defined in this snippet; presumably it is the BeautifulSoup document built earlier - confirm.
	events = soup.findAll('div', attrs={'class': 'story-block'})
	# Visit each story block in turn; the loop body is not part of this snippet.
	for x in events:

csessig86 / gist:1959618

Created March 2, 2012 16:52

Timeline.py part 3

	# Pull the raw pieces of one story block (x) out of the markup.
	# Calling the tag returned by find() with ('em') is BeautifulSoup shorthand for findAll('em'), so `date` is a list of <em> tags.
	date = x.find('p', attrs={'class': 'story-more'})('em')
	# The story URL is read from the href attribute of the block's <fb:like> tag.
	link = x.find('fb:like')['href']
	# Text of the first <h3> in the block.
	headline = x.find('h3').text
	# limit=1 keeps only the first <p> inside the element with id "blox-story-text"; the result is still a list.
	description = x.find('div', attrs={'id': 'blox-story-text'})('p', limit=1)
	# First <img> tag in the block; find() returns None when there is none.
	image = x.find('img')

csessig86 / gist:1959692

Created March 2, 2012 17:06

Timeline.py part 4

	# Information on the page that we will scrape, taken from one story block (x).
	# find(...)('em') calls the found tag, which BeautifulSoup treats as findAll('em'); `date` is therefore a list of <em> tags.
	date = x.find('p', attrs={'class': 'story-more'})('em')
	# href attribute of the block's <fb:like> tag.
	link = x.find('fb:like')['href']
	# Text of the first <h3> in the block.
	headline = x.find('h3').text
	# A list holding at most one <p> (limit=1) from the element with id "blox-story-text".
	description = x.find('div', attrs={'id': 'blox-story-text'})('p', limit=1)
	# First <img> tag in the block, or None if the block has no image.
	image = x.find('img')

csessig86 / gist:1959704

Created March 2, 2012 17:10

Timeline.py part 5

	# Convert each scraped value to a plain string.
	# NOTE(review): if `date` and `description` are lists (as the later '[<em>' / '</em>]' cleanup suggests), str() keeps the surrounding square brackets.
	# NOTE(review): under Python 2, str() on a unicode value containing non-ASCII characters raises UnicodeEncodeError - confirm the inputs are ASCII-safe.
	date2 = str(date)
	link2 = str(link)
	headline2 = str(headline)
	# str(None) is the literal text 'None' when no image was found.
	image2 = str(image)
	description2 = str(description)

csessig86 / gist:2046673

Created March 15, 2012 20:25

Timeline.py part 6

	# Extra formatting needed for dates to get rid of em tags and unnecessary formatting.
	# `date3` is not defined in this snippet; it comes from an earlier step that is not shown here.
	# Each replace() removes every occurrence of its literal text: the opening '[<em>', the closing '</em>]',
	# the '- ' separator and the word 'at '.
	date4 = date3.replace('[<em>', "")
	date5 = date4.replace('</em>]', "")
	date6 = date5.replace('- ', "")
	date7 = date6.replace("at ", "")

	# Extra formatting is also needed for the description to get rid of p tags and new line returns.
	# `description3` is likewise defined outside this snippet.
	description4 = description3.replace('[<p>', "")
	description5 = description4.replace('</p>]', "")
	# Newlines become single spaces.
	description6 = description5.replace('\n', " ")

csessig86 / gist:2046704

Created March 15, 2012 20:30

Timeline.py part 7

	# We will adjust the width of all images to 300 pixels.
	# `image3` is the image markup as a string; it is produced by an earlier step that is not part of this snippet.
	# The pattern accepts any number of digits so that widths such as "80" or "1024" are rewritten too,
	# not only three-digit values.
	image4 = re.sub(r'width="\d+"', 'width="300"', image3)
	# Python spits out the word 'None' if it doesn't find an image. Delete that.
	# Note that replace() removes every occurrence of the text 'None' in the string.
	image5 = image4.replace('None', "")

csessig86 / gist:2046741

Created March 15, 2012 20:38

Timeline.py part 8

	# Recently updated stories carry an em tag with class "item-updated badge" that shows the time but not the date.
	# Swap that opening tag and its "Updated:" label for the current timestamp, formatted as YYYY-MM-DD HH:MM.
	# The date can still be changed by hand in the CSV if we need to.
	date8 = date7.replace(
		'[<em class="item-updated badge">Updated:',
		now.strftime("%Y-%m-%d %H:%M"),
	)

csessig86 / gist:2046782

Created March 15, 2012 20:45

Timeline.py part 9

	import urllib2
	from BeautifulSoup import BeautifulSoup
	import datetime
	import re

	# Capture the current date and time once, at script start.
	now = datetime.datetime.now()