Reddit historical lexical diversity

A Python 2 script that fetches one archived reddit front page per year (2006 through 2011) from the Wayback Machine, plus the current front page, computes the lexical diversity of the submission titles, and writes the results to a GNUplot-ready reddit.dat file.
import urllib2
import HTMLParser
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 (Python 2)
import time

def _fetch_page(url):
    """Fetch a URL and return it as a parsed BeautifulSoup tree, or None on failure."""
    try:
        page = urllib2.urlopen(url)
        return BeautifulSoup(page)
    except urllib2.URLError:
        print "Failed to fetch page from: " + url
    except HTMLParser.HTMLParseError:
        print "Failed to parse page from: " + url
    return None  # the original returned an unbound name here, raising NameError on failure

def fetch_reddit(dateStr=None):
    """Fetch the current reddit front page, or an archived one for a YYYYMMDD date string."""
    if dateStr is None:
        return _fetch_page('http://www.reddit.com')
    wayback_url = 'http://web.archive.org/web/' + dateStr + '/http://reddit.com/?'
    return _fetch_page(wayback_url)

def get_valid_wayback_dates(year):
    """Scrape the Wayback Machine calendar for the dates (YYYYMMDD) that have a reddit snapshot in the given year."""
    dates = []
    page = _fetch_page('http://wayback.archive.org/web/' + year + '*/http://www.reddit.com')
    for d in page.findAll('div'):
        if d.has_key('class') and d['class'].strip() == 'position':
            dateTag = d.findChildren('div')[1]
            # Convert an id like "Feb-03-2012" into "20120203"
            dates.append(time.strftime("%Y%m%d", time.strptime(dateTag['id'], "%b-%d-%Y")))
    return dates

def get_titles(page):
    """Collect the text of every submission-title link on a front page."""
    titles = []
    for a in page.findAll('a'):
        if a.has_key('class') and a['class'].strip() == 'title':
            titles.append(a.contents[0])
    return titles

def get_words(titles):
    """Flatten a list of titles into a single list of whitespace-separated words."""
    words = []
    for t in titles:
        words.extend(t.split())
    return words

def lex_div(words):
    """Lexical diversity: the ratio of distinct words to total words.
    e.g. lex_div(['the', 'cat', 'and', 'the', 'hat']) == 0.8
    """
    return 1.0 * len(set(words)) / len(words)

def generate_datfile():
    """
    To use the file this function creates, start GNUplot and run the following command:
    plot "reddit.dat" using 1:2 title "Lexical Diversity" with lines
    """
    years = ['2006', '2007', '2008', '2009', '2010', '2011']
    frontpages = []
    output = []
    for year in years:
        dates = get_valid_wayback_dates(year)
        frontpages.append((year, fetch_reddit(dates[0])))  # first snapshot of the year
    frontpages.append(('2012', fetch_reddit()))  # current front page
    for page in frontpages:
        words = get_words(get_titles(page[1]))
        output.append((page[0], lex_div(words)))
    with open("reddit.dat", "w") as fp:
        for item in output:
            fp.write(item[0] + "\t" + str(item[1]) + "\n")
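
A minimal way to drive the script end to end, assuming it is saved as reddit_lexdiv.py (a hypothetical filename, not part of the gist) and run under Python 2 with BeautifulSoup 3 installed:

    # Python 2 session; the module name reddit_lexdiv is an assumption.
    >>> import reddit_lexdiv
    >>> reddit_lexdiv.generate_datfile()  # writes reddit.dat to the working directory

Then, from a GNUplot prompt in the same directory, plot it with the command from the docstring:

    plot "reddit.dat" using 1:2 title "Lexical Diversity" with lines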