Reddit historical lexical diversity

A Python 2 script that fetches one archived reddit front page per year (2006 through 2011) from the Wayback Machine, plus the current front page, computes the lexical diversity of the submission titles, and writes the results to a GNUplot-ready reddit.dat file.
import urllib2
import HTMLParser
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 (Python 2)
import time

def _fetch_page(url):
    """Fetch a URL and return it as a parsed BeautifulSoup tree, or None on failure."""
    try:
        page = urllib2.urlopen(url)
        return BeautifulSoup(page)
    except urllib2.URLError:
        print "Failed to fetch page from: " + url
    except HTMLParser.HTMLParseError:
        print "Failed to parse page from: " + url
    return None  # the original returned an unbound name here, raising NameError on failure

def fetch_reddit(dateStr=None):
    """Fetch the current reddit front page, or an archived one for a YYYYMMDD date string."""
    if dateStr is None:
        return _fetch_page('http://www.reddit.com')
    wayback_url = 'http://web.archive.org/web/' + dateStr + '/http://reddit.com/?'
    return _fetch_page(wayback_url)

def get_valid_wayback_dates(year):
    """Scrape the Wayback Machine calendar for the dates (YYYYMMDD) that have a reddit snapshot in the given year."""
    dates = []
    page = _fetch_page('http://wayback.archive.org/web/' + year + '*/http://www.reddit.com')
    for d in page.findAll('div'):
        if d.has_key('class') and d['class'].strip() == 'position':
            dateTag = d.findChildren('div')[1]
            # Convert an id like "Feb-03-2012" into "20120203"
            dates.append(time.strftime("%Y%m%d", time.strptime(dateTag['id'], "%b-%d-%Y")))
    return dates

def get_titles(page):
    """Collect the text of every submission-title link on a front page."""
    titles = []
    for a in page.findAll('a'):
        if a.has_key('class') and a['class'].strip() == 'title':
            titles.append(a.contents[0])
    return titles

def get_words(titles):
    """Flatten a list of titles into a single list of whitespace-separated words."""
    words = []
    for t in titles:
        words.extend(t.split())
    return words

def lex_div(words):
    """Lexical diversity: the ratio of distinct words to total words.
    e.g. lex_div(['the', 'cat', 'and', 'the', 'hat']) == 0.8
    """
    return 1.0 * len(set(words)) / len(words)

def generate_datfile():
    """
    To use the file this function creates, start GNUplot and run the following command:
    plot "reddit.dat" using 1:2 title "Lexical Diversity" with lines
    """
    years = ['2006', '2007', '2008', '2009', '2010', '2011']
    frontpages = []
    output = []
    for year in years:
        dates = get_valid_wayback_dates(year)
        frontpages.append((year, fetch_reddit(dates[0])))  # first snapshot of the year
    frontpages.append(('2012', fetch_reddit()))  # current front page
    for page in frontpages:
        words = get_words(get_titles(page[1]))
        output.append((page[0], lex_div(words)))
    with open("reddit.dat", "w") as fp:
        for item in output:
            fp.write(item[0] + "\t" + str(item[1]) + "\n")
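
A minimal way to drive the script end to end, assuming it is saved as reddit_lexdiv.py (a hypothetical filename, not part of the gist) and run under Python 2 with BeautifulSoup 3 installed:

    # Python 2 session; the module name reddit_lexdiv is an assumption.
    >>> import reddit_lexdiv
    >>> reddit_lexdiv.generate_datfile()  # writes reddit.dat to the working directory

Then, from a GNUplot prompt in the same directory, plot it with the command from the docstring:

    plot "reddit.dat" using 1:2 title "Lexical Diversity" with lines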