Created
March 12, 2014 10:58
-
-
Save philshem/9504719 to your computer and use it in GitHub Desktop.
Wayback Machine: finds historical New York Times front pages whose story summaries contain a given string, and prints a link to each matching archived page.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
from bs4 import BeautifulSoup | |
site = 'nytimes.com' | |
for year in xrange(2010,2014+1): | |
for month in xrange(1,12+1): | |
url = 'http://archive.org/wayback/available?url='+site+'×tamp='+str(year)+str(month).zfill(2)+str('01') | |
r = requests.get(url) | |
data = json.loads(r.text) | |
newurl = data['archived_snapshots']['closest']['url'] | |
p = requests.get(newurl) | |
soup = BeautifulSoup(p.text) | |
summary = [h.text for h in soup.findAll('p', attrs={'class': 'summary'})] + [h.text for h in soup.findAll('dd', attrs={'class': 'summary'})] | |
matching = [s for s in summary if 'Obama' in s] | |
for item in matching: | |
print month,year,newurl,item.replace('\r\n','').encode('utf-8') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment