Created
October 19, 2011 19:42
-
-
Save neilkod/1299438 to your computer and use it in GitHub Desktop.
Steve Jobs tribute scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The downloaded messages can be found at
# http://www.neilkodner.com/stevejobs_tribute.txt
#!/usr/bin/python | |
import urllib2 | |
import simplejson as json | |
import time | |
import codecs | |
# Scrapes messages from http://www.apple.com/stevejobs/
# Each message's keys are [u'header', u'mainText', u'location', u'author'],
# but not all fields are always present (or interesting).
output_file = 'stevejobs_tribute.txt' | |
# strip linefeeds, carriage returns and tabs
def clean(txt):
    """Return *txt* with every newline, carriage return and tab removed.

    The characters are deleted, not replaced with a space, so the text on
    either side of them is joined directly. Carriage returns are removed
    as well as linefeeds so that text using Windows-style ``\\r\\n`` line
    endings does not leave a stray ``\\r`` behind.
    """
    return txt.replace('\r', '').replace('\n', '').replace('\t', '')
# URL template; %d is replaced by the numeric message id.
url = "http://www.apple.com/stevejobs/messages/%d.json"

# ``with`` guarantees the output file is closed (and its buffer flushed)
# even if the scrape is interrupted part-way through.
with codecs.open(output_file, 'w', 'utf-8') as file_handle:
    for i in range(0, 5000):
        req = url % i
        try:
            response = urllib2.urlopen(req)
            try:
                data = json.loads(response.read())
            finally:
                # Release the connection whether or not parsing succeeded.
                response.close()
        except (urllib2.URLError, ValueError):
            # URLError covers HTTPError (e.g. a missing message id);
            # ValueError covers a body that is not valid JSON. Either way,
            # skip this id instead of aborting the whole run.
            data = None

        # Not every message carries every field, so look the text up
        # defensively rather than indexing with data['mainText'].
        text = data.get('mainText') if isinstance(data, dict) else None
        if text is not None:
            file_handle.write(clean(text) + '\n')

        # Pause between requests (including failed ones) to stay polite
        # to the remote server.
        time.sleep(.5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment