Skip to content

Instantly share code, notes, and snippets.

@neilkod
Created October 19, 2011 19:42
Show Gist options
  • Save neilkod/1299438 to your computer and use it in GitHub Desktop.
Save neilkod/1299438 to your computer and use it in GitHub Desktop.
Steve Jobs tribute scraper
# downloaded messages can be found
# at http://www.neilkodner.com/stevejobs_tribute.txt
#!/usr/bin/python
import urllib2
import simplejson as json
import time
import codecs
# scrapes messages from http://www.apple.com/stevejobs/
# keys are [u'header', u'mainText', u'location', u'author']
# but not all fields are always present (or interesting)
# Path of the text file the scraper writes; it is opened as UTF-8 and
# receives one cleaned message per line.
output_file = 'stevejobs_tribute.txt'
# strip linefeeds and tabs
def clean(txt):
    """Return *txt* with all line breaks and tabs removed.

    The scraper stores one message per output line, so any embedded
    newline would split a message across several records.  Carriage
    returns are removed as well: a bare '\\r' (or the '\\r' half of a
    Windows '\\r\\n' pair) is also treated as a line break by readers
    that use universal-newline handling.

    The characters are deleted, not replaced by a space, so the text on
    either side of a removed character is joined directly.
    """
    return txt.replace('\r', '').replace('\n', '').replace('\t', '')
# URL template for a single tribute message; %d is the numeric message id.
url = "http://www.apple.com/stevejobs/messages/%d.json"

# Download messages 0..4999 and write the cleaned 'mainText' of each one to
# output_file, one message per line.  The with-block guarantees the file is
# flushed and closed even if the run is interrupted part-way through.
with codecs.open(output_file, 'w', 'utf-8') as file_handle:
    for i in range(0, 5000):
        req = url % i
        try:
            response = urllib2.urlopen(req)
            try:
                data = json.loads(response.read())
            finally:
                # Release the connection whether or not the body parsed.
                response.close()
        except (urllib2.URLError, ValueError) as err:
            # URLError also covers HTTPError (e.g. a missing message id);
            # ValueError covers a body that is not valid JSON.  One bad
            # message should not abort the remaining downloads.
            print("skipping message %d: %s" % (i, err))
        else:
            # Not every message carries every field, so look the text up
            # defensively instead of indexing and raising KeyError.
            main_text = data.get('mainText')
            if main_text is not None:
                file_handle.write(clean(main_text) + '\n')
        # Pause between requests (including failed ones) to avoid
        # hammering the server.
        time.sleep(.5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment