@icohen
Created November 18, 2011 18:49
Boston Globe - Today's Paper Scraper
# Python 2 script: uses urllib2 and BeautifulSoup 3 (the old `BeautifulSoup` package).
import urllib2
import json
from BeautifulSoup import BeautifulSoup
import re
import os

sections = []
# Fetch the "Today's Paper" index page for a given date.
today = urllib2.urlopen("http://www.bostonglobe.com/todayspaper/2011/11/05")
today_soup = BeautifulSoup(today)

# Each section of the paper is listed under an <h2 class="hed-section"> heading.
for section_tag in today_soup.findAll('h2', {'class': 'hed-section'}):
    section = {}
    section['name'] = section_tag.a.text
    section['url'] = 'http://www.bostonglobe.com' + section_tag.a['href']

    # Fetch the section's own page and pull out its story listings.
    section_page = urllib2.urlopen(section['url'])
    section_soup = BeautifulSoup(section_page)

    stories = []
    for story_tag in section_soup.findAll('h3', {'class': 'story-title'}):
        if not story_tag.a:
            continue
        story = {}
        story['title'] = story_tag.a.text
        story['url'] = story_tag.a['href']

        # Request the full story page, sending the pathAuth cookie to get past the paywall.
        full_url = "http://www.bostonglobe.com" + story['url']
        story_req = urllib2.Request(full_url)
        story_req.add_header('Cookie', 'pathAuth=e0ce3a19-2259-4aea-8aa1-d06b29e7d3bb')
        story_page = urllib2.urlopen(story_req).read()
        story_soup = BeautifulSoup(story_page)

        # If the story has a lead image, record its filename and download it into images/
        # (assumes the <img> src is an absolute URL).
        lead_figure_div = story_soup.find('div', {'class': 'figure'})
        print lead_figure_div
        if lead_figure_div:
            lead_figure = lead_figure_div.find('img')['src']
            story['image'] = os.path.basename(lead_figure)
            print story['image']
            os.system('cd images; curl -O -b "pathAuth=e0ce3a19-2259-4aea-8aa1-d06b29e7d3bb" "%s"' % lead_figure)

        stories.append(story)

    section['stories'] = stories
    sections.append(section)
    break  # only process the first section for now
# for i, section in enumerate(sections):
#     print i + 1, section['name']
print json.dumps(sections)
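The script ends by dumping the collected structure as JSON. A run would produce output shaped roughly like the sketch below; the section name, URLs, headline, and image filename are illustrative placeholders, not real Globe content:

[
  {
    "name": "Metro",
    "url": "http://www.bostonglobe.com/metro/...",
    "stories": [
      {
        "title": "Example headline",
        "url": "/metro/2011/11/05/example-story/story.html",
        "image": "example-lead-photo.jpg"
      }
    ]
  }
]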