@icohen
Created November 18, 2011 18:49
Boston Globe - Today's Paper Scraper
# Python 2 script: uses urllib2 and BeautifulSoup 3 (the old `BeautifulSoup` package).
import urllib2
import json
from BeautifulSoup import BeautifulSoup
import re
import os

sections = []
# Fetch the "Today's Paper" index page for a given date.
today = urllib2.urlopen("http://www.bostonglobe.com/todayspaper/2011/11/05")
today_soup = BeautifulSoup(today)

# Each section of the paper is listed under an <h2 class="hed-section"> heading.
for section_tag in today_soup.findAll('h2', {'class': 'hed-section'}):
    section = {}
    section['name'] = section_tag.a.text
    section['url'] = 'http://www.bostonglobe.com' + section_tag.a['href']

    # Fetch the section's own page and pull out its story listings.
    section_page = urllib2.urlopen(section['url'])
    section_soup = BeautifulSoup(section_page)

    stories = []
    for story_tag in section_soup.findAll('h3', {'class': 'story-title'}):
        if not story_tag.a:
            continue
        story = {}
        story['title'] = story_tag.a.text
        story['url'] = story_tag.a['href']

        # Request the full story page, sending the pathAuth cookie to get past the paywall.
        full_url = "http://www.bostonglobe.com" + story['url']
        story_req = urllib2.Request(full_url)
        story_req.add_header('Cookie', 'pathAuth=e0ce3a19-2259-4aea-8aa1-d06b29e7d3bb')
        story_page = urllib2.urlopen(story_req).read()
        story_soup = BeautifulSoup(story_page)

        # If the story has a lead image, record its filename and download it into images/
        # (assumes the <img> src is an absolute URL).
        lead_figure_div = story_soup.find('div', {'class': 'figure'})
        print lead_figure_div
        if lead_figure_div:
            lead_figure = lead_figure_div.find('img')['src']
            story['image'] = os.path.basename(lead_figure)
            print story['image']
            os.system('cd images; curl -O -b "pathAuth=e0ce3a19-2259-4aea-8aa1-d06b29e7d3bb" "%s"' % lead_figure)

        stories.append(story)

    section['stories'] = stories
    sections.append(section)
    break  # only process the first section for now
# for i, section in enumerate(sections):
#     print i + 1, section['name']
print json.dumps(sections)
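The script ends by dumping the collected structure as JSON. A run would produce output shaped roughly like the sketch below; the section name, URLs, headline, and image filename are illustrative placeholders, not real Globe content:

[
  {
    "name": "Metro",
    "url": "http://www.bostonglobe.com/metro/...",
    "stories": [
      {
        "title": "Example headline",
        "url": "/metro/2011/11/05/example-story/story.html",
        "image": "example-lead-photo.jpg"
      }
    ]
  }
]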