revox · January 13, 2014 23:05
diff --git a/scrape_ons_release_start.py b/scrape_ons_release_start.py
 '''A script to scrape the ONS release schedule'''
 import urllib
 import bs4

 page = 0
 URL = "http://www.statistics.gov.uk/hub/release-calendar/index.html?newquery=*&newoffset=" + str(page) + "&theme=%22%22&source-agency=%22%22&uday=0&umonth=0&uyear=0&lday=-29&lmonth=0&lyear=0&coverage=%22%22&designation=&geographic-breakdown=%22%22&title=%22%22&pagetype=calendar-entry&sortBy=releaseDate&sortDirection=EITHER"

 # open webpage
 webpage = urllib.urlopen(URL).read()

 # turn html into beautiful soup
 soup = bs4.BeautifulSoup(webpage)

 # extract info from soup
 attrs={'class':'count'}
 count_pages = soup.find('span', attrs).string


 start_of_count = count_pages.find('of')
 end_of_count = count_pages.find('|')

 # print info to screen
 pages = count_pages[start_of_count+2:end_of_count].strip()

 print 'total pages: ', int(pages)

 for page in range(0,int(pages)):
    URL = "http://www.statistics.gov.uk/hub/release-calendar/index.html?newquery=*&newoffset=" + str(page) + "&theme=%22%22&source-agency=%22%22&uday=0&umonth=0&uyear=0&lday=-29&lmonth=0&lyear=0&coverage=%22%22&designation=&geographic-breakdown=%22%22&title=%22%22&pagetype=calendar-entry&sortBy=releaseDate&sortDirection=EITHER"
    print URL
	'''A script to scrape the ONS release schedule'''
	import urllib
	import bs4

	page = 0
	URL = "http://www.statistics.gov.uk/hub/release-calendar/index.html?newquery=*&newoffset=" + str(page) + "&theme=%22%22&source-agency=%22%22&uday=0&umonth=0&uyear=0&lday=-29&lmonth=0&lyear=0&coverage=%22%22&designation=&geographic-breakdown=%22%22&title=%22%22&pagetype=calendar-entry&sortBy=releaseDate&sortDirection=EITHER"

	# open webpage
	webpage = urllib.urlopen(URL).read()

	# turn html into beautiful soup
	soup = bs4.BeautifulSoup(webpage)

	# extract info from soup
	attrs={'class':'count'}
	count_pages = soup.find('span', attrs).string


	start_of_count = count_pages.find('of')
	end_of_count = count_pages.find('\|')

	# print info to screen
	pages = count_pages[start_of_count+2:end_of_count].strip()

	print 'total pages: ', int(pages)

	for page in range(0,int(pages)):
	URL = "http://www.statistics.gov.uk/hub/release-calendar/index.html?newquery=*&newoffset=" + str(page) + "&theme=%22%22&source-agency=%22%22&uday=0&umonth=0&uyear=0&lday=-29&lmonth=0&lyear=0&coverage=%22%22&designation=&geographic-breakdown=%22%22&title=%22%22&pagetype=calendar-entry&sortBy=releaseDate&sortDirection=EITHER"
	print URL