

@k-nut
Created July 10, 2014 07:33
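A Turbot-style scraper for the Monetary Authority of Singapore's directory of financial institutions: it walks the institution index, follows each detail-page link, and prints one JSON record per institution to stdout.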
import json
from datetime import date

from bs4 import BeautifulSoup
import requests
import turbotlib


def extractData(link):
    """Scrape one institution's detail page and print it as a JSON record."""
    source_url = link
    sample_date = str(date.today())
    tables = getTables(link)
    record = {}
    for table in tables:
        for row in table.find_all('tr'):
            tds = row.find_all('td')
            # Detail rows have three cells: label, separator, value.
            if len(tds) < 3:
                continue
            key = tds[0].text.strip().replace(' ', '_')
            value = tds[2].text.strip()
            record[key] = value
    record['sample_date'] = sample_date
    record['source_url'] = source_url
    print(json.dumps(record))


def getTables(link):
    """Fetch a page and return all of its <table> elements."""
    response = requests.get(link)
    html = response.content
    doc = BeautifulSoup(html, 'html.parser')
    return doc.find_all('table')


def main():
    baseUrl = "https://masnetsvc.mas.gov.sg"
    source_url = baseUrl + "/fin/findir/SDWFIDIR.NSF/All+Institutions?OpenView"
    turbotlib.log("Starting scrape...")
    response = requests.get(source_url)
    html = response.content
    doc = BeautifulSoup(html, 'html.parser')
    tables = doc.find_all('table')
    # The institution listing sits in the third table from the end of the page.
    maintable = tables[-3]
    for tr in maintable.find_all('tr'):
        tds = tr.find_all('td')
        if tds:
            # The second cell holds the relative link to the detail page.
            link = tds[1].find('a')['href']
            extractData(baseUrl + link)


if __name__ == "__main__":
    main()
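Each record lands on stdout as a single JSON line (turbotlib.log is meant to keep progress messages off stdout), so the output can be treated as newline-delimited JSON. A minimal sketch of consuming it, assuming the scraper's stdout has been redirected to a hypothetical file named mas_records.jsonl:

import json

# Read the scraper's output back as newline-delimited JSON.
with open("mas_records.jsonl") as fh:
    for line in fh:
        record = json.loads(line)
        # sample_date and source_url are added to every record by extractData.
        print(record["sample_date"], record["source_url"])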