

@k-nut
Created July 10, 2014 07:33
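A Turbot-style scraper for the Monetary Authority of Singapore's directory of financial institutions: it walks the institution index, follows each detail-page link, and prints one JSON record per institution to stdout.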
import json
from datetime import date

from bs4 import BeautifulSoup
import requests
import turbotlib


def extractData(link):
    """Scrape one institution's detail page and print it as a JSON record."""
    source_url = link
    sample_date = str(date.today())
    tables = getTables(link)
    record = {}
    for table in tables:
        for row in table.find_all('tr'):
            tds = row.find_all('td')
            # Detail rows have three cells: label, separator, value.
            if len(tds) < 3:
                continue
            key = tds[0].text.strip().replace(' ', '_')
            value = tds[2].text.strip()
            record[key] = value
    record['sample_date'] = sample_date
    record['source_url'] = source_url
    print(json.dumps(record))


def getTables(link):
    """Fetch a page and return all of its <table> elements."""
    response = requests.get(link)
    html = response.content
    doc = BeautifulSoup(html, 'html.parser')
    return doc.find_all('table')


def main():
    baseUrl = "https://masnetsvc.mas.gov.sg"
    source_url = baseUrl + "/fin/findir/SDWFIDIR.NSF/All+Institutions?OpenView"
    turbotlib.log("Starting scrape...")
    response = requests.get(source_url)
    html = response.content
    doc = BeautifulSoup(html, 'html.parser')
    tables = doc.find_all('table')
    # The institution listing sits in the third table from the end of the page.
    maintable = tables[-3]
    for tr in maintable.find_all('tr'):
        tds = tr.find_all('td')
        if tds:
            # The second cell holds the relative link to the detail page.
            link = tds[1].find('a')['href']
            extractData(baseUrl + link)


if __name__ == "__main__":
    main()
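Each record lands on stdout as a single JSON line (turbotlib.log is meant to keep progress messages off stdout), so the output can be treated as newline-delimited JSON. A minimal sketch of consuming it, assuming the scraper's stdout has been redirected to a hypothetical file named mas_records.jsonl:

import json

# Read the scraper's output back as newline-delimited JSON.
with open("mas_records.jsonl") as fh:
    for line in fh:
        record = json.loads(line)
        # sample_date and source_url are added to every record by extractData.
        print(record["sample_date"], record["source_url"])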