Skip to content

Instantly share code, notes, and snippets.

@nishanttotla
Created April 11, 2017 06:51
Show Gist options
  • Save nishanttotla/5043b08b33140ab664a3b3bc9ddd9a53 to your computer and use it in GitHub Desktop.
Save nishanttotla/5043b08b33140ab664a3b3bc9ddd9a53 to your computer and use it in GitHub Desktop.
Python example to scrape web data
# Source: https://stackoverflow.com/questions/22018003/converting-html-table-to-csv-file-from-shell
# Reference: https://stackoverflow.com/questions/5214578/python-print-string-to-text-file
from BeautifulSoup import BeautifulSoup
import csv
import requests
# Scrape the "HabitationPopulationStatus" report pages from omms.nic.in for
# every combination of year (2000-2016), state code (1-36), population code
# and upgrade status, and append every table row found to one CSV file.
# Each CSV row is the cell text of one <tr>, followed by the four query
# values (year, statecode, populationCode, upgradeStatus) that produced it.
outputfile = "allthedata.csv"
baseUrl = "http://omms.nic.in/StateProfile/StateProfile/HabitationPopulationStatus/"
# Running index of tables seen across all fetched pages; the first is 0.
tablecount = -1

# 'wb' is the mode the Python 2 csv module expects. The `with` block
# guarantees the file is flushed and closed even if a request or the
# parser raises part-way through the crawl.
with open(outputfile, 'wb') as csvfile:
    fout = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for year_base in range(17):
        year = year_base + 2000
        for statecode_base in range(36):
            statecode = statecode_base + 1
            for populationCode in [1, 2, 4, 5]:
                for upgradeStatus in ["N", "U"]:
                    # The report parameters are passed as a '$'-separated path
                    # suffix. NOTE(review): the meaning of the fixed fields
                    # (0, 0, 2, ..., R, 0, Y, 0, 0) is not visible here.
                    url = (baseUrl + str(populationCode) + "$" + str(statecode)
                           + "$0$0$2$" + str(year) + "$0$" + upgradeStatus
                           + "$R$0$Y$0$0")
                    print("curling " + str(populationCode) + " " + str(statecode)
                          + " " + str(year) + " " + upgradeStatus)
                    htmltext = requests.get(url).text

                    print("Parsing htmltext")
                    soup = BeautifulSoup(htmltext, convertEntities=BeautifulSoup.HTML_ENTITIES)

                    print("Preemptively removing unnecessary tags")
                    # Drop <script> elements so their contents never end up in
                    # the extracted cell text.
                    for script in soup('script'):
                        script.extract()

                    print("writing to CSV")
                    for table in soup.findAll("table"):
                        tablecount += 1
                        print("Processing Table #%d" % tablecount)
                        for row in table.findAll('tr'):
                            cells = row.findAll(['td'])
                            if not cells:
                                # Rows without <td> cells (e.g. header-only
                                # rows) are skipped.
                                continue
                            # The Python 2 csv writer cannot write non-ASCII
                            # unicode (decoded entities such as &nbsp; would
                            # raise UnicodeEncodeError), so encode to UTF-8.
                            cols = [cell.text.encode('utf-8') for cell in cells]
                            cols.extend([year, statecode, populationCode, upgradeStatus])
                            fout.writerow(cols)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment