Skip to content

Instantly share code, notes, and snippets.

@jmoy
Created September 19, 2015 07:13
Show Gist options
  • Save jmoy/2071057c3c14919dc275 to your computer and use it in GitHub Desktop.
Save jmoy/2071057c3c14919dc275 to your computer and use it in GitHub Desktop.
Scrape daily wheat price data using Selenium and LXML
#!/usr/bin/python3
"""
Download wheat price data from http://agmarknet.nic.in
Data is saved in file wheat.csv in current directory
Author: Jyotirmoy Bhattacharya, Ambedkar University, New Delhi, [email protected]
"""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
import calendar
import time
import lxml.html
import csv
import sys
from contextlib import closing
def get_agmarket_data(driver,year,month,state,commodity):
    """Yield the rows of the date-wise commodity report as lists of cell text.

    The report form is loaded in ``driver``, the year, month, state and
    commodity dropdowns are set (in that order) by their visible text, and
    the form is submitted. The element with id ``gridRecords`` is then
    parsed with lxml and each of its children is yielded as one row.

    A cell whose text is empty (or ``None``) is replaced by the value in the
    same column of the previously yielded row, when that row has such a
    column. Presumably this fills in cells the report leaves blank when a
    value repeats -- confirm against the live page.

    This is a generator: no browser interaction happens until iteration
    starts.

    Args:
        driver: Selenium WebDriver used to load and operate the page.
        year: Year to select; converted with ``str`` before matching.
        month: Visible text of the month option (e.g. ``"January"``).
        state: Visible text of the state option.
        commodity: Visible text of the commodity option.

    Yields:
        list: Cell texts of one row, after the fill-down described above.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a form control
            or a requested dropdown option cannot be found.
        selenium.common.exceptions.TimeoutException: If ``gridRecords`` does
            not appear within 10 seconds of submitting the form.
    """
    driver.get("http://agmarknet.nic.in/agnew/NationalBEnglish/DatewiseCommodityReport.aspx?ss=2")

    # Each dropdown is looked up immediately before it is used, one after
    # the other, exactly as the original sequence of statements did.
    # find_element(By.ID, ...) replaces find_element_by_id, which newer
    # Selenium releases no longer provide.
    selections = (
        ("cboYear", str(year)),
        ("cboMonth", month),
        ("cboState", state),
        ("cboCommodity", commodity),
    )
    for element_id, visible_text in selections:
        dropdown = Select(driver.find_element(By.ID, element_id))
        dropdown.select_by_visible_text(visible_text)
    driver.find_element(By.ID, "btnSubmit").click()

    recs = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "gridRecords")))
    tbody = lxml.html.fragment_fromstring(recs.get_attribute("innerHTML"))

    previous = []  # last yielded row; empty before the first row
    for row in tbody:
        current = []
        for column, cell in enumerate(row):
            text = cell.text
            # Blank cell: inherit the value from the same column of the
            # previous row, if that row was long enough to have one.
            if not text and column < len(previous):
                text = previous[column]
            current.append(text)
        previous = current
        yield current
if __name__=="__main__":
    # Script entry point: for every state/year/month combination, fetch the
    # wheat report and append its rows to wheat.csv, logging the outcome of
    # each fetch to stderr.

    # Imported here so this block is self-contained. WebDriverWait.until()
    # signals an expired wait with TimeoutException rather than
    # NoSuchElementException, so it has to be caught as well or one month
    # without a report would abort the whole run.
    from selenium.common.exceptions import TimeoutException

    driver = webdriver.Firefox()
    try:
        with open("wheat.csv","w",newline='') as f:
            driver.implicitly_wait(5)
            writer = csv.writer(f)
            for state in ["Punjab","Maharashtra"]:
                for year in range(2010,2016):
                    # month_name[0] is an empty placeholder; skip it.
                    for month in calendar.month_name[1:]:
                        try:
                            writer.writerows(
                                row
                                for row in get_agmarket_data(
                                    driver, year, month, state, "Wheat")
                                # Skip rows without cells (row[0] would
                                # raise IndexError) and the header row.
                                if row and row[0] != "Market"
                            )
                        except (NoSuchElementException, TimeoutException):
                            res = "Failed"
                        else:
                            res = "Success"
                        print("{}: {}, {}, {}".format(res,state,year,month),file=sys.stderr)
                        # Pause between requests.
                        time.sleep(5)
    finally:
        # quit() ends the whole WebDriver session; close() (what
        # contextlib.closing would call) only closes the current window.
        driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment