Created
September 19, 2015 07:13
-
-
Save jmoy/2071057c3c14919dc275 to your computer and use it in GitHub Desktop.
Scrape daily wheat price data using Selenium and LXML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
""" | |
Download wheat price data from http://agmarknet.nic.in | |
Data is saved in file wheat.csv in current directory | |
Author: Jyotirmoy Bhattacharya, Ambedkar University, New Delhi, [email protected] | |
""" | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.support.select import Select | |
from selenium.common.exceptions import NoSuchElementException | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.wait import WebDriverWait | |
import selenium.webdriver.support.expected_conditions as EC | |
import calendar | |
import time | |
import lxml.html | |
import csv | |
import sys | |
from contextlib import closing | |
def get_agmarket_data(driver, year, month, state, commodity):
    """Yield the rows of the date-wise commodity report, one list per row.

    Loads the report form in *driver*, chooses *year*, *month*, *state*
    and *commodity* in the corresponding drop-downs by visible text,
    submits the form and waits up to 10 seconds for the element with id
    ``gridRecords`` to be present.  That element's inner HTML is parsed
    as a single fragment (presumably the table's ``<tbody>``) and each
    child is treated as a row.

    Every row is yielded as a list with one entry per cell.  A cell
    without text takes the value found in the same column of the row
    yielded just before it; if there is no such value the cell is kept
    as is (``None`` or an empty string).

    This is a generator: the browser is not touched until iteration
    starts.

    Args:
        driver: Selenium WebDriver used to load and drive the page.
        year: Year to select; converted with ``str()`` before matching.
        month: Month name exactly as shown in the month drop-down.
        state: State name exactly as shown in the state drop-down.
        commodity: Commodity name exactly as shown in its drop-down.

    Raises:
        selenium.common.exceptions.NoSuchElementException: a form
            control or the requested option could not be found.
        selenium.common.exceptions.TimeoutException: ``gridRecords``
            did not appear within the wait period.
    """
    driver.get("http://agmarknet.nic.in/agnew/NationalBEnglish/DatewiseCommodityReport.aspx?ss=2")

    # Each control is looked up immediately before it is used, in the
    # same order as the form lays them out.
    selections = (
        ("cboYear", str(year)),
        ("cboMonth", month),
        ("cboState", state),
        ("cboCommodity", commodity),
    )
    for element_id, label in selections:
        # find_element(By.ID, ...) replaces find_element_by_id, which is
        # no longer available in current Selenium releases.
        dropdown = Select(driver.find_element(By.ID, element_id))
        dropdown.select_by_visible_text(label)

    driver.find_element(By.ID, "btnSubmit").click()
    grid = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "gridRecords")))
    tbody = lxml.html.fragment_fromstring(grid.get_attribute("innerHTML"))

    previous = []  # cells of the row yielded last; empty before the first row
    for row in tbody:
        cells = []
        for col, cell in enumerate(row):
            text = cell.text
            # Blank cell: carry the value down from the previous row.
            if not text and col < len(previous):
                text = previous[col]
            cells.append(text)
        previous = cells
        yield cells
if __name__ == "__main__":
    # Imported locally so this block brings its own dependency into scope.
    from selenium.common.exceptions import TimeoutException

    # Scrape wheat prices for every state/year/month combination below and
    # append the rows to wheat.csv; progress is reported on stderr.
    # NOTE(review): closing() ends the session with driver.close(); confirm
    # whether driver.quit() is needed to also stop the browser driver process.
    with closing(webdriver.Firefox()) as driver, open("wheat.csv", "w", newline='') as f:
        driver.implicitly_wait(5)
        writer = csv.writer(f)
        for state in ["Punjab", "Maharashtra"]:
            for year in range(2010, 2016):
                for month in calendar.month_name[1:]:
                    try:
                        writer.writerows(
                            row
                            for row in get_agmarket_data(
                                driver, year, month, state, "Wheat")
                            if row[0] != "Market"  # Header row
                        )
                    except (NoSuchElementException, TimeoutException):
                        # A missing control/option or a results grid that
                        # never shows up only fails this month; the run
                        # continues with the next one.
                        res = "Failed"
                    else:
                        res = "Success"
                    print("{}: {}, {}, {}".format(res, state, year, month),
                          file=sys.stderr)
                    # Pause between requests to the site.
                    time.sleep(5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment