Skip to content

Instantly share code, notes, and snippets.

@shafayeatsumit
Created September 20, 2017 15:20
Show Gist options
  • Save shafayeatsumit/a23a985c25d45c0d3dee77673720c868 to your computer and use it in GitHub Desktop.
Save shafayeatsumit/a23a985c25d45c0d3dee77673720c868 to your computer and use it in GitHub Desktop.
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import re
# Alternative kept for reference: drive a remote WebDriver server listening on
# 127.0.0.1:4444 instead of a locally launched browser.
#driver = webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub', desired_capabilities=DesiredCapabilities.CHROME)

# Landing page of the enquiry ("getAllTherapeuticArea" action).
THERAPEUTIC_AREAS_URL = "https://eservice.hsa.gov.sg/prism/ct_r/enquiry.do?action=getAllTherapeuticArea"

# Start a local Firefox session and open the landing page.
driver = webdriver.Firefox()
print("Fetching Data...")
driver.get(THERAPEUTIC_AREAS_URL)
def extract_detail(driver):
    """Placeholder for parsing the detail page currently shown by *driver*.

    Args:
        driver: The WebDriver positioned on a detail page. It is not used
            yet because the parsing logic has not been written.

    Returns:
        None for now; the extracted data once the parser is implemented.
    """
    # TODO: parse the page and return the extracted data.
    return None
# Walks the landing-page table link by link, opens the second-level listing
# behind each link, clicks one entry there, calls extract_detail() and then
# reloads the landing page. Progress across reloads is kept in `track`.
#
# NOTE(review): the indentation of this section has been lost, so the nesting
# of the if/else/try statements below cannot be read off the text and the
# section does not parse as it stands. Restore the indentation from the
# author's original before running it.
track = [0,0,0]
#track [0] - home page; track[1] - second level page ; track[2] - pagination in second level
# Link elements inside the cells of the table with class "fmTbl".
table = driver.find_element_by_xpath("//table[@class='fmTbl']")
table_rows = table.find_elements_by_xpath(".//tr/td/a")
for row in table_rows:
# The table and its links are looked up again on every pass; the page is
# reloaded with driver.get() further down, presumably invalidating the
# elements found earlier (TODO confirm).
table = driver.find_element_by_xpath("//table[@class='fmTbl']")
table_rows = table.find_elements_by_xpath(".//tr/td/a")
# Open the landing-page link selected by track[0] and wait for the page.
table_rows[track[0]].click()
time.sleep(10)
# Read the text of one table cell and keep the first integer found in it.
page_count = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[1]').text
page_count = int(re.search(r'\d+',page_count).group())
# NOTE(review): dividing by 10 suggests 10 entries per listing page and that
# page_count is really an entry count -- confirm against the site.
number_of_clicks = int(page_count/10)
print("page count",page_count) #25
if (page_count > 10 and track[2]>0 ):
if (track[2]< number_of_clicks):
# Presumably the "next page" link of the listing (TODO confirm).
driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[2]/a[1]').click()
time.sleep(10)
# NOTE(review): table_second_level is never used; the relative XPath on the
# next line is evaluated on `driver`, not on this table element.
table_second_level = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody')
table_second_rows = driver.find_elements_by_xpath('.//tr/td/a')
# NOTE(review): `<=10` admits index 10 although track[1] is reset at 10 below.
if(track[1]<=10):
table_second_rows[track[1]].click()
# NOTE(review): `e` is only bound by the `except ... as e` clause further
# down; nothing binds it before this line, so this print looks like a
# leftover from a removed try/except.
print ("end of cycle",e)
track[1] = track[1] + 1
if(track[1] == 10):
track[1] = 0
time.sleep(10)
#need to handle pagination
# going to last page (detail parsing)
data = extract_detail(driver)
# Return to the landing page for the next pass.
driver.get("https://eservice.hsa.gov.sg/prism/ct_r/enquiry.do?action=getAllTherapeuticArea")
else:
track[0] = track[0] + 1
else:
# NOTE(review): same unused table_second_level / driver-relative XPath as above.
table_second_level = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody')
table_second_rows = driver.find_elements_by_xpath('.//tr/td/a')
if(track[1]<=10):
try:
table_second_rows[track[1]].click()
except Exception as e:
# Any failure of the click (e.g. an index past the last link) is reported
# and the landing-page index is advanced.
print ("ending of the cycle",e )
track[0] = track[0] + 1
track[1] = track[1] + 1
time.sleep(10)
data = extract_detail(driver)
#need to handle pagination
# going to last page (detail parsing)
# Return to the landing page for the next pass.
driver.get("https://eservice.hsa.gov.sg/prism/ct_r/enquiry.do?action=getAllTherapeuticArea")
print("track value before",track[0])
#track[0] = track[0] + 1
print("track value after",track[0])
if (page_count>10):
track[2] = track[2] + 1
# table = driver.find_element_by_xpath("//table[@class='fmTbl']")
# for row in table.find_elements_by_xpath(".//tr/td/a"):
# row.click()
# time.sleep(20)
# #get table element in second level
# table_second_level = driver.find_elements_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody')[0]
# print(table_second_level)
# for row in table_second_level.find_elements_by_xpath(".//tr/td/a"):
# row.click()
# time.sleep(10)
# driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[5]/tbody/tr/td/div/a/b').click()
# time.sleep(10)
# driver.close()
@shafayeatsumit
Copy link
Author

from selenium import webdriver
import time
import re
import math

# Start a local Firefox session and open the landing page of the enquiry.
driver = webdriver.Firefox()
root_url = "https://eservice.hsa.gov.sg/prism/ct_r/enquiry.do?action=getAllTherapeuticArea"
driver.get(root_url)
# Link elements inside the cells of the table with class "fmTbl".
# The attribute name is written in lower case (`@class`), as in the other
# queries of this table; the capitalised `@Class` was a rendering artefact.
page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")

def find_number_of_items(driver, start_url=None, wait_seconds=10):
    """Collect the number reported for each link of the ``fmTbl`` table.

    Every link of the table is opened in turn, the first integer in the
    count cell of the resulting page is read, and the browser is sent back
    to the start page before the next link is handled.

    Args:
        driver: WebDriver that is already showing the start page.
        start_url: Page to reload after each link. Defaults to the
            module-level ``root_url``.
        wait_seconds: Pause after each click so the next page can load.

    Returns:
        A list with one single-entry dict ``{link text: number}`` per link,
        in the order the links appear on the page.

    Raises:
        ValueError: If the count cell of a page contains no number.
    """
    rows_xpath = "//table[@class='fmTbl']/tbody/tr/td/a"
    count_xpath = '//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[1]'
    if start_url is None:
        start_url = root_url

    # Count the links with a local query. The original read a variable that
    # is assigned later in this function, which raises UnboundLocalError.
    total_rows = len(driver.find_elements_by_xpath(rows_xpath))

    item_number_list = []
    for row in range(total_rows):
        # Look the links up again on every pass: the page is reloaded at the
        # end of each pass, so the link is addressed by position.
        link = driver.find_elements_by_xpath(rows_xpath)[row]
        name = link.text
        link.click()
        time.sleep(wait_seconds)

        count_text = driver.find_element_by_xpath(count_xpath).text
        match = re.search(r'\d+', count_text)
        if match is None:
            raise ValueError("no item count found in %r" % (count_text,))
        items_by_categ = {name: int(match.group())}

        driver.get(start_url)
        print(items_by_categ)
        item_number_list.append(items_by_categ)
    return item_number_list

# Visits individual entries of each category's listing: open the category,
# page forward to the entry, click it, then reload the start page.

# Links of the "fmTbl" table; the same query is used on the start page and
# on the category listing. Attribute name in lower case, wildcard restored.
ROWS_XPATH = "//table[@class='fmTbl']/tbody/tr/td/a"
# Presumably the "next page" link of a listing (TODO confirm).
NEXT_PAGE_XPATH = '//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[2]/a[1]'

lis = find_number_of_items(driver)

# NOTE(review): the scraped result is replaced straight away by hard-coded
# counts, presumably a debugging shortcut -- confirm before relying on it.
lis = [7, 25, 13, 17]

# The first category is skipped. enumerate() supplies the link position;
# lis.index(val) would pick the wrong link whenever two counts are equal.
for indx, val in enumerate(lis[1:], 1):
    page1_rows = driver.find_elements_by_xpath(ROWS_XPATH)
    page1_rows[indx].click()
    time.sleep(10)
    #for row in range(val):
    # NOTE(review): fixed debugging range instead of range(val).
    for row in range(15, 25):
        print("row number %s" % row)
        #this would handle the pagination
        # NOTE(review): 9 is used as the page size here, while the earlier
        # script divides by 10 -- confirm the real number of rows per page.
        needs_paging = (row / 9.0) > 1
        if needs_paging:
            print("inside if")
            # Floor division: the script is written for Python 2, where
            # int(math.ceil(row/9)) evaluates to exactly this value.
            click_nxtpage_count = row // 9
            print("nxt page count %s" % click_nxtpage_count)
            for _ in range(click_nxtpage_count):
                #need to change the next page clickable
                #driver.find_elements_by_xpath("//*[contains(text(), 'My Button')]")
                driver.find_element_by_xpath(NEXT_PAGE_XPATH).click()
                time.sleep(10)
            # Kept separate from `row` so the end-of-category check below
            # still compares the absolute entry index.
            row_on_page = row % 9
        else:
            print("inside else")
            row_on_page = row

        page2_rows = driver.find_elements_by_xpath(ROWS_XPATH)
        page2_rows[row_on_page].click()
        time.sleep(10)
        driver.get(root_url)
        # Stop after the last entry of this category.
        if val - 1 == row:
            break
        # Re-open the category listing for the next entry.
        page1_rows = driver.find_elements_by_xpath(ROWS_XPATH)
        page1_rows[indx].click()
        time.sleep(10)
        if not needs_paging:
            print("scraped amount %s" % row)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment