Skip to content

Instantly share code, notes, and snippets.

@ijharulislam
Last active September 27, 2017 04:07
Show Gist options
  • Save ijharulislam/65ba70b2f113336bcf6441c8317b306b to your computer and use it in GitHub Desktop.
import time
import re
import math
import csv
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Remote WebDriver session: assumes a Selenium server/hub is already
# listening on localhost:4444 (started separately, e.g. selenium-server).
driver = webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub', desired_capabilities=DesiredCapabilities.CHROME)
# driver = webdriver.Chrome()
# Accumulates one dict per scraped trial; written to disk by export_csv().
data = []
# HSA PRISM clinical-trial register: landing page listing all therapeutic areas.
root_url = "https://eservice.hsa.gov.sg/prism/ct_r/enquiry.do?action=getAllTherapeuticArea"
driver.get(root_url)
def find_number_of_items(driver):
    """Visit each therapeutic-area link on the root page and collect the
    record count reported on its results page.

    Returns a list of ints, one per area link, in page order.  Navigates
    back to ``root_url`` after every area.
    """
    item_number_list = []
    area_links = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
    print("page count %s" % len(area_links))
    for idx in range(len(area_links)):
        # The links go stale after navigating away, so re-query every pass.
        area_links = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
        area_links[idx].click()
        time.sleep(10)  # crude fixed wait for the results page to render
        print("sleeping")
        count_text = driver.find_element_by_xpath(
            '//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[1]').text
        # Guard against a page with no numeric count: the original called
        # .group() on None and crashed with AttributeError.
        match = re.search(r'\d+', count_text)
        page_count = int(match.group()) if match else 0
        driver.get(root_url)
        item_number_list.append(page_count)
        print(item_number_list)
    return item_number_list
#lis = find_number_of_items(driver)
# Hard-coded per-area record counts captured from a previous run of
# find_number_of_items(), cached here to skip the slow counting pass.
# NOTE(review): these go stale as the register changes — refresh periodically.
lis = [7, 25, 13, 3, 17, 2, 8, 17, 12, 17, 4, 38, 24, 7, 264, 16, 4, 0, 5, 5, 10, 3, 5, 9, 4]
def extract_detail(driver):
    """Scrape one clinical-trial detail page into a dict and append it to
    the module-level ``data`` list.

    Returns the dict.  If any field's element is missing, nothing is
    appended to ``data`` and the partially-filled dict is returned —
    same all-or-nothing behaviour as the original, but the failure is
    now reported instead of silently swallowed by a bare ``except``.
    """
    base = '//*[@id="page"]/form/table[3]/tbody/tr/td/'
    # Field label -> xpath (relative to the detail page's main form table).
    fields = [
        ("Regulatory Submission Type", base + 'table[3]/tbody/tr[1]/td[2]/label'),
        ("Trial Title", base + 'table[3]/tbody/tr[2]/td[2]/label'),
        ("Protocol No.", base + 'table[3]/tbody/tr[3]/td[2]/label'),
        ("Phase", base + 'table[3]/tbody/tr[4]/td[2]/label'),
        ("Therapeutic Area", base + 'table[3]/tbody/tr[5]/td[2]'),
        ("Investigational Product", base + 'table[3]/tbody/tr[6]/td[2]/label'),
        ("Local Sponsor(s)", base + 'table[3]/tbody/tr[7]/td[2]/label[1]'),
        ("Principal Investigator", base + 'table[4]/tbody/tr[3]/td[1]'),
        ("Clinical Trial Institution", base + 'table[4]/tbody/tr[3]/td[2]'),
        ("Trial Status", base + 'table[4]/tbody/tr[3]/td[3]'),
        ("Date of authorization", base + 'table[4]/tbody/tr[3]/td[4]'),
    ]
    output = {}
    try:
        for label, xpath in fields:
            output[label] = driver.find_element_by_xpath(xpath).text
        print("Output:", output)
        data.append(output)
    except Exception as exc:
        # Best-effort: skip this record, but say why instead of hiding it.
        print("extract_detail failed:", exc)
    return output
def paginate():
    # Walks every therapeutic-area link, then every trial row inside that
    # area, calling extract_detail() on each detail page.  Results pages
    # show 9 rows at a time; rows beyond the first page are reached via
    # the [next] link.
    # NOTE(review): the whitespace-stripped original leaves the exact
    # nesting ambiguous; indentation below is reconstructed from the
    # apparent control flow — confirm against a live run.
    for val in lis:
        page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
        # NOTE(review): lis.index(val) returns the FIRST occurrence, so
        # duplicate counts (e.g. the repeated 17s) revisit the same area.
        indx = lis.index(val)
        page1_rows[indx].click()
        time.sleep(10)
        #for row in range(val):
        for row in range(val):
            page2_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
            print ("row number",row)
            #this would handle the pagination
            # NOTE(review): row/9.0 > 1 first holds at row 10, so row 9
            # (the 10th item, first of page 2) is handled by the else
            # branch without paging — likely an off-by-one; confirm.
            if ((row/9.0)>1):
                print ("inside if")
                click_nxtpage_count = int(math.floor(row/9))
                print ("nxt page count",click_nxtpage_count)
                for i in range(click_nxtpage_count):
                    driver.find_element_by_xpath("//*[@id='page']/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[2]/a[contains(text(), '[next]')]").click()
                    time.sleep(10)
                    page2_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
                # Map the absolute row index onto the 9-per-page grid.
                row = row%9
                print("Current Row in page2", row)
                try:
                    page2_rows[row].click()
                except:
                    # Row link missing/stale: restart this area from the root.
                    driver.get(root_url)
                    time.sleep(10)
                    break
                time.sleep(10)
                #extract data here
                extract_detail(driver)
                driver.get(root_url)
                # NOTE(review): row was reduced modulo 9 above, so this
                # early-exit test compares against the wrong index for
                # paged rows — confirm intent.
                if val-1 == row:
                    break
                # Navigate back into the same area for the next row.
                page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
                indx = lis.index(val)
                page1_rows[indx].click()
                time.sleep(10)
            else:
                print ("inside else")
                try:
                    page2_rows[row].click()
                except:
                    driver.get(root_url)
                    time.sleep(10)
                    break
                time.sleep(10)
                # Extracting Detail Data here
                extract_detail(driver)
                driver.get(root_url)
                if val-1 == row:
                    break
                page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
                indx = lis.index(val)
                page1_rows[indx].click()
                time.sleep(10)
        print ("scraped amount",row)
def export_csv():
    """Write every record collected in ``data`` to ExtractedData.csv.

    Does nothing (beyond a message) when no data was collected — the
    original crashed on ``data[0]`` in that case.  ``newline=''`` stops
    the csv module emitting blank rows on Windows.
    """
    if not data:
        print("No data to export")
        return
    with open('ExtractedData.csv', 'w', newline='') as f:
        dict_writer = csv.DictWriter(f, data[0].keys())
        dict_writer.writeheader()
        dict_writer.writerows(data)
if __name__ == "__main__":
    try:
        paginate()
    except Exception as exc:
        # Narrowed from a bare except: report why the run stopped
        # instead of silently discarding the error.
        print("Scraping stopped early:", exc)
    finally:
        # Export whatever was collected, whether or not the run finished.
        export_csv()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment