Created
September 21, 2017 09:36
-
-
Save ijharulislam/d93d00744d09aeb90b3471ea7a743949 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
import time | |
import re | |
import csv | |
import math | |
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
# Entry page of the crawl; the other functions navigate back here between
# records.
root_url = "https://eservice.hsa.gov.sg/prism/ct_r/enquiry.do?action=getAllTherapeuticArea"

# Accumulates one dict per scraped record: extract_detail() appends to it and
# export_csv() writes it out.
data = []

# A local Chrome session is used. Alternative kept for reference, driving a
# Selenium server instead:
# driver = webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub', desired_capabilities=DesiredCapabilities.CHROME)
driver = webdriver.Chrome()
driver.get(root_url)
def find_number_of_items(driver):
    """Return the item count read from the page behind each root-page link.

    Every link in the ``fmTbl`` table is clicked in turn, the first integer
    found in the text of a fixed table cell on the resulting page is taken as
    that link's count, and the browser is sent back to ``root_url`` before
    the next link is opened.

    Args:
        driver: Selenium WebDriver that is currently showing ``root_url``.

    Returns:
        A list of ints, one per link, in the order the links appear.

    Raises:
        ValueError: if the count cell of a page contains no digits.
    """
    link_xpath = "//table[@class='fmTbl']/tbody/tr/td/a"
    count_xpath = '//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[1]'

    item_number_list = []
    link_total = len(driver.find_elements_by_xpath(link_xpath))
    print ("page count%s"%link_total)
    for position in range(link_total):
        # The links are looked up again on every pass because the browser
        # has navigated away and back since the previous lookup.
        links = driver.find_elements_by_xpath(link_xpath)
        links[position].click()
        time.sleep(10)
        print ("sleeping")
        count_text = driver.find_element_by_xpath(count_xpath).text
        match = re.search(r'\d+', count_text)
        if match is None:
            # Fail with the offending text instead of an AttributeError on None.
            raise ValueError("no item count found in %r" % count_text)
        driver.get(root_url)
        item_number_list.append(int(match.group()))
    print (item_number_list)
    return item_number_list
# Item count for each link on the root page, in link order. Hard-coded here;
# presumably captured from an earlier run of the call below - re-enable it to
# recompute the counts from the live site.
#lis = find_number_of_items(driver)
lis = [
    7, 25, 13, 3, 17,
    2, 8, 17, 12, 17,
    4, 38, 24, 7, 264,
    16, 4, 0, 5, 5,
    10, 3, 5, 9, 4,
]
def extract_detail(driver):
    """Scrape the fields of the detail page currently shown by ``driver``.

    Each field is read from a fixed XPath location. The resulting dict is
    printed, appended to the module-level ``data`` list, and returned.

    Args:
        driver: Selenium WebDriver positioned on a detail page.

    Returns:
        dict mapping field name to the text found at that field's location.
    """
    container = '//*[@id="page"]/form/table[3]/tbody/tr/td'
    table3 = container + '/table[3]/tbody/'
    table4_row3 = container + '/table[4]/tbody/tr[3]/'

    # (output key, XPath), listed in the order the fields are read.
    field_locations = (
        ("Regulatory Submission Type", table3 + 'tr[1]/td[2]/label'),
        ("Trial Title", table3 + 'tr[2]/td[2]/label'),
        ("Protocol No.", table3 + 'tr[3]/td[2]/label'),
        ("Phase", table3 + 'tr[4]/td[2]/label'),
        ("Therapeutic Area", table3 + 'tr[5]/td[2]'),
        ("Investigational Product", table3 + 'tr[6]/td[2]/label'),
        ("Local Sponsor(s)", table3 + 'tr[7]/td[2]/label[1]'),
        ("Principal Investigator", table4_row3 + 'td[1]'),
        ("Clinical Trial Institution", table4_row3 + 'td[2]'),
        ("Trial Status", table4_row3 + 'td[3]'),
        ("Date of authorization", table4_row3 + 'td[4]'),
    )

    output = {}
    for field_name, location in field_locations:
        output[field_name] = driver.find_element_by_xpath(location).text

    print("Output:", output)
    data.append(output)
    return output
def paginate(page_size=10):
    """Open every item under every root-page link and scrape its detail page.

    ``lis[i]`` is taken as the number of items behind the i-th link of the
    root page. For each item the browser starts from the root page, opens
    the link, clicks ``[next]`` as many times as needed to reach the page
    holding the item, opens the item, calls ``extract_detail(driver)`` and
    returns to ``root_url``.

    The module-level ``driver`` must be showing ``root_url`` when this is
    called.

    Args:
        page_size: number of item links shown per listing page. The default
            of 10 is inferred from the ``row % 10`` arithmetic this function
            used previously - NOTE(review): confirm against the live site.

    Raises:
        ValueError: if ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    link_xpath = "//table[@class='fmTbl']/tbody/tr/td/a"
    next_xpath = ("//*[@id='page']/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[2]"
                  "/a[contains(text(), '[next]')]")

    # enumerate() gives the link position directly; looking it up with
    # lis.index(count) would return the first link that has the same count.
    for area_index, item_count in enumerate(lis):
        print ("in the first level")
        # A link with zero items is never opened, so the browser stays on
        # the root page for the next link.
        for row in range(item_count):
            print ("looping secnd level")
            print ("row number ==>",row)
            print ("value ===>", item_count)

            # Root page -> listing for this link.
            driver.find_elements_by_xpath(link_xpath)[area_index].click()
            time.sleep(10)

            # Listing -> page that holds this row.
            pages_to_skip, row_on_page = divmod(row, page_size)
            if pages_to_skip:
                print ("nxt page count",pages_to_skip)
            for _ in range(pages_to_skip):
                driver.find_element_by_xpath(next_xpath).click()
                time.sleep(10)

            # Listing page -> detail page.
            driver.find_elements_by_xpath(link_xpath)[row_on_page].click()
            time.sleep(10)
            extract_detail(driver)
            print ("this is detail",row)

            # Always finish on the root page so the next lookup starts there.
            driver.get(root_url)
def export_csv(rows=None, path='ExtractedData.csv'):
    """Write scraped records to a CSV file with a header row.

    Args:
        rows: sequence of dicts to write. When omitted, the module-level
            ``data`` list is used.
        path: destination file; it is overwritten if it exists.

    The header is taken from the keys of the first record. A record with a
    key that is not in the header makes ``csv.DictWriter`` raise
    ``ValueError``. When there are no records, nothing is written and no
    file is created.
    """
    records = data if rows is None else rows
    if not records:
        # Without a first record there is no header to derive.
        print("no records to export")
        return

    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows); utf-8 keeps non-ASCII text from failing on narrow default
    # encodings.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        dict_writer = csv.DictWriter(f, fieldnames=list(records[0].keys()))
        dict_writer.writeheader()
        dict_writer.writerows(records)
if __name__ == "__main__":
    # Run the crawl, then always export whatever was collected and close the
    # browser, whether or not the crawl finished.
    try:
        paginate()
    except Exception as e:
        # Top-level boundary: report the failure and fall through so the
        # records gathered before it are still written.
        print(e)
    finally:
        try:
            export_csv()
        finally:
            # Release the browser session even if the export fails.
            driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment