from selenium import webdriver
import time
import re
import math
import csv
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Connect to a running Selenium server; switch to the local Chrome driver below if preferred.
driver = webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub',
                          desired_capabilities=DesiredCapabilities.CHROME)
# driver = webdriver.Chrome()

data = []
root_url = "https://eservice.hsa.gov.sg/prism/ct_r/enquiry.do?action=getAllTherapeuticArea"
driver.get(root_url)


def find_number_of_items(driver):
    # Visit each therapeutic area link once and record how many trials it lists.
    item_number_list = []
    page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
    print("page count %s" % len(page1_rows))
    for row in range(len(page1_rows)):
        # Re-query the links: the old element references go stale after navigating away.
        page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
        name = page1_rows[row].text
        page1_rows[row].click()
        time.sleep(10)
        print("sleeping")
        page_count = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[1]').text
        page_count = int(re.search(r'\d+', page_count).group())
        driver.get(root_url)
        item_number_list.append(page_count)
    print(item_number_list)
    return item_number_list


# lis = find_number_of_items(driver)
lis = [7, 25, 13, 3, 17, 2, 8, 17, 12, 17, 4, 38, 24, 7, 264, 16, 4, 0, 5, 5, 10, 3, 5, 9, 4]
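# The counts above are presumably the output of an earlier
# find_number_of_items(driver) run (one entry per therapeutic area);
# hard-coding them skips that slow enumeration step on re-runs.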


def extract_detail(driver):
    # Scrape one trial detail page into a dict and append it to the global data list.
    output = {}
    try:
        output["Regulatory Submission Type"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody/tr[1]/td[2]/label').text
        output["Trial Title"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody/tr[2]/td[2]/label').text
        output["Protocol No."] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody/tr[3]/td[2]/label').text
        output["Phase"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody/tr[4]/td[2]/label').text
        output["Therapeutic Area"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody/tr[5]/td[2]').text
        output["Investigational Product"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody/tr[6]/td[2]/label').text
        output["Local Sponsor(s)"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[3]/tbody/tr[7]/td[2]/label[1]').text
        output["Principal Investigator"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr[3]/td[1]').text
        output["Clinical Trial Institution"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr[3]/td[2]').text
        output["Trial Status"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr[3]/td[3]').text
        output["Date of authorization"] = driver.find_element_by_xpath('//*[@id="page"]/form/table[3]/tbody/tr/td/table[4]/tbody/tr[3]/td[4]').text
        print("Output:", output)
        data.append(output)
    except Exception:
        # If any field is missing on the page, the whole record is skipped.
        pass
    return output


def paginate():
    for indx, val in enumerate(lis):
        # enumerate gives the position directly; the original lis.index(val)
        # picks the wrong therapeutic area whenever two areas share a count.
        driver.get(root_url)
        page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
        page1_rows[indx].click()
        time.sleep(10)
        try:
            for row in range(val):
                page2_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
                print("row number", row)
                # This handles the pagination of the result list.
                if (row / 9.0) > 1:
                    print("inside if")
                    # Rows beyond the first result page: click '[next]' enough
                    # times first (the divisor 9 reflects the assumed page size).
                    click_nxtpage_count = int(math.floor(row / 9))
                    print("nxt page count", click_nxtpage_count)
                    for i in range(click_nxtpage_count):
                        driver.find_element_by_xpath("//*[@id='page']/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[2]/a[contains(text(), '[next]')]").click()
                        time.sleep(10)
                    page2_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
                    # Index of this row on the current page; keep `row` itself
                    # unchanged so the break check below stays correct.
                    page_idx = row % 9
                    page2_rows[page_idx].click()
                    time.sleep(10)
                    # Extract data here
                    extract_detail(driver)
                    driver.get(root_url)
                    if val - 1 == row:
                        break
                    page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
                    page1_rows[indx].click()
                    time.sleep(10)
                else:
                    print("inside else")
                    page2_rows[row].click()
                    time.sleep(10)
                    # Extracting detail data here
                    extract_detail(driver)
                    driver.get(root_url)
                    if val - 1 == row:
                        break
                    page1_rows = driver.find_elements_by_xpath("//table[@class='fmTbl']/tbody/tr/td/a")
                    page1_rows[indx].click()
                    time.sleep(10)
            print("scraped amount", row)
        except Exception as e:
            print(e)
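

# Sketch of an alternative stopping rule (not part of the original script):
# instead of relying on the hard-coded counts in `lis`, the end of a result
# list could be detected by checking whether the '[next]' link is still shown.
# The XPath is copied from paginate() and is an assumption about the page layout.
def has_next_page(driver):
    next_links = driver.find_elements_by_xpath(
        "//*[@id='page']/form/table[3]/tbody/tr/td/table[4]/tbody/tr/td[2]/a[contains(text(), '[next]')]")
    return len(next_links) > 0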


def export_csv():
    # newline='' keeps csv from writing blank rows on Windows.
    with open('ExtractedData.csv', 'w', newline='') as f:
        dict_writer = csv.DictWriter(f, data[0].keys())
        dict_writer.writeheader()
        dict_writer.writerows(data)


if __name__ == "__main__":
    try:
        paginate()
        export_csv()
    except Exception as e:
        print(e)
        # Still write out whatever was collected before the failure.
        export_csv()