# Source: GitHub gist kizernis/726ae7054f92a6e1885db7c57b6acd0d by @kizernis (last active May 22, 2019).
# Import data from a dynamic page and save it to the Excel file
import os

import xlsxwriter
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import ElementNotVisibleException
# Change to False to see the Chrome window
browser_is_headless = True
output_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'output.xlsx')

# Every result row must yield this many "ng-binding" cells: one per sheet
# column A..K (1 registry column + 5 bride columns + 5 groom columns).
EXPECTED_CELLS_PER_ROW = 11


def _load_full_page_source():
    """Open the listing in Chrome, expand it completely and return its HTML.

    The "see more" button is clicked until the driver refuses the click
    because the button is no longer visible. The browser is always shut
    down, even when loading or clicking fails, so no Chrome process is
    left behind.
    """
    o = webdriver.ChromeOptions()
    o.add_argument('--log-level=3')
    o.add_argument('--disable-infobars')
    if browser_is_headless:
        o.add_argument('--headless')
        o.add_argument('--disable-gpu')
    # The website works only in Israel (http://www.gatherproxy.com/proxylist/country/?c=Israel)
    # o.add_argument('--proxy-server=192.116.49.15:80')
    driver = webdriver.Chrome(options=o)
    try:
        driver.get('https://shirathayam.m-datit.org.il/PirsumNisuin')
        button_more = WebDriverWait(driver, 60).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//a[@ng-click="seeMore()" and @class="btn-blue"]')
            )
        )
        if not browser_is_headless:
            driver.switch_to.window(driver.current_window_handle)
        while True:
            try:
                button_more.click()
            except (ElementNotVisibleException, ElementNotInteractableException):
                # Legacy drivers report a hidden button as "not visible",
                # W3C-mode drivers as "not interactable"; either one means
                # there is nothing more to load.
                break
        return driver.page_source
    finally:
        driver.quit()


def _extract_rows(soup):
    """Return the table contents as a list of rows, each a list of cell strings.

    Rows are validated before anything is written, so malformed markup
    aborts the run without producing a partial workbook.

    Raises:
        ValueError: if a row does not contain EXPECTED_CELLS_PER_ROW cells.
    """
    rows = []
    for row in soup.find_all('div', class_='row table-body ng-scope'):
        cells = row.find_all(class_='ng-binding')
        if len(cells) != EXPECTED_CELLS_PER_ROW:
            raise ValueError(
                'Expected %d cells in a table row, found %d'
                % (EXPECTED_CELLS_PER_ROW, len(cells))
            )
        # The sheet columns run in the opposite order to the cells in the
        # markup (presumably because the page is right-to-left - confirm).
        texts = []
        for cell in reversed(cells):
            text = cell.text
            if ':' in text:
                # Drop the "label:" prefix; split once so that a value which
                # itself contains a colon is kept whole.
                text = text.split(':', 1)[1].strip()
            texts.append(text)
        rows.append(texts)
    return rows


def _write_workbook(rows):
    """Write the two header rows and the data rows to output_file_path."""
    # The context manager closes (and thereby saves) the workbook.
    with xlsxwriter.Workbook(output_file_path) as workbook:
        worksheet = workbook.add_worksheet()
        # Write headers
        header_format = workbook.add_format({'bold': True, 'border': True, 'align': 'center', 'valign': 'vcenter', 'fg_color': '#F2F2F2'})
        worksheet.merge_range('B1:F1', 'פרטי הכלה', header_format)  # bride details
        worksheet.merge_range('G1:K1', 'פרטי החתן', header_format)  # groom details
        worksheet.write('A1', 'לשכת רישום נישואין', header_format)  # marriage registry office
        second_row_headers = (
            '',
            'ישוב',  # locality
            'ארץ לידה',  # country of birth
            'שם האב',  # father's name
            'מצב אישי',  # marital status
            '',
            'ישוב',
            'ארץ לidה'.replace('id', 'יד'),
            'שם האב',
            'מצב אישי',
            '',
        )
        for column_index, label in enumerate(second_row_headers):
            worksheet.write(1, column_index, label, header_format)
        # Write contents (zero-based row 2 is the first row below the headers)
        for row_index, texts in enumerate(rows, start=2):
            for column_index, text in enumerate(texts):
                worksheet.write(row_index, column_index, text)


def _autofit_columns():
    """Auto-fit the column widths of the saved workbook through Excel (COM)."""
    from win32com.client import Dispatch
    excel = Dispatch('Excel.Application')
    wb = excel.Workbooks.Open(output_file_path)
    try:
        wb.ActiveSheet.Columns.AutoFit()
        wb.Save()
    finally:
        # On success everything is already saved; on failure discard the
        # changes instead of letting a hidden Excel window prompt for them.
        wb.Close(False)
        # Dispatch may have attached to an Excel instance that was already
        # running, so only quit when no other workbook is open in it.
        if excel.Workbooks.Count == 0:
            excel.Quit()


soup = BeautifulSoup(_load_full_page_source(), 'lxml')
_write_workbook(_extract_rows(soup))
# Adjust columns width (works under Windows only)
if os.name == 'nt':
    _autofit_columns()
# (GitHub page footer, not part of the script: "Sign up for free to join this conversation on GitHub.")