Last active
July 21, 2023 12:12
-
-
Save yumu19/26f596e4a1b5d7cd71ca812f7227ba8f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# 2020.08.17 Tsubasa Yumura  created
# 2023.07.21 Tsubasa Yumura  updated
#
# [What is this?]
# A script for the EC2020 organizing committee (publication chair) that:
# - scrapes author information from the paper submission form and saves it as TSV
# - captures screenshots of the copyright agreement
#
# [Preparation]
# $ pip install selenium
# Download ChromeDriver from the link below and place it in the same
# directory as this script:
# https://sites.google.com/a/chromium.org/chromedriver/home
#
# [Usage]
# $ python3 ./ec_pub_scraper.py
#
# References
# https://tanuhack.com/selenium/
# https://qiita.com/shota-nekoneko/items/64bbd0c2f534d20e7b77
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.support.select import Select | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.common.alert import Alert | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.common.exceptions import TimeoutException | |
import time | |
import os | |
import re | |
import csv | |
# ---------------------------------------------------------------------------
# Browser configuration
# ---------------------------------------------------------------------------
options = Options()
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--proxy-server="direct://"')
options.add_argument('--proxy-bypass-list=*')
options.add_argument('--start-maximized')

# Credentials for the submission system (placeholders; fill in before running).
my_email = '[email protected]'
my_password = 'XXXXXXXXXXXXXXX'

DRIVER_PATH = './chromedriver'
SCREENSHOT_DIR = './screenshots'
OUTPUT_FILE = './output.tsv'
NUMBER = 54  # number of submissions

os.makedirs(SCREENSHOT_DIR, exist_ok=True)

# NOTE(review): the original script saved the screenshot to `sspath` without
# ever defining it (NameError). NUMBER and `writer` are also unused here, so a
# per-submission scraping loop appears to be missing from this file. Until it
# is restored, the single screenshot taken below goes to a fixed path.
sspath = os.path.join(SCREENSHOT_DIR, 'screenshot.png')

# newline='' is what the csv module expects for files it writes to; the
# `with` block guarantees the TSV is closed even if the browser steps fail.
with open(OUTPUT_FILE, 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')

    # `options=` replaces the deprecated `chrome_options=` keyword.
    # NOTE(review): `executable_path` is kept as in the original; newer
    # Selenium 4 releases expect a Service object instead -- confirm against
    # the installed Selenium version.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
    try:
        url = 'https://ec2020.entcomp.org/sys/users/login'
        driver.get(url)

        # Log in. find_element(By..., ...) is used because the
        # find_element_by_* helpers were removed in Selenium 4.3.
        email = driver.find_element(By.ID, 'UserMailaddress')
        email.send_keys(my_email)
        password = driver.find_element(By.ID, 'UserPassword')
        password.send_keys(my_password)
        time.sleep(1)

        login_selector = 'div:nth-child(6) > div > div > input'
        login = driver.find_element(By.CSS_SELECTOR, login_selector)
        login.click()
        time.sleep(1)

        # Zoom the page out to 50%, then resize the window to the document's
        # scroll height (presumably so the screenshot captures the whole page).
        driver.execute_script("document.body.style.zoom='50%'")
        h = driver.execute_script('return document.body.scrollHeight')
        driver.set_window_size(1024, h)

        driver.save_screenshot(sspath)
        print("Save screenshot: " + sspath)
    finally:
        # Always shut the browser down, even when a step above raises.
        driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment