Created
April 15, 2017 08:20
-
-
Save xhiroga/2fedada5202c1cff56f8d1d7d2fe73cd to your computer and use it in GitHub Desktop.
Use Selenium to click the button and acquire the full website.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
class B2016Spider(scrapy.Spider):
    """Spider that prints the project cards found on the start page.

    Requests go through ``nasa.selenium_middleware.SeleniumMiddleware``
    (registered in ``custom_settings``), and ``parse`` prints the title and
    summary of every project card in the response it receives.
    """

    name = "b2016"
    # Scrapy expects bare domain names here, not URLs.
    allowed_domains = ["2016.spaceappschallenge.org"]
    start_urls = ['http://2016.spaceappschallenge.org/challenges/earth/sea-ice-app/projects/']

    # Apparently it is enough to register a class that exposes a method named
    # process_request and returns a response.
    # selenium_middleware.py sits directly under the nasa project package.
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "nasa.selenium_middleware.SeleniumMiddleware": 0,
        },
        "DOWNLOAD_DELAY": 0.5,
    }

    def parse(self, response):
        """Print a debug dump of the response, then each card's title and summary.

        Args:
            response: The response handed to the spider for a start URL.
        """
        print("****** this is response ******")
        print(response)
        print("body")
        print(response.css("body"))

        # Values are pulled out with CSS selectors. Text enclosed by a tag is
        # selected with the ``::text`` pseudo-element rather than an attribute.
        # Selector syntax: "tag.class" matches a class, and a space between
        # two parts ("tag tag") means "descendant of" -- the space matters.
        cards = response.css("div.containers---columnarThird---3EFIy")
        for count, card in enumerate(cards, start=1):
            print("Count: " + str(count))
            # default="" keeps the concatenation from raising TypeError when
            # a card has no matching title or summary node.
            print("Title: " + card.css(
                "div.card---cardHeaderTitle---3q2a1::text").extract_first(default=""))
            print("Summary: " + card.css(
                "div.typography---summary---3nRFy span::text").extract_first(default=""))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os.path | |
import urllib.parse | |
from scrapy.http import HtmlResponse | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys as keys | |
from selenium.common.exceptions import NoSuchElementException # インポートしないと例外のキャッチができない | |
import time | |
# A single PhantomJS browser is created when this module is imported and is
# reused for every request handled by SeleniumMiddleware.
driver = webdriver.PhantomJS()


class SeleniumMiddleware(object):
    """Downloader middleware that fetches pages through the shared browser."""

    def process_request(self, request, spider):
        """Load the request URL in the browser and return the expanded page.

        After an initial wait, the button matching
        ``button.ui---chunkyButton---CHoDI`` is pressed repeatedly until it
        can no longer be found. The resulting page source is written to
        ``log.html`` and returned as the response.

        Args:
            request: The request whose ``url`` is loaded in the browser.
            spider: Unused.

        Returns:
            An ``HtmlResponse`` built from the browser's current URL and
            page source, encoded as UTF-8.
        """
        driver.get(request.url)
        print("Waiting 1st Load 5 seconds")
        time.sleep(5)

        # Keep pressing the button until it disappears from the page.
        while True:
            try:
                button = driver.find_element_by_css_selector(
                    "button.ui---chunkyButton---CHoDI")
            except NoSuchElementException:
                print("Load finish!")
                break
            button.send_keys(keys.ENTER)
            print("Click and wait 3.5 second")
            time.sleep(3.5)

        # Read the source once so the log file and the response are identical.
        page_source = driver.page_source

        # Explicit UTF-8 matches the encoding declared on the response and
        # avoids platform-dependent encode errors; ``with`` closes the file
        # even if the write fails.
        with open('log.html', 'w', encoding='utf-8') as log_file:
            log_file.write(page_source)

        return HtmlResponse(driver.current_url,
                            body=page_source,
                            encoding='utf-8',
                            request=request)
def close_driver():
    """Shut down the shared module-level browser.

    ``quit()`` is used instead of ``close()``: ``close()`` only closes the
    current window, while ``quit()`` ends the WebDriver session so the
    browser process does not linger.

    NOTE(review): nothing in the visible code calls this function; confirm
    that it is invoked when the crawl finishes.
    """
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment