Skip to content

Instantly share code, notes, and snippets.

@xhiroga
Created April 15, 2017 08:20
Show Gist options
  • Save xhiroga/2fedada5202c1cff56f8d1d7d2fe73cd to your computer and use it in GitHub Desktop.
Save xhiroga/2fedada5202c1cff56f8d1d7d2fe73cd to your computer and use it in GitHub Desktop.
Use Selenium to click a button repeatedly and acquire the full web page.
# -*- coding: utf-8 -*-
import scrapy
class B2016Spider(scrapy.Spider):
    """Spider that prints the title and summary of every project card.

    The downloader middleware ``nasa.selenium_middleware.SeleniumMiddleware``
    is registered through ``custom_settings``, so ``parse`` receives whatever
    response that middleware produces for the start URL.
    """

    name = "b2016"
    # Scrapy expects bare domain names here, not URLs with a scheme and path.
    allowed_domains = ["2016.spaceappschallenge.org"]
    start_urls = ['http://2016.spaceappschallenge.org/challenges/earth/sea-ice-app/projects/']

    # Author's note: it seems to be enough to register a class that defines
    # ``process_request`` and returns a response.
    # Author's note: selenium_middleware.py apparently belongs directly under
    # the ``nasa`` project package.
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "nasa.selenium_middleware.SeleniumMiddleware": 0,
        },
        "DOWNLOAD_DELAY": 0.5,
    }

    def parse(self, response):
        """Print debugging output followed by each card's title and summary.

        Args:
            response: The response delivered for a URL in ``start_urls``.

        Returns:
            None. Everything is written to standard output.
        """
        print("****** this is response ******")
        print(response)
        print("body")
        print(response.css("body"))

        # Author's note: values are pulled out with CSS selectors. The text
        # enclosed by a tag is selected with the ``::text`` pseudo-element,
        # which is easy to miss the first time.
        # In a selector, "tag.class" matches a class on that tag, while
        # "tag tag" (with a space) matches a descendant; the space matters.
        cards = response.css("div.containers---columnarThird---3EFIy")
        for count, card in enumerate(cards, start=1):
            # default="" keeps the string concatenation below from raising
            # TypeError when a card has no matching title or summary node.
            title = card.css(
                "div.card---cardHeaderTitle---3q2a1::text"
            ).extract_first(default="")
            summary = card.css(
                "div.typography---summary---3nRFy span::text"
            ).extract_first(default="")
            print("Count: " + str(count))
            print("Title: " + title)
            print("Summary: " + summary)
# -*- coding: utf-8 -*-
import os.path
import urllib.parse
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.keys import Keys as keys
from selenium.common.exceptions import NoSuchElementException # インポートしないと例外のキャッチができない
import time
# Module-level WebDriver instance, created when this module is imported.
# It is used by SeleniumMiddleware.process_request and by close_driver.
driver = webdriver.PhantomJS()
class SeleniumMiddleware(object):
    """Downloader middleware that renders pages with the module-level driver."""

    def process_request(self, request, spider):
        """Load ``request.url`` in the browser and return the expanded page.

        After the initial load, ENTER is sent to the button matching
        ``button.ui---chunkyButton---CHoDI`` over and over until the button
        can no longer be found. The resulting page source is written to
        ``log.html`` for inspection and returned as the response.

        Args:
            request: The request whose ``url`` is opened in the browser.
            spider: The spider issuing the request (unused).

        Returns:
            An ``HtmlResponse`` built from the driver's current URL and page
            source, bound to ``request``.
        """
        button_selector = "button.ui---chunkyButton---CHoDI"

        driver.get(request.url)
        print("Waiting 1st Load 5 seconds")
        time.sleep(5)

        # Keep pressing the button until it disappears from the page.
        while True:
            try:
                driver.find_element_by_css_selector(button_selector).send_keys(keys.ENTER)
            except NoSuchElementException:
                print("Load finish!")
                break
            print("Click and wait 3.5 second")
            time.sleep(3.5)

        # Read the source once so the logged copy and the response body are
        # guaranteed to be the same document.
        page_source = driver.page_source

        # Explicit UTF-8 keeps the dump from failing on non-ASCII content
        # under a non-UTF-8 locale; ``with`` closes the file even on error.
        with open('log.html', 'w', encoding='utf-8') as log_file:
            log_file.write(page_source)

        return HtmlResponse(
            driver.current_url,
            body=page_source,
            encoding='utf-8',
            request=request,
        )
def close_driver():
    """Shut down the module-level WebDriver session.

    ``quit()`` is used instead of ``close()``: ``close()`` only closes the
    current window, whereas ``quit()`` ends the session and stops the driver
    process too, so no browser process is left behind.
    """
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment