Created
May 30, 2014 08:19
-
-
Save makotoworld/0f296d5ad3724fdb1d98 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'nokogiri'
# URL of the page where scraping starts.
url = 'http://ishidatozanjukunisshi.blogspot.jp'
# Alternative start URL (an older search-results page), apparently kept for manual testing:
#url = 'http://ishidatozanjukunisshi.blogspot.jp/search?updated-max=2010-06-20T21:17:00%2B09:00&max-results=7&start=985&by-date=false'

# Encoding argument handed to the parsing helpers. Nothing at top level
# reassigns it, so it is nil whenever it is passed along.
charset = nil
# Downloads the document at +url+ and returns its body.
#
# The request goes through URI#open (provided by open-uri) instead of
# Kernel#open: opening URLs via Kernel#open was deprecated in Ruby 2.7 and
# removed in Ruby 3.0, and Kernel#open additionally treats a string starting
# with "|" as a command to execute.
#
# @param url [String] absolute URL of the page to fetch
# @return [String] the response body as read from the connection
# @raise [URI::InvalidURIError] if +url+ cannot be parsed as a URI
def url_set(url)
  URI.parse(url).open do |f|
    f.read # read the whole HTML document; this is the method's return value
  end
end
# Fetches the page at +url+ and prints every blog post found on it.
#
# For each element matching `.post-outer` this prints, in order: a separator
# line, the post title, the permalink (href of `.timestamp-link`), the
# timestamp (title of the `abbr` inside that link) and the post body text.
#
# @param url [String] URL of the listing page to fetch
# @param charset [String, nil] encoding passed as the third argument to
#   Nokogiri::HTML.parse
def content_output(url, charset)
  doc = Nokogiri::HTML.parse(url_set(url), nil, charset)
  doc.css('.post-outer').each do |post|
    # at_css returns nil when nothing matches, so a post that lacks the
    # timestamp link prints an empty line instead of raising NoMethodError
    # and aborting the whole crawl.
    permalink = post.at_css('.timestamp-link')
    timestamp = post.at_css('.timestamp-link > abbr')

    puts "---------------------"
    puts post.css('.post-title a').text
    puts permalink && permalink['href']
    puts timestamp && timestamp['title']
    puts post.css('.post-body').text
  end
end
# Finds the "older posts" pager link on the page at +url+.
#
# @param url [String] URL of the listing page to inspect
# @param charset [String, nil] encoding passed as the third argument to
#   Nokogiri::HTML.parse
# @return [String] the href of the older-posts link when the page contains
#   exactly one such link, otherwise an empty string
def content_check(url, charset)
  page = Nokogiri::HTML.parse(url_set(url), nil, charset)
  older_links = page.css('.blog-pager > span > .blog-pager-older-link')
  # Anything other than exactly one match is treated as "no next page".
  return "" unless older_links.size == 1

  older_links.attribute('href').value
end
# Single-page debugging calls, left disabled:
#content_output(url, charset)
#content_check(url, charset)
#exit

# Crawl loop: print the current page's posts, then move on to the page the
# older-posts link points at. content_check returns "" when there is no such
# link, which ends the loop. Both helpers call url_set themselves, so every
# page is downloaded twice per iteration.
until url == ""
  puts url
  content_output(url, charset)
  url = content_check(url, charset)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment