Last active
May 21, 2018 08:04
-
-
Save tsaito-cyber/467d9fa92f6b36e1c6bd to your computer and use it in GitHub Desktop.
google-crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
require 'nokogiri'
require 'open-uri'
require 'uri'
# Identifier handed to the IP-rotation script (placeholder value; presumably
# the EC2 instance id -- confirm before use).
INSTANCE_ID = 'xxxxxx'

## If Google starts blocking requests, the source IP address has to change.
## This can be done with an "Elastic IP address" provided by EC2.
#
# Runs the IP-rotation helper script with the instance id as its only
# argument and returns whatever the script wrote to stdout.
#
# The previous version referenced a misspelled, out-of-scope local variable
# and therefore raised NameError on every call.
#
# @param instance_id [#to_s] id passed to the script (defaults to INSTANCE_ID)
# @param script [String] path of the executable to run
# @return [String] captured standard output of the script
# @raise [Errno::ENOENT] when the script cannot be found
def change_ip(instance_id = INSTANCE_ID, script: './change-ip.sh')
  # The argv (array) form bypasses the shell, so the id is never interpreted
  # as shell syntax; stdout is captured rather than leaked into our output.
  IO.popen([script, instance_id.to_s], &:read)
end
# Flush both streams on every write so result rows and progress/error
# messages appear immediately, even when the output is piped or redirected.
$stdout.sync = true
$stderr.sync = true
class Array
  # Truncates every element to the length of the shortest element, so that an
  # array of uneven columns can be passed to #transpose without raising.
  #
  # @return [Array] new array whose elements all have the same size
  def align_column
    shortest = map(&:size).min
    map { |column| column[0, shortest] }
  end
end
# Tells whether +url+ contains something that looks like an absolute URI.
#
# The match is unanchored (as before), so a URI anywhere in the string
# counts. Uses the parser's regexp directly instead of the obsolete
# URI.regexp and returns a strict boolean instead of a match index.
#
# @param url [String, nil] candidate string; nil is treated as invalid
# @return [Boolean]
def valid_url?(url)
  URI::DEFAULT_PARSER.make_regexp.match?(url)
end
# Extracts the target of every search hit in +doc+.
#
# For each `.r` element the first anchor's href is taken and the value of its
# first query parameter is returned (still percent-encoded).
#
# Entries without an anchor, an href, or a query string yield nil instead of
# raising, so one odd result block no longer aborts the whole page and the
# returned list stays index-aligned with the `.r` elements.
#
# @param doc [#css] parsed document (e.g. a Nokogiri document)
# @return [Array<String, nil>] one entry per `.r` element
def link(doc)
  doc.css('.r').map do |result|
    anchor = result.css('a')[0]
    href = anchor && anchor[:href]
    next unless href

    # Split only on the first "?" / "=" so characters inside the value
    # do not truncate it.
    query = href.split('?', 2)[1]
    next unless query

    query.split('&')[0].to_s.split('=', 2)[1]
  end
end
# Collects the text of every node matching +elem+, with newlines removed so
# each value fits on a single output line.
#
# @param doc [#css] parsed document (e.g. a Nokogiri document)
# @param elem [String] CSS selector
# @return [Array<String>] one string per matching node
def text(doc, elem)
  doc.css(elem).map { |node| node.text.delete("\n") }
end
# Decodes the form-encoded target URL taken from a result link, falling back
# to the raw value when it contains a malformed percent escape.
def decode_result_url(url)
  URI.decode_www_form_component(url)
rescue ArgumentError
  url
end

# Fetches Google result pages for +keyword+ and yields one row per hit.
#
# Requests n / 10 pages (10 hits per page). Each page is attempted until it
# succeeds or +max_failures+ attempts have failed; an HTTP error triggers
# change_ip before the next attempt, any other StandardError is printed to
# stderr. After a successful page +delay+ is called once.
#
# Changes from the previous version:
# * URI.escape / URI.unescape and Kernel#open on URLs no longer exist in
#   Ruby 3; the form-component helpers and URI.open are used instead, and the
#   keyword is now fully encoded (including "&", "=" and "+").
# * the HTTP response handle is closed as soon as the page is parsed.
# * rows are assembled first and filtered afterwards, so dropping an invalid
#   link no longer shifts the remaining links against their titles/snippets.
#
# @param keyword [String] search phrase
# @param n [Integer] number of hits wanted; values below 10 fetch nothing
# @param delay [#call] invoked after every successfully processed page
# @param max_failures [Integer] attempts per page before giving up
# @yieldparam row [Array] [keyword, rank, url, title, snippet]
def crawling(keyword, n, delay, max_failures: 3, &block)
  (n / 10).times do |page|
    count_of_failure = 0
    # NOTE(review): the rank restarts at 1 on every page and is not reset
    # between retries of the same page (unchanged behavior) -- confirm.
    counter = 1
    while count_of_failure < max_failures
      begin
        query = 'https://www.google.co.jp/search?q=' \
                "#{URI.encode_www_form_component(keyword)}&oe=UTF-8&ie=UTF-8&hl=ja&start=#{page * 10}"
        doc = URI.open(query) { |io| Nokogiri::HTML.parse(io) }
        rows = [link(doc), text(doc, '.r'), text(doc, 'span.st')].align_column.transpose
        rows.each do |url, title, snippet|
          next unless valid_url?(url)

          block.call([keyword, counter, decode_result_url(url), title, snippet])
          counter += 1
        end
        delay.call # delay interval
        break
      rescue OpenURI::HTTPError
        # change ip address, and resume
        change_ip
        count_of_failure += 1
      rescue StandardError => err
        $stderr.puts err
        count_of_failure += 1
      end
    end
  end
end
# Entry point: reads one keyword per line from the file named by the first
# command-line argument, crawls 20 hits for each keyword and prints every hit
# to stdout as one line of quoted, comma-separated values.
raise "ERROR: keywords file must be specified" if ARGV.empty?

# Blank lines (and stray "\r" from CRLF files) would otherwise be searched
# as empty keywords.
keywords = File.read(ARGV[0]).split("\n").map(&:strip).reject(&:empty?)
keywords.each do |keyword|
  crawling(keyword, 20, -> { sleep(rand(2) + 3) }) do |xs|
    # Embedded double quotes are doubled so a quote inside a title or snippet
    # cannot break the field boundaries.
    puts xs.map { |x| %("#{x.to_s.gsub('"', '""')}") }.join(',')
  end
  # Pause 10-19 seconds between keywords; the chosen pause is logged to stderr.
  time = rand(10) + 10
  $stderr.puts time
  sleep(time)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment