Last active
May 12, 2017 18:52
-
-
Save mclosson/74690ac4b56fe5956d858e428e2e7b62 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# May 2017 Google SERP (Search Engine Result Page) extraction tool
# Written and tested with Ruby 2.4 but likely works with any Ruby 2.0 or above
#
# Usage:
#
# Customize these settings to your liking:
# user_agent: the Mechanize user-agent alias (e.g. 'Mac Safari') used for each request
# total_pages: total number of pages requested (10 results per page)
# query: the search query to run
#
# $ gem install mechanize
# $ ruby serpslurp.rb > output.txt
require 'uri'
require 'mechanize'

# Anchors whose text exactly matches one of these entries are skipped
# (presumably Google UI links rather than search results).
ignored_titles = ['Website', 'Directions', 'More places', 'Cached', 'Similar']
# Anchors whose href starts with one of these prefixes are skipped.
ignored_links = ['javascript:;', '/search?q=']
baseurl = 'https://www.google.com/search'

# User-adjustable settings.
user_agent = 'Mac Safari' # Mechanize user-agent alias, assigned to user_agent_alias below
total_pages = 10          # number of result pages to request
query = 'dogs'            # search terms; form-encoded when the URL is built

agent = Mechanize.new
agent.user_agent_alias = user_agent

total_pages.times do |page_number|
  # Each page is requested with a start offset that advances by 10.
  offset = page_number * 10
  # encode_www_form escapes spaces, '&', '#', etc., so arbitrary queries
  # produce a well-formed URL instead of a truncated or corrupted one.
  url = "#{baseurl}?#{URI.encode_www_form(q: query, start: offset)}"
  page = agent.get(url)

  page
    .css('#rso a')
    .reject { |element| ignored_titles.include?(element.text) }
    .reject { |element| element.text.to_s.strip.empty? }
    .reject { |element| element['href'].nil? }
    .reject { |element| ignored_links.any? { |prefix| element['href'].start_with?(prefix) } }
    .each_with_index do |element, index|
      # index counts only the anchors kept on this page, so the ranking is
      # the page offset plus the 1-based position among the kept anchors.
      serp_ranking = offset + index + 1
      puts "#{serp_ranking}: #{element.text} - #{element['href']}"
    end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment