-
-
Save blahutka/9060572 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mixin that drives a headless PhantomJS browser through Capybara's
# Poltergeist driver.
#
# Including classes get `new_session` to start a browser session, `session`
# to reach the current one, and `html` to read the rendered page source.
module CapybaraWithPhantomJs
  # Capybara::DSL is the mixin that provides `visit`, `find`, etc.
  # Those DSL methods are dispatched through `page`, which is overridden
  # below so that they act on this object's own PhantomJS session.
  include Capybara::DSL

  # Create a new PhantomJS session in Capybara.
  #
  # Side effects: (re-)registers the :poltergeist driver and switches
  # Capybara's global default selector to XPath.
  #
  # @return [Capybara::Session] the freshly created session
  def new_session
    # Register PhantomJS (aka poltergeist) as the driver to use
    Capybara.register_driver :poltergeist do |app|
      Capybara::Poltergeist::Driver.new(app)
    end

    # Use XPath as the default selector for the find method
    Capybara.default_selector = :xpath

    # Build a dedicated session backed by the Poltergeist driver
    @session = Capybara::Session.new(:poltergeist)

    # Report using a particular user agent
    @session.driver.headers = {
      'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X)'
    }

    @session
  end

  # The current PhantomJS session, created on first use.
  #
  # @return [Capybara::Session]
  def session
    @session || new_session
  end

  # Point the Capybara DSL at this object's session instead of the global
  # `Capybara.current_session`, which would use the default driver.
  #
  # @return [Capybara::Session]
  def page
    session
  end

  # Returns the HTML of the current session's page.
  #
  # @return [String]
  def html
    session.html
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Add the mixin | |
require 'capybara_with_phantom_js' | |
# Google+ Scraper
#
# === Example
#
#   g_plus = GooglePlusScraper.new(111044299943603359137)
#   data = g_plus.to_h
#   # => { id: 111044299943603359137, in_circles: 1234, timestamp: 123456789 }
#
class GooglePlusScraper
  include CapybaraWithPhantomJs

  # Seconds to wait after navigation so PhantomJS can run the page's
  # JavaScript and fill the page in before the HTML is captured.
  RENDER_WAIT_SECONDS = 5

  # @param profile_id [Integer, String] numeric Google+ profile id; it is
  #   interpolated into the profile URL as-is
  def initialize(profile_id)
    @profile_id = profile_id
  end

  # Summarise the scraped profile.
  #
  # @return [Hash] with keys :id (the profile id given to the constructor),
  #   :in_circles (Integer) and :timestamp (Integer epoch seconds for
  #   midnight UTC of today's date)
  def to_h
    {
      :id => @profile_id,
      :in_circles => in_circles,
      :timestamp => midnight_utc_timestamp
    }
  end

  # Return the circle count as an integer.
  #
  # @return [Integer] 0 when the page has no matching text or the matching
  #   text contains no digits
  def in_circles
    texts = tp_tx_hp
    return 0 if texts.nil?

    line = texts.find { |text| text.include?('have them in circles') }
    return 0 if line.nil?

    # Drop thousands separators, then take the first run of digits. `to_i`
    # always parses base 10 (a leading zero is not read as octal) and
    # yields 0 when there are no digits at all.
    line.delete(',')[/\d+/].to_i
  end

  # Return the text of the spans found inside the matching H3 tags.
  #
  # @return [Array<String>, nil] nil when nothing matched
  def tp_tx_hp
    texts = google_plus_page.search('//h3[@class="TP tx hp"]/span').map(&:text)
    texts.empty? ? nil : texts
  end

  # Get the Google Plus page and cache the parsed document in an instance
  # variable, so the browser is only driven once per scraper.
  #
  # @return [Nokogiri::HTML::Document]
  def google_plus_page
    @google_plus_page ||= begin
      # Navigate and read through the very session that was just created,
      # so the request is made by PhantomJS and not by Capybara's global
      # default session.
      browser = new_session
      browser.visit "https://plus.google.com/u/0/#{@profile_id}/posts"
      sleep RENDER_WAIT_SECONDS
      Nokogiri::HTML.parse(browser.html)
    end
  end

  private

  # Epoch seconds for midnight UTC of today's (local) calendar date.
  # Built from Time because plain Ruby's DateTime has no `to_i`.
  def midnight_utc_timestamp
    now = Time.now
    Time.utc(now.year, now.month, now.day).to_i
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape one profile and collect the result as a hash
scraper = GooglePlusScraper.new(111044299943603359137)
g_plus = scraper.to_h
# => { id: 111044299943603359137, in_circles: 1234, timestamp: 123456789 }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment