-
-
Save nguyenchiencong/371dcea8293b235c5dd072ed57b0fe47 to your computer and use it in GitHub Desktop.
Use Capybara w/ Poltergeist (PhantomJS) to scrape the text content from the body of an HTML page located at the given URL.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'capybara' | |
require 'capybara/poltergeist' | |
# Fetches a single web page through a headless PhantomJS browser (driven via
# Capybara's Poltergeist driver) and stores the interesting parts of the
# rendered page on the instance.
#
# The page is fetched as soon as the object is constructed:
#
#   page = Scraper.new('http://example.com')
#   page.title             # title reported by the session
#   page.text              # visible text of the rendered page
#   page.metas             # one attributes collection per <meta> element
#   page.response_headers  # headers reported by the session
class Scraper
  include Capybara::DSL

  # User-Agent header sent with every request made by the session.
  USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X)'.freeze

  # Registered when the class body is evaluated. PhantomJS is started without
  # image loading and with SSL errors ignored; the js_errors, inspector and
  # debug driver options are all switched off.
  Capybara.register_driver :poltergeist do |app|
    Capybara::Poltergeist::Driver.new(
      app,
      phantomjs_options: ['--load-images=no', '--ignore-ssl-errors=yes'],
      js_errors: false,
      inspector: false,
      debug: false
    )
  end

  attr_accessor :document, :response_headers, :title, :metas, :text

  # Creates a dedicated Poltergeist session and immediately fetches +url+.
  #
  # @param url [String] address of the page to scrape
  def initialize(url)
    @session = Capybara::Session.new(:poltergeist)
    @session.driver.headers = { 'User-Agent' => USER_AGENT }
    get(url)
  end

  # Visits +url+ and captures the document, response headers, title, <meta>
  # attributes and visible text into the corresponding accessors.
  #
  # The driver is shut down afterwards, even when the visit or the extraction
  # raises, so a failed fetch does not leave a browser process behind.
  #
  # @param url [String] address of the page to scrape
  # @return [Scraper] self, so calls can be chained
  def get(url)
    @session.visit(url)

    @document = @session.document
    @response_headers = @session.response_headers
    @title = @session.title
    # <meta> elements are never rendered, hence visible: false.
    @metas = @session.find_all('meta', visible: false).map { |meta| meta.native.attributes }
    # The argument of Document#text picks between :all and visible text; it is
    # not a selector. Any value other than :all yields the visible text, so
    # :visible states explicitly what the page text lookup returns.
    @text = @document.text(:visible)

    self
  ensure
    @session.driver.quit
  end
end
# Example run: scrape one page; the result stays available in +s+.
s = Scraper.new('http://www.humani.se')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment