Last active
December 29, 2016 20:45
-
-
Save joaomdmoura/f7091bcf83b901f7c6645119b1ce87bc to your computer and use it in GitHub Desktop.
Plain Old Ruby Object (PORO) HTML scraper with a simple DSL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri'
require 'open-uri'
require 'uri'
# Scrapes text and image URLs out of a remote HTML page using CSS selectors.
#
# Every call to #scrap_text or #scrap_img downloads and parses the page
# again, so the result always reflects the current value of #url.
class PageScrapper
  # @deprecated Not used by this class any more; kept byte-identical only so
  #   that external references keep resolving. Known flaw: `.-_` inside the
  #   character class is a range that does not contain a literal hyphen, and
  #   only three-letter extensions are accepted.
  IMG_SRC_REGEX = /src="([a-zA-Z\/0-9:.-_]+\.[a-zA-Z]{3})/

  # @deprecated Not used by this class any more; kept byte-identical only so
  #   that external references keep resolving. Known flaw: the greedy `.+`
  #   can run past the host into the path of the URL.
  URL_REGEX = /http[s]?:\/\/[a-zA-Z0-9].+\.[a-z]{2,3}(\.[a-z]{2})?/

  attr_accessor :url, :selector

  # @param url [String] address of the page to scrape
  def initialize(url)
    @url = url
  end

  #
  # Collects the visible text of every element matching +selector+.
  #
  # Each element becomes an array of its non-blank lines, with runs of
  # whitespace inside a line collapsed to a single space.
  #
  # Usage:
  # ======================================================
  # scrapper = PageScrapper.new("https://techcrunch.com/")
  # scrapper.scrap_text("h2.post-title")
  #
  # => [["WTF is a liquidation preference?"], ["Privacy is still alive and kicking in the digital age"], ...]
  #
  # @param selector [String] CSS selector
  # @return [Array<Array<String>>] one array of lines per matched element
  def scrap_text(selector)
    scrap(selector).map do |entry|
      entry.text.split("\n").map { |line| line.split.join(' ') }.reject(&:empty?)
    end
  end

  #
  # Collects the image address of every element matching +selector+.
  #
  # The `src` attribute is read from the matched element itself or, when it
  # has none, from its first descendant carrying one. Elements without any
  # usable `src` are skipped. Relative addresses are resolved against #url.
  #
  # Usage:
  # ======================================================
  # scrapper = PageScrapper.new("https://techcrunch.com/")
  # scrapper.scrap_img("div.block-content a img")
  #
  # => ["https://tctechcrunch2011.files.wordpress.com/2016/12/liquidation-preference.jpg", ...]
  #
  # @param selector [String] CSS selector
  # @return [Array<String>] one URL per matched element that has a `src`
  def scrap_img(selector)
    sources = scrap(selector).map { |entry| image_source(entry) }
    usable = sources.reject { |src| src.nil? || src.strip.empty? }
    usable.map { |src| absolute_url(src.strip) }
  end

  private

  # Fetches the page and selects the elements matching +selector+.
  #
  # Yields each matched element when a block is given; the matched set is
  # returned either way.
  def scrap(selector)
    entries = fetch_document.css(selector)
    entries.each { |entry| yield(entry) } if block_given?
    entries
  end

  # Downloads #url and parses it as HTML.
  #
  # Goes through URI#open (provided by open-uri) instead of Kernel#open so
  # that the string is never treated as a file path or a "|command", and
  # uses the block form so the connection/tempfile is closed after parsing.
  def fetch_document
    URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end

  # Returns the `src` of +entry+, or of its first descendant that has one;
  # nil when neither exists.
  def image_source(entry)
    node = entry['src'] ? entry : entry.at_css('[src]')
    node && node['src']
  end

  # Resolves +src+ against the page URL. A value the URI library cannot
  # parse or join is handed back untouched instead of aborting the scrape.
  def absolute_url(src)
    URI.join(url, src).to_s
  rescue URI::Error
    src
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment