Last active
October 10, 2018 18:43
-
-
Save mdarby/ea257a694bc046e7ec1de2d111bbaa5c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true
class ListingDescriptionUpdater | |
class << self | |
def update | |
file = '' | |
# Commented out the array of Place, Event, Job because Events and Jobs are required to have descriptions at point of creation so there should not be anything needed to be scraped from a site. | |
# If in the future the description field is no longer a required field they can be added back as shown below. | |
# [Place, Event, Job].each do |klass| | |
[Place].each do |klass| | |
klass.where(description: nil).each do |listing| | |
begin | |
file = open(listing.website) | |
rescue OpenURI::HTTPError => e | |
if e.message == '429 Too Many Requests' | |
sleep(60) | |
file = open(listing.website) | |
else | |
next | |
end | |
rescue RuntimeError => redirect | |
redirect_url = parse_redirect_url(redirect) | |
next if redirect_url.nil? | |
file = open(redirect_url.to_s) | |
rescue SocketError => e | |
next | |
end | |
doc = Nokogiri::HTML(file) | |
contents = parse_out_meta_description(doc, listing) | |
desc = parse_contents(contents) | |
update_description(listing, desc) unless desc.nil? | |
end | |
end | |
end | |
def parse_redirect_url(redirect) | |
pattern = /(?<=->\s)(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)/ | |
redirect_url = pattern.match(redirect.to_s) | |
end | |
def parse_out_meta_description(doc, listing) | |
contents = %w[description Description].map { |name| | |
next if doc.at("meta[name='#{name}']").nil? | |
doc.at("meta[name='#{name}']")['content'] | |
} | |
end | |
def parse_contents(contents) | |
desc = nil | |
if !contents[0].nil? | |
desc = contents[0] | |
elsif !contents[1].nil? | |
desc = contents[1] | |
end | |
desc | |
end | |
def update_description(listing, desc) | |
listing.update(description: desc) | |
puts "Description added for #{listing.website}" | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment