Last active
August 29, 2015 14:04
-
-
Save faucct/c2608cbac966502f0504 to your computer and use it in GitHub Desktop.
parser examples
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace :fragrantica do | |
require 'nokogiri' | |
require 'open-uri' | |
# Walks the Fragrantica designer index (pages 1 through 11) and makes sure a
# Brand record exists for every designer link found there.
#
# Brand.where(...).first_or_create is presumably ActiveRecord's, in which case
# the block (and therefore the download of the brand's own page) only runs for
# names that are not stored yet -- confirm against the Brand model.
task :brands => :environment do
  # Downloads +url+ and parses it as HTML. URI#open (mixed in by open-uri) is
  # used rather than Kernel#open, which no longer accepts URLs on Ruby 3.0+;
  # the block form closes the downloaded IO once Nokogiri has read it.
  fetch_doc = lambda do |url|
    URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end

  1.upto(11) do |page|
    index_doc = fetch_doc.call("http://www.fragrantica.com/designers-#{page}/")
    index_doc.css("div#col1 div.nduList a").each do |brand_anchor|
      # The anchor text is used without its first and last character.
      brand_name = brand_anchor.content[1..-2]
      Brand.where(name: brand_name).first_or_create do |new_brand|
        puts "parsing brand '#{brand_name}'"
        brand_doc = fetch_doc.call("http://www.fragrantica.com#{brand_anchor['href']}")
        new_brand.description = parse_brand_description(brand_doc)
        new_brand.logo = parse_brand_logo(brand_doc)
        new_brand.niche = parse_brand_niche(brand_doc)
        new_brand.link = parse_brand_link(brand_doc)
        new_brand.parent_company_id = parse_brand_parent_company_id(brand_doc)
        new_brand.country_id = parse_brand_country_id(brand_doc)
        new_brand.industry_id = parse_brand_industry_id(brand_doc)
      end
    end
  end
end
# Picks the logo image of a brand page and returns it as a URI suitable for
# assigning to Brand#logo.
#
# @param brand_doc [#at_css] parsed brand page (a Nokogiri document in the
#   rake task)
# @return [URI, nil] the logo URI; nil when the page has no logo image or the
#   image has no usable src, when Paperclip's spoof detector rejects the
#   file, or when an OpenURI::HTTPError is raised along the way
def parse_brand_logo(brand_doc)
  logo_img = brand_doc.at_css("#col1 > div > img")
  brand_logo_url = logo_img && logo_img['src']
  # A page without a logo (or with an empty src) yields no logo instead of
  # crashing with NoMethodError.
  return nil if brand_logo_url.nil? || brand_logo_url.empty?

  # NOTE(review): this method used to derive a full-size "o.<file>" URL under
  # fimgs.net but never used it; the URL found on the page is what is attached.
  logo = URI.parse(brand_logo_url)
  # Assigning the URI to a throwaway Brand presumably makes Paperclip fetch the
  # image so its content type can be inspected -- confirm.
  probe = Brand.new(logo: logo)
  adapter = Paperclip.io_adapters.for(probe.logo)
  detector = Paperclip::MediaTypeSpoofDetector.using(adapter, probe.logo.original_filename)
  detector.spoofed? ? nil : logo
rescue OpenURI::HTTPError
  # Deliberate best effort: a logo that cannot be downloaded is skipped.
  nil
end
# Looks up the id of the brand's country from the links on a brand page.
#
# The first anchor under '#col1 > div > p' whose href contains "country" is
# taken as the country link; its text is used as the BrandCountry name, and
# the matching record is fetched or created via first_or_create.
#
# @param brand_doc [#css] parsed brand page (a Nokogiri document in the rake
#   task)
# @return [Object, nil] id of the BrandCountry record, or nil when the page
#   has no country link
def parse_brand_country_id(brand_doc)
  country_anchor = brand_doc.css('#col1 > div > p > a').find do |anchor|
    # to_s guards against anchors without an href attribute (nil).
    anchor['href'].to_s.include?('country')
  end
  return nil if country_anchor.nil?

  BrandCountry.where(name: country_anchor.content).first_or_create.id
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment