Skip to content

Instantly share code, notes, and snippets.

@mpereira
Created April 16, 2011 06:13
Show Gist options
  • Save mpereira/922918 to your computer and use it in GitHub Desktop.
Save mpereira/922918 to your computer and use it in GitHub Desktop.
# encoding: utf-8
# Copyright (C) 2011 by Murilo Pereira <[email protected]>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
%w[getoptlong open-uri nokogiri].each { |gem| require gem }
module MadrugaoSuplementos
extend self
# Help text that CLI.run prints (via abort) when the :help option is set.
# The gsub removes a single leading space from the start of each line.
USAGE = <<-USAGE.gsub(/^ /, '')
Usage: ruby madrugao_suplementos_whey_crawler.rb [-h|--help] [options]
-s, --show-links Show the URL for each whey
-n, --no-colorize Show non-colorized output
USAGE
# Site root; prefixed to the listing path and to each product's href.
BASE_URL = 'http://www.madrugaosuplementos.com.br'
# Number of listing pages requested by WheyCrawler.crawl (pages 1..N).
NUMBER_OF_WHEY_PAGES = 5
# Builds the URL of one page of the whey protein listing.
#
# options - Hash whose :page entry is appended as the last path segment.
#
# Returns the URL as a String.
def whey_protein_page_url(options)
  page_number = options[:page]
  listing_path = '/produtos/Massa+Muscular/319_Whey+Protein.html'
  "#{BASE_URL}#{listing_path}/#{page_number}"
end
# Small parsing helpers shared by the extractor.
module Util
  # Converts a numeric String to a Float, treating the first "," in it as
  # the decimal point (so "89,90" becomes 89.9).
  #
  # string - the text to parse.
  #
  # Returns a Float (String#to_f semantics: 0.0 when nothing parses).
  def to_decimal(string)
    with_decimal_point = string.sub(',', '.')
    with_decimal_point.to_f
  end
end
# Pulls product attributes (name, weight, price, link) out of a product node.
#
# Every +node+ argument only has to respond to +at_css+ the way a Nokogiri
# node does; the CSS selectors used are the ones this crawler relies on.
module Extractor
  include Util

  # Regexp source (a String, interpolated below) for a number with an
  # optional fractional part introduced by "," or ".".
  DECIMAL_REGEXP = '\d+(?:[,\.]\d+)?'.freeze
  # Regexp source for a number written with "." thousands groups and a ","
  # decimal mark, e.g. "1.299,90".
  GROUPED_DECIMAL_REGEXP = '\d{1,3}(?:\.\d{3})+,\d+'.freeze
  KILOGRAM_UNITS = %w[kg k KG K].freeze
  GRAM_UNITS = %w[g G].freeze
  POUND_UNITS = %w[lb lbs LB LBS].freeze
  UNITS = [KILOGRAM_UNITS, GRAM_UNITS, POUND_UNITS].flatten.freeze

  # Captures a decimal number and its unit from a parenthesised weight such
  # as "(900g)". Case-insensitive so spellings like "Kg" or "Lbs" match too.
  WEIGHT_REGEXP = /\((#{DECIMAL_REGEXP})\s*(#{UNITS.join('|')})\)/i

  # Captures a decimal number, optionally preceded by "R$". The grouped form
  # is tried first so "1.299,90" is captured whole instead of as "1.299".
  PRICE_REGEXP = /(?:R\$)?\s*(#{GROUPED_DECIMAL_REGEXP}|#{DECIMAL_REGEXP})/

  # Returns the product name: the text of the 'span.nome a' element.
  def extract_name(node)
    node.at_css('span.nome a').content
  end

  # Reads the weight embedded in the product name and converts it to grams.
  #
  # Returns a Float, or nil (after writing an error to STDERR) when the name
  # holds no "(<number><unit>)" weight.
  def extract_weight(node)
    name = node.at_css('span.nome a').content
    match = name.match(WEIGHT_REGEXP)
    unless match
      STDERR.puts("error: '#{name}' didn't have a valid weight unit. (grams, kilograms or pounds)")
      return nil
    end

    weight, unit = match.captures
    # The unit lists contain the lowercase spellings, so fold case first.
    unit = unit.downcase
    if KILOGRAM_UNITS.include?(unit)
      to_decimal(weight) * 1000
    elsif GRAM_UNITS.include?(unit)
      normalize_weight(weight, unit)
    elsif POUND_UNITS.include?(unit)
      to_decimal(weight) * 453.59 # grams per pound
    end
  end

  # Reads the price from the 'p.preco-com-desconto' element.
  #
  # Returns a Float, or nil (after writing an error to STDERR) when the
  # element is missing or contains no number, so that one malformed product
  # does not abort the whole crawl.
  def extract_price(node)
    price_node = node.at_css('p.preco-com-desconto')
    match = price_node && price_node.content.match(PRICE_REGEXP)
    return parse_price(match[1]) if match

    STDERR.puts("error: '#{product_label(node)}' didn't have a valid price.")
    nil
  end

  # Returns the absolute product URL: BASE_URL plus the name link's href.
  def extract_link(node)
    BASE_URL + node.at_css('span.nome a')['href']
  end

  private

  # Treat dumb weights strings like '1.500g', which when called #to_f on
  # are converted to 1.5. Only a "." followed by exactly three digits is
  # taken as a thousands separator; "907.18" stays a plain decimal.
  def normalize_weight(weight, unit)
    if GRAM_UNITS.include?(unit) && weight =~ /\A\d{1,3}\.\d{3}\z/
      to_decimal(weight.delete('.'))
    else
      to_decimal(weight)
    end
  end

  # Converts a string captured by PRICE_REGEXP to a Float. When both "." and
  # "," are present the dots are thousands separators and are dropped.
  def parse_price(text)
    text = text.delete('.') if text.include?('.') && text.include?(',')
    to_decimal(text)
  end

  # Name used in error messages; tolerates nodes without a name element.
  def product_label(node)
    name_node = node.at_css('span.nome a')
    name_node ? name_node.content : 'unnamed product'
  end
end
# Downloads the whey protein listing pages and turns every product box
# ('div.quadro_produto_index') into a Hash of attributes.
module WheyCrawler
  extend self, Extractor

  # Crawls pages 1..NUMBER_OF_WHEY_PAGES of the listing.
  #
  # options - Hash; a truthy :show_links adds a :link entry to each product.
  #
  # Returns a flat Array of Hashes with :name, :weight and :price keys
  # (plus :link when requested), in page order.
  def crawl(options)
    (1..NUMBER_OF_WHEY_PAGES).flat_map do |page|
      fetch_page(page).css('div.quadro_produto_index').map do |node|
        whey_hash = { :name => extract_name(node),
                      :weight => extract_weight(node),
                      :price => extract_price(node) }
        options[:show_links] ? whey_hash.merge(:link => extract_link(node)) : whey_hash
      end
    end
  end

  private

  # Fetches and parses one listing page.
  #
  # Uses URI#open (provided by open-uri) rather than Kernel#open, which no
  # longer opens URLs on Ruby 3.0+. The block form closes the downloaded
  # stream once Nokogiri has read it.
  def fetch_page(page)
    url = MadrugaoSuplementos.whey_protein_page_url(:page => page)
    URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end
end
# A whey protein product: its name, weight, price and (optionally) its URL.
# grams_per_real divides weight by price, so weight is expected in grams and
# price in reais.
class Whey
  attr_reader :name, :weight, :price, :link

  # params - Hash with :name, :weight, :price and optionally :link. Values
  #          are coerced with to_s / to_f, so missing entries become "" / 0.0.
  def initialize(params)
    @name = params[:name].to_s
    @weight = params[:weight].to_f
    @price = params[:price].to_f
    @link = params[:link].to_s
  end

  # True when the product has a name and strictly positive weight and price.
  # (name is always a String after #initialize, so no nil check is needed.)
  def valid?
    !name.empty? && weight > 0 && price > 0
  end

  # Grams bought per unit of currency. Float division: a zero price yields
  # Infinity or NaN rather than raising, so check #valid? first.
  def grams_per_real
    weight / price
  end

  # Returns a Hash with :name, :weight, :price and :grams_per_real, plus
  # :link when one was given.
  def to_hash
    whey_hash = { :name => name,
                  :weight => weight,
                  :price => price,
                  :grams_per_real => grams_per_real }
    link.empty? ? whey_hash : whey_hash.merge(:link => link)
  end

  # Renders #to_hash as '{:key=>value, ...}'.
  #
  # Built by hand instead of delegating to Hash#to_s because Ruby 3.4
  # changed that format to '{key: value}', while the colorizer in Output
  # matches on the ':key=>' form. On earlier Rubies the result is identical
  # to Hash#to_s.
  def to_s
    pairs = to_hash.map { |key, value| "#{key.inspect}=>#{value.inspect}" }
    "{#{pairs.join(', ')}}"
  end
end
# Prints wheys to standard output, optionally decorated with ANSI colors.
module Output
  extend self

  # Writes one whey on its own line.
  #
  # whey    - any object; its to_s is what gets printed.
  # options - Hash; a truthy :colorize prints the colorized rendering.
  def puts_whey(whey, options = {})
    if options[:colorize]
      puts(colorize_whey(whey))
    else
      puts(whey)
    end
  end

  private

  # Wraps text in the given ANSI escape code followed by a reset code.
  def colorize(text, color_code)
    "#{color_code}#{text}\033[0;0m"
  end

  def cyan(text)
    colorize(text, "\033[0;36m")
  end

  def purple(text)
    colorize(text, "\033[0;35m")
  end

  def green(text)
    colorize(text, "\033[0;32m")
  end

  def dark_gray(text)
    colorize(text, "\033[1;30m")
  end

  def white(text)
    colorize(text, "\033[1;37m")
  end

  # Colorizes the '{:key=>value, ...}' text returned by whey.to_s. The
  # passes run in this order, each on the result of the previous one:
  # link entry, remaining entries, outer braces, commas, double quotes.
  def colorize_whey(whey)
    text = whey.to_s

    # The link keeps its URL uncolored.
    text = text.gsub(/(:link)(=>)"(.*)"/) do
      key, arrow, url = $1, $2, $3
      "#{purple(key)}#{dark_gray(arrow)}\"#{url}\""
    end

    # Every other value is shown in white between quotes, quoted or not in
    # the input.
    text = text.gsub(/:(name|weight|price|grams_per_real)(=>)(?:"(.*?)"|(.*?))(,|})/) do
      key, arrow, value, tail = $1, $2, ($3 || $4), $5
      "#{purple(':' + key)}#{dark_gray(arrow)}\"#{white(value)}\"#{tail}"
    end

    text = text.gsub(/(^\{|\}$)/) { green($1) }
    text = text.gsub(',', dark_gray(','))
    text.gsub('"', dark_gray('"'))
  end
end
# Command-line front end: crawls the store and prints the valid wheys.
module CLI
  # Runs the program.
  #
  # options - Hash. :help aborts with USAGE; :show_links is forwarded to the
  #           crawler; the whole Hash is forwarded to Output.puts_whey.
  #
  # Returns the Array of printed wheys, sorted by ascending price/weight.
  def self.run(options)
    abort(USAGE) if options[:help]

    crawled = MadrugaoSuplementos::WheyCrawler.crawl(:show_links => options[:show_links])
    wheys = crawled.map { |whey_params| MadrugaoSuplementos::Whey.new(whey_params) }
    ranked = wheys.select(&:valid?).sort_by { |whey| whey.price / whey.weight }
    ranked.each { |whey| MadrugaoSuplementos::Output.puts_whey(whey, options) }
  end
end
end
# Command-line entry point. Option parsing lives behind the $0 == __FILE__
# guard together with the run call, so that requiring this file as a library
# does not make GetoptLong consume (or reject) the host program's ARGV.
if $0 == __FILE__
  opts = GetoptLong.new(['--help', '-h', GetoptLong::NO_ARGUMENT],
                        ['--show-links', '-s', GetoptLong::NO_ARGUMENT],
                        ['--no-colorize', '-n', GetoptLong::NO_ARGUMENT])

  # Defaults: colorized output, no links.
  options = { :colorize => true,
              :show_links => false }

  opts.each do |opt, _arg|
    case opt
    when '--help'
      options[:help] = true
    when '--show-links'
      options[:show_links] = true
    when '--no-colorize'
      options[:colorize] = false
    end
  end

  MadrugaoSuplementos::CLI.run(options)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment