Created
April 16, 2011 06:13
-
-
Save mpereira/922918 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8
# Copyright (C) 2011 by Murilo Pereira <[email protected]>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
%w[getoptlong open-uri nokogiri].each { |gem| require gem }
# Crawls the whey protein listing pages under BASE_URL, turns every product
# into a Whey and prints the valid ones ordered by price per gram.
module MadrugaoSuplementos
  extend self

  # Help text printed for -h/--help. The heredoc body is indented for
  # readability; the gsub strips that four-space indent again.
  USAGE = <<-USAGE.gsub(/^ {4}/, '')
    Usage: ruby madrugao_suplementos_whey_crawler.rb [-h|--help] [options]
      -s, --show-links    Show the URL for each whey
      -n, --no-colorize   Show non-colorized output
  USAGE

  # Root of the site; listing URLs and product hrefs are built on top of it.
  BASE_URL = 'http://www.madrugaosuplementos.com.br'

  # The crawler fetches listing pages 1..NUMBER_OF_WHEY_PAGES.
  NUMBER_OF_WHEY_PAGES = 5

  # Builds the URL of one page of the whey protein listing.
  #
  # options - Hash; options[:page] is interpolated as the last path segment.
  #
  # Returns the URL as a String.
  def whey_protein_page_url(options)
    "#{BASE_URL}/produtos/Massa+Muscular/319_Whey+Protein.html/#{options[:page]}"
  end

  # String helpers shared by the extraction code.
  module Util
    # Parses a number written with a decimal comma or a decimal point.
    #
    # string - String such as '89,90' or '89.90'.
    #
    # Returns a Float. Only the first comma is swapped for a point before
    # String#to_f is applied, so text without a leading number yields 0.0.
    def to_decimal(string)
      string.sub(',', '.').to_f
    end
  end

  # Reads name, weight, price and link out of one product node. A node only
  # has to respond to #at_css(selector), returning an element that responds to
  # #content and #[] (or nil).
  module Extractor
    include Util

    DECIMAL_REGEXP = '\d+(?:[,\.]\d+)?'.freeze
    # A number with '.' thousands groups and a ',' decimal part: '1.299,90'.
    GROUPED_DECIMAL_REGEXP = '\d{1,3}(?:\.\d{3})+,\d+'.freeze

    KILOGRAM_UNITS = %w[kg k KG K].freeze
    GRAM_UNITS     = %w[g G].freeze
    POUND_UNITS    = %w[lb lbs LB LBS].freeze
    UNITS          = [KILOGRAM_UNITS, GRAM_UNITS, POUND_UNITS].flatten.freeze

    # Captures a decimal number and its unit, e.g. '(900g)' or '(2 lbs)'.
    WEIGHT_REGEXP = /\((#{DECIMAL_REGEXP})\s*(#{UNITS.join('|')})\)/

    # Captures a decimal number, optionally preceded by 'R$'. The grouped form
    # is tried first so '1.299,90' is captured whole rather than as '1.299'.
    PRICE_REGEXP = /(?:R\$)?\s*(#{GROUPED_DECIMAL_REGEXP}|#{DECIMAL_REGEXP})/

    # Returns the text of the product's name link.
    def extract_name(node)
      node.at_css('span.nome a').content
    end

    # Reads the weight announced in parentheses inside the product name.
    #
    # Returns the weight in grams as a Float, or nil (after reporting the
    # product on stderr) when the name carries no recognizable weight.
    def extract_weight(node)
      name = extract_name(node)
      match = name.match(WEIGHT_REGEXP)
      unless match
        $stderr.puts("error: '#{name}' didn't have a valid weight unit. " \
                     "(grams, kilograms or pounds)")
        return
      end

      weight, unit = match.captures
      case unit
      when *KILOGRAM_UNITS then to_decimal(weight) * 1000
      when *GRAM_UNITS     then normalize_weight(weight, unit)
      when *POUND_UNITS    then to_decimal(weight) * 453.59
      end
    end

    # Reads the discounted price of the product.
    #
    # Returns the price as a Float, or nil (after reporting on stderr) when the
    # node has no 'p.preco-com-desconto' element or it holds no number. A
    # single unpriced product therefore no longer aborts the whole crawl.
    def extract_price(node)
      price_node = node.at_css('p.preco-com-desconto')
      match = price_node && price_node.content.match(PRICE_REGEXP)
      unless match
        $stderr.puts("error: a product didn't have a valid price in " \
                     "'p.preco-com-desconto'.")
        return
      end

      price_to_decimal(match[1])
    end

    # Returns the absolute URL of the product page (BASE_URL plus the href of
    # the name link).
    def extract_link(node)
      BASE_URL + node.at_css('span.nome a')['href']
    end

    private

    # Treat dumb weight strings like '1.500g', which when called #to_f on
    # are converted to 1.5: for gram units the '.' is a thousands separator.
    def normalize_weight(weight, unit)
      if weight.include?('.') && GRAM_UNITS.include?(unit)
        to_decimal(weight.delete('.'))
      else
        to_decimal(weight)
      end
    end

    # Converts a captured price to a Float. When both '.' and ',' are present
    # the '.' can only be a thousands separator, so it is dropped first.
    def price_to_decimal(price)
      grouped = price.include?('.') && price.include?(',')
      to_decimal(grouped ? price.delete('.') : price)
    end
  end

  # Downloads the listing pages and extracts the raw product attributes.
  module WheyCrawler
    extend self, Extractor

    # Crawls every listing page.
    #
    # options - Hash; a truthy options[:show_links] adds a :link entry to each
    #           product.
    #
    # Returns a flat Array of Hashes with :name, :weight, :price (and :link).
    def crawl(options)
      (1..NUMBER_OF_WHEY_PAGES).flat_map do |page|
        html = fetch_page(MadrugaoSuplementos.whey_protein_page_url(:page => page))
        html.css('div.quadro_produto_index').map do |node|
          whey_hash = { :name   => extract_name(node),
                        :weight => extract_weight(node),
                        :price  => extract_price(node) }
          options[:show_links] ? whey_hash.merge(:link => extract_link(node)) : whey_hash
        end
      end
    end

    private

    # Downloads and parses one page. URI.open is used where open-uri provides
    # it, because Kernel#open stopped accepting URLs in Ruby 3.0; the block
    # form closes the downloaded handle once the page is parsed.
    def fetch_page(url)
      if URI.respond_to?(:open)
        URI.open(url) { |io| Nokogiri::HTML(io) }
      else
        open(url) { |io| Nokogiri::HTML(io) }
      end
    end
  end

  # One whey protein product.
  class Whey
    attr_reader :name, :weight, :price, :link

    # params - Hash with :name, :weight, :price and optionally :link. Missing
    #          or nil entries become '' / 0.0 through #to_s / #to_f.
    def initialize(params)
      @name   = params[:name].to_s
      @weight = params[:weight].to_f
      @price  = params[:price].to_f
      @link   = params[:link].to_s
    end

    # A product is usable only with a name and a positive weight and price.
    def valid?
      !name.empty? && weight > 0 && price > 0
    end

    # Grams of product bought per unit of price.
    def grams_per_real
      weight / price
    end

    # Returns the attributes as a Hash; :link is present only when non-empty.
    def to_hash
      whey_hash = { :name           => name,
                    :weight         => weight,
                    :price          => price,
                    :grams_per_real => grams_per_real }
      link.empty? ? whey_hash : whey_hash.merge(:link => link)
    end

    # Renders the attributes as '{:key=>value, ...}'. The text is assembled by
    # hand instead of via Hash#to_s because Output.colorize_whey pattern-matches
    # on exactly this ':key=>' form, and Hash#inspect prints 'key: value' from
    # Ruby 3.4 on.
    def to_s
      pairs = to_hash.map { |key, value| "#{key.inspect}=>#{value.inspect}" }
      "{#{pairs.join(', ')}}"
    end
  end

  # Terminal output, optionally decorated with ANSI color codes.
  module Output
    extend self

    # Escape sequence appended after every colored fragment.
    RESET = "\033[0;0m"

    # Prints one whey on stdout.
    #
    # whey    - object whose #to_s yields the '{:key=>value, ...}' text.
    # options - Hash; a truthy options[:colorize] enables ANSI colors.
    def puts_whey(whey, options = {})
      puts(options[:colorize] ? colorize_whey(whey) : whey)
    end

    private

    def colorize(text, color_code)
      "#{color_code}#{text}#{RESET}"
    end

    def cyan(text)
      colorize(text, "\033[0;36m")
    end

    def purple(text)
      colorize(text, "\033[0;35m")
    end

    def green(text)
      colorize(text, "\033[0;32m")
    end

    def dark_gray(text)
      colorize(text, "\033[1;30m")
    end

    def white(text)
      colorize(text, "\033[1;37m")
    end

    # Colors the textual form of a whey. The order of the passes matters:
    # the link and the other pairs are rewritten first (values get wrapped in
    # quotes, link values stay uncolored), then the outer braces, and only at
    # the end every remaining comma and quote.
    def colorize_whey(whey)
      text = whey.to_s.gsub(/(:link)(=>)"(.*)"/) do
        "#{purple($1)}#{dark_gray($2)}\"#{$3}\""
      end
      text = text.gsub(/:(name|weight|price|grams_per_real)(=>)(?:"(.*?)"|(.*?))(,|\})/) do
        "#{purple(':' + $1)}#{dark_gray($2)}\"#{white($3 || $4)}\"#{$5}"
      end
      text = text.gsub(/(^\{|\}$)/) { green($1) }
      text.gsub(',', dark_gray(',')).gsub('"', dark_gray('"'))
    end
  end

  # Glue between the parsed command-line options and the crawler.
  module CLI
    # options - Hash with :help, :show_links and :colorize.
    #
    # With :help the usage text goes to stdout and the process exits with
    # status 0 (asking for help is not a failure). Otherwise the crawled wheys
    # are printed cheapest-per-gram first and returned as an Array.
    def self.run(options)
      if options[:help]
        puts USAGE
        exit
      end

      wheys = MadrugaoSuplementos::WheyCrawler.
        crawl(:show_links => options[:show_links]).
        map { |whey_params| MadrugaoSuplementos::Whey.new(whey_params) }.
        select(&:valid?).
        sort_by { |whey| whey.price / whey.weight }

      wheys.each { |whey| MadrugaoSuplementos::Output.puts_whey(whey, options) }
    end
  end
end
# Command-line entry point. Everything sits under the $0 guard so that
# requiring this file as a library neither consumes the host program's ARGV
# nor raises on options this script does not know.
if $0 == __FILE__
  opts = GetoptLong.new(['--help',        '-h', GetoptLong::NO_ARGUMENT],
                        ['--show-links',  '-s', GetoptLong::NO_ARGUMENT],
                        ['--no-colorize', '-n', GetoptLong::NO_ARGUMENT])

  # Colors are on and links are off unless the flags say otherwise.
  options = { :colorize   => true,
              :show_links => false }

  begin
    opts.each do |opt, _arg|
      case opt
      when '--help'        then options[:help] = true
      when '--show-links'  then options[:show_links] = true
      when '--no-colorize' then options[:colorize] = false
      end
    end
  rescue GetoptLong::Error
    # An unknown or malformed option ends with the usage text and a failure
    # status instead of an uncaught exception.
    abort(MadrugaoSuplementos::USAGE)
  end

  MadrugaoSuplementos::CLI.run(options)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment