Created
September 17, 2014 10:07
-
-
Save waaa/38db9a746cfb27f27356 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
require 'date'
require 'mechanize'
# Scrapes search results from avito.ru with regular expressions and computes
# the mean price and the mean age (in days) of the ads whose title contains
# the search query.
class AvitoParser
  HOST = "http://avito.ru"

  # One capture per ad: the raw HTML fragment of a single search result.
  ITEMS_RGXP = /\n<div class=\"t_i_i t_i(.*?)<\/div> <\/div> <\/div>/m
  # Non-greedy so that only the first "next page" link's href is captured,
  # even when the pagination block is rendered more than once on the page.
  NEXT_PAGE_RGXP = /<a class=\"next\" href=\"(.*?)\">Следующая/m
  TITLE_RGXP = /\" title=\"(.*?)\">\n/m
  PRICE_RGXP = /\n <span>.*<\/span> <span>руб.<\/span>/m
  DATE_RGXP = /<div class=\"t_i_date\">\n(.*)\n?/

  # Month lookup keyed by the first three letters of the Russian month name,
  # so both abbreviations ("авг.", "сент.") and genitive forms ("июня",
  # "сентября") resolve to the same number.
  MONTHS = {
    'янв' => 1, 'фев' => 2, 'мар' => 3, 'апр' => 4,
    'мая' => 5, 'май' => 5, 'июн' => 6, 'июл' => 7,
    'авг' => 8, 'сен' => 9, 'окт' => 10, 'ноя' => 11, 'дек' => 12
  }.freeze

  # Fetches up to +number_of_pages+ result pages for +search_query+ in +city+
  # and collects every ad that parse_item accepts.
  #
  # search_query    - text to search for; also used to filter ad titles.
  # number_of_pages - maximum number of result pages to process.
  # city            - city slug used as the URL path (e.g. 'moskva').
  #
  # Stops early when a page has no "next page" link. Returns the array of
  # parsed ads (hashes with :title, :price and :date), also kept in @goods.
  def fetch_engine(search_query, number_of_pages, city)
    @search_query = search_query
    @goods = [] # final set of parsed goods

    agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
    url = "#{HOST}/#{city}?q=#{search_query}"

    # `times` processes exactly number_of_pages pages and only downloads a
    # page that is actually going to be parsed.
    number_of_pages.times do |page_index|
      page = fetch_page(agent, url)
      collect_goods(page, page_index)

      next_link = page[NEXT_PAGE_RGXP, 1]
      break unless next_link # last page of results reached

      url = HOST + next_link
    end

    @goods
  end

  # Parses the HTML fragment of one ad.
  #
  # Returns a hash {:title, :price, :date}, or nil when the fragment has no
  # title, the title does not contain the search query, or the date is
  # missing or cannot be parsed.
  def parse_item(item)
    title_match = item.scan(TITLE_RGXP).last
    return nil unless title_match

    title = title_match.first
    # Keep only ads whose title contains the query itself, followed by
    # whitespace or the end of the title.
    return nil unless title =~ /#{Regexp.escape(@search_query.to_s)}(?:\s|\z)/

    date_str = item[DATE_RGXP, 1]
    return nil unless date_str

    date = parse_date(date_str)
    return nil unless date

    { :title => title, :price => extract_price(item), :date => date }
  end

  # Converts a date label into a Date.
  #
  # Understands "сегодня" (today), "вчера" (yesterday) and "<day> <month>"
  # where the month is a Russian month name or abbreviation, e.g. "12 авг."
  # or "16 июня". The label carries no year, so the most recent occurrence
  # of that day that is not in the future is used.
  #
  # Returns nil when the label cannot be interpreted.
  def parse_date(str)
    text = str.to_s
    today = Date.today
    return today if text =~ /сегодня/i
    return today - 1 if text =~ /вчера/i

    match = text.match(/(\d{1,2})\s+([[:alpha:]]+)/)
    return nil unless match

    day = match[1].to_i
    month = MONTHS[match[2].downcase[0, 3]]
    return nil unless month

    year = today.year
    # A day/month later than today must belong to the previous year.
    year -= 1 if month > today.month || (month == today.month && day > today.day)
    Date.valid_date?(year, month, day) ? Date.new(year, month, day) : nil
  end

  # Computes the mean price and the mean age in days over @goods and stores
  # the result in @mean_values. An ad published today counts as one day old.
  #
  # Returns a hash {:total_count, :mean_price, :mean_period}; both means are
  # 0.0 when there are no goods.
  def calculate_the_mean_value
    goods = @goods || []
    count = goods.count
    today = Date.today
    price_sum = 0
    days_sum = 0

    goods.each do |item|
      price_sum += item[:price]
      age = (today - item[:date]).to_i # how old this ad is, in days
      days_sum += age.zero? ? 1 : age
    end

    @mean_values = {
      :total_count => count,
      :mean_price => count.zero? ? 0.0 : price_sum.to_f / count,
      :mean_period => count.zero? ? 0.0 : days_sum.to_f / count
    }
  end

  # Prints progress when called with a date and a zero-based page index;
  # otherwise prints the summary stored in @mean_values, if any.
  def speak(date = nil, page = nil)
    if date
      puts "processing #{date} on page #{page + 1}.."
    elsif @mean_values
      puts "========================================="
      puts "We've got #{@mean_values[:total_count]} items,"
      puts "their mean price is #{@mean_values[:mean_price]} and mean period of sale is #{@mean_values[:mean_period]}"
    end
  end

  # Runs the whole pipeline: fetch, compute the means, print the summary.
  def main(search_query, number_of_pages = 100, city = 'moskva')
    fetch_engine(search_query, number_of_pages, city)
    calculate_the_mean_value
    speak
  end

  private

  # Downloads +url+ with +agent+ and returns the body as a UTF-8 string,
  # replacing bytes that cannot be converted.
  def fetch_page(agent, url)
    raw = agent.get(url) # Mechanize::Page object
    raw.body.to_s.encode("UTF-8", raw.encoding, :invalid => :replace, :undef => :replace)
  end

  # Parses every ad on +page+, appends accepted ones to @goods and reports
  # progress whenever the ad date changes.
  def collect_goods(page, page_index)
    page.scan(ITEMS_RGXP).each do |captures|
      parsed = parse_item(captures.first)
      next unless parsed

      @goods << parsed
      speak(parsed[:date], page_index) if parsed[:date] != @date
      @date = parsed[:date]
    end
  end

  # Extracts the price from an ad fragment as an integer ("8 000 руб." ->
  # 8000). Returns 0 when the seller did not set a price.
  def extract_price(item)
    fragment = item[PRICE_RGXP]
    return 0 unless fragment

    fragment.scan(/\d/).join.to_i
  end
end
# Run the scraper only when this file is executed directly, so that merely
# requiring the file does not trigger network requests.
AvitoParser.new.main("велосипед") if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment