Created
June 1, 2015 16:16
-
-
Save reiro/6d9679086f684ecf5ed6 to your computer and use it in GitHub Desktop.
ggg_parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapes the auto.tut.by business catalog and imports every listed company
# as an Autoservice record, together with its phones, supported cars,
# service categories and service types.
class ParserController < ApplicationController
  require 'open-uri'
  require 'nokogiri'
  require 'mechanize'
  require 'thread'
  require 'watir-webdriver'

  # Prefix shared by absolute catalog links; stripped to obtain relative paths.
  CATALOG_PREFIX = 'http://auto.tut.by/catalog'.freeze
  # Catalog front page listing the company categories.
  CATALOG_URL = 'http://auto.tut.by/catalog/'.freeze
  # Section path removed from city links so only the city suffix remains.
  CITY_SECTION_PATH = '/zapchasti_aksessuary/shini-diski/'.freeze
  # City suffix used for list items that carry no link.
  DEFAULT_CITY_PATH = '-dubrovno'.freeze
  # Locally saved copy of the catalog page that lists all cities.
  CITY_INDEX_URL = 'file:///home/dima/Sites/sto_parser/%D0%A8%D0%B8%D0%BD%D1%8B,%20%D0%B4%D0%B8%D1%81%D0%BA%D0%B8%20%D0%BD%D0%B0%20%D0%B0%D0%B2%D1%82%D0%BE%20%D0%B2%20%D0%9C%D0%B8%D0%BD%D1%81%D0%BA%D0%B5.%20%D0%9C%D0%B0%D0%B3%D0%B0%D0%B7%D0%B8%D0%BD%D1%8B%20%D0%BF%D0%BE%20%D0%BF%D1%80%D0%BE%D0%B4%D0%B0%D0%B6%D0%B5%20%D1%88%D0%B8%D0%BD,%20%D0%BB%D0%B8%D1%82%D1%8B%D1%85%20%D0%B4%D0%B8%D1%81%D0%BA%D0%BE%D0%B2%20%D0%B2%20%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%B8%20-%20%D0%9A%D0%B0%D1%82%D0%B0%D0%BB%D0%BE%D0%B3%20TUT.BY.html'.freeze
  # Header of the details-table row that lists supported car brands.
  CAR_BRAND_HEADER = 'Марка автомобиля'.freeze
  # Upper bound on attempts per company page, so a permanent failure
  # cannot stall the whole import.
  MAX_IMPORT_ATTEMPTS = 3
  # Pause between attempts, in seconds.
  RETRY_DELAY_SECONDS = 10

  # Controller action: crawls every (category, city) listing, imports the
  # companies found there and finally redirects to the root page.
  def parse_sto
    city_paths = load_city_paths
    categories = load_catalog_categories

    categories.each do |link, type_name|
      sto_type = StoType.find_by_name(type_name.to_s)
      if sto_type.nil?
        # Without a matching StoType no company of this category can be
        # imported, so skip crawling it altogether.
        puts "Unknown StoType: #{type_name}"
        next
      end

      city_paths.each do |city_name, city_path|
        puts city_path
        url = "http://auto.tut.by/catalog#{city_path}#{link}"
        puts url

        sto_links = collect_sto_links(url)
        next if sto_links.empty?

        city = City.find_by_name(city_name.to_s)
        sto_links.each { |sto_url| import_with_retry(sto_url, sto_type, city) }
      end
    end

    redirect_to root_url
  end

  private

  # Reads the saved city index page and returns a Hash mapping the first
  # word of each list item (the city name) to its catalog URL suffix.
  def load_city_paths
    page = Mechanize.new.get(CITY_INDEX_URL)
    doc = Nokogiri::HTML(page.body)

    doc.css('.content tr td ul li').each_with_object({}) do |item, paths|
      name = item.text.strip.split.first.to_s
      anchor = item.css('a').last
      paths[name] = anchor ? city_path_from(anchor['href']) : DEFAULT_CITY_PATH
    end
  end

  # Strips the catalog prefix and the section path from a city link,
  # leaving only the city-specific suffix.
  def city_path_from(href)
    href.to_s.sub(CATALOG_PREFIX, '').sub(CITY_SECTION_PATH, '')
  end

  # Returns [relative_link, category_name] pairs taken from the first row of
  # the category table on the catalog front page. Links containing "finance"
  # are ignored.
  def load_catalog_categories
    page = Mechanize.new.get(CATALOG_URL)
    row = page.at('.cat_categoryes_table_wrapper table.cat_categoryes tr')
    return [] if row.nil?

    row.css('a').each_with_object([]) do |anchor, categories|
      href = anchor['href'].to_s
      next if href.include? "finance"

      # Drops the leading "http://auto.tut.by/catalog" (26 characters);
      # assumes every category link is absolute - TODO confirm.
      categories << [href[CATALOG_PREFIX.length..-1], anchor.text.strip]
    end
  end

  # Walks a paginated company listing starting at +start_url+ and returns the
  # detail-page URLs of every company found. Stops when the site redirects
  # away from the requested URL or the listing is empty.
  def collect_sto_links(start_url)
    agent = Mechanize.new
    links = []
    url = start_url

    while url
      # A single Mechanize request serves both the redirect check and the
      # parsing, so each listing page is downloaded only once.
      page = agent.get(url)
      doc = Nokogiri::HTML(page.body)

      if page.uri.to_s != url || doc.css('ul.catalog_items_list li a').empty?
        puts "Нету в этом городе"
        puts page.uri
        break
      end

      doc.css('#tab-company ul.catalog_items_list li.cat_list_item').each do |item|
        anchor = item.css('.head a').first
        links << anchor['href'] if anchor && anchor['href']
      end

      next_anchor = doc.css('.b-pagination li.p-next a').first
      url = next_anchor && next_anchor['href']
    end

    links
  end

  # Imports one company page, retrying a bounded number of times on failure.
  # Only StandardError is rescued so interrupts and exit requests still work.
  def import_with_retry(sto_url, sto_type, city)
    attempts = 0
    begin
      attempts += 1
      import_autoservice(sto_url, sto_type, city)
    rescue StandardError => e
      puts "Ощибка"+e.to_s
      if attempts < MAX_IMPORT_ATTEMPTS
        sleep RETRY_DELAY_SECONDS
        retry
      end
      puts "Skipping #{sto_url} after #{attempts} failed attempts"
    end
  end

  # Downloads a company page and either creates a new Autoservice from it or,
  # when one with the same name already exists, links it to +sto_type+.
  def import_autoservice(sto_url, sto_type, city)
    # Fetched through Mechanize rather than Kernel#open: the URL is scraped
    # from a remote page and Kernel#open would run a command for "|..." input.
    page = Mechanize.new.get(sto_url)
    doc = Nokogiri::HTML(page.body)

    puts sto_type.name
    sto = scrape_attributes(doc, sto_url)
    puts sto

    existing = Autoservice.find_by_name(sto[:name].to_s)
    if existing
      existing.sto_types << sto_type unless existing.sto_types.include?(sto_type)
      existing.save
      return
    end

    autoservice = Autoservice.new(sto)
    autoservice.sto_types << sto_type
    autoservice.city = city
    attach_phones(autoservice, doc)
    attach_details(autoservice, doc)

    if autoservice.save
      puts autoservice.sto_types
    else
      puts "Autoservice not saved: #{autoservice.errors.full_messages.join(', ')}"
    end
  end

  # Builds the attribute Hash (name, address, description, email and, when
  # present, site) for a company from its parsed detail page.
  def scrape_attributes(doc, sto_url)
    sto = {
      name: doc.css('.company_head h1').text,
      address: doc.css('p.address a').text,
      description: doc.css('#compdesc').text,
      email: ""
    }

    doc.css('.cc-col-2 p.url').each do |node|
      if node.text.include? "@"
        sto[:email] = rendered_email(sto_url)
      else
        sto[:site] = node.text
      end
    end

    sto
  end

  # Opens the page in a headless browser and returns the text of the last
  # element with class "url" (presumably the e-mail is filled in by
  # JavaScript - TODO confirm). The browser is always closed.
  def rendered_email(sto_url)
    browser = Watir::Browser.new :phantomjs
    browser.goto sto_url
    browser.elements(class: "url").last.text
  ensure
    browser.close if browser
  end

  # Adds unsaved Phone records to a new autoservice; they are persisted
  # together with it, so a failed attempt leaves no orphaned phones behind.
  def attach_phones(autoservice, doc)
    doc.css('p.phones span.phone').each do |node|
      autoservice.phones << Phone.new(number: node.text)
    end
  end

  # Links cars, service categories and service types parsed from the details
  # table to a new autoservice.
  def attach_details(autoservice, doc)
    cars, categories = parse_details_table(doc)

    cars.each do |car_name|
      autoservice.cars << Car.find_or_create_by(name: car_name.to_s)
    end

    categories.each do |category_name, service_names|
      category = StoCategory.find_or_create_by(name: category_name.to_s)
      autoservice.sto_categories << category

      service_names.each do |service_name|
        service_type = ServiceType.find_or_create_by(name: service_name.to_s)
        # The category is already persisted, so guard against linking the
        # same service type again on every run.
        category.service_types << service_type unless category.service_types.include?(service_type)
        autoservice.service_types << service_type unless autoservice.service_types.include?(service_type)
      end
    end
  end

  # Splits the details table into the list of car brands and a Hash of
  # category header => comma-separated service names.
  def parse_details_table(doc)
    cars = []
    categories = {}

    doc.css('.tbl tr').each do |row|
      header = row.css('th').text
      values = row.css('td').text.split(",")
      if header == CAR_BRAND_HEADER
        cars = values
      else
        categories[header] = values
      end
    end

    [cars, categories]
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Было бы неплохо разбить все действия на отдельные функции.
К примеру:
def parse_services
def parse_comments
def parse_cars
....
https://ru.wikipedia.org/wiki/%D0%97%D0%BE%D0%BB%D0%BE%D1%82%D0%BE%D0%B9_%D0%BC%D0%BE%D0%BB%D0%BE%D1%82%D0%BE%D0%BA