Skip to content

Instantly share code, notes, and snippets.

@reiro
Created June 1, 2015 16:16
Show Gist options
  • Save reiro/6d9679086f684ecf5ed6 to your computer and use it in GitHub Desktop.
Save reiro/6d9679086f684ecf5ed6 to your computer and use it in GitHub Desktop.
ggg_parser
require 'open-uri'
require 'nokogiri'
require 'mechanize'
require 'thread'
require 'watir-webdriver'

# Crawls the auto.tut.by catalog and stores what it finds as Autoservice
# records together with their phones, cars, categories and service types.
#
# NOTE(review): the whole crawl runs synchronously inside a web request and
# sleeps between retries; consider moving it to a background job or rake task.
class ParserController < ApplicationController
  # Prefix shared by every catalog URL; stripped from scraped hrefs so that the
  # remainder can be re-combined as "<root><city slug><category path>".
  CATALOG_ROOT = "http://auto.tut.by/catalog".freeze
  # Section path of the saved city-index page, removed to isolate the city slug.
  TYRES_SECTION_PATH = "/zapchasti_aksessuary/shini-diski/".freeze
  # Slug used for a city entry that carries no usable link.
  # NOTE(review): presumably the entry without a link is the city the page was
  # saved for -- confirm that this hard-coded slug is still the right one.
  FALLBACK_CITY_SLUG = "-dubrovno".freeze
  # Locally saved catalog page that lists the cities.
  # NOTE(review): machine-specific absolute path; should come from configuration.
  CITY_INDEX_URL = "file:///home/dima/Sites/sto_parser/%D0%A8%D0%B8%D0%BD%D1%8B,%20%D0%B4%D0%B8%D1%81%D0%BA%D0%B8%20%D0%BD%D0%B0%20%D0%B0%D0%B2%D1%82%D0%BE%20%D0%B2%20%D0%9C%D0%B8%D0%BD%D1%81%D0%BA%D0%B5.%20%D0%9C%D0%B0%D0%B3%D0%B0%D0%B7%D0%B8%D0%BD%D1%8B%20%D0%BF%D0%BE%20%D0%BF%D1%80%D0%BE%D0%B4%D0%B0%D0%B6%D0%B5%20%D1%88%D0%B8%D0%BD,%20%D0%BB%D0%B8%D1%82%D1%8B%D1%85%20%D0%B4%D0%B8%D1%81%D0%BA%D0%BE%D0%B2%20%D0%B2%20%D0%91%D0%B5%D0%BB%D0%B0%D1%80%D1%83%D1%81%D0%B8%20-%20%D0%9A%D0%B0%D1%82%D0%B0%D0%BB%D0%BE%D0%B3%20TUT.BY.html".freeze
  # Table heading whose row holds the car list rather than a service category.
  CAR_BRANDS_HEADER = "Марка автомобиля".freeze
  # Upper bound on attempts per company page, so a permanent failure cannot
  # keep the request spinning forever.
  MAX_ATTEMPTS = 3
  # Seconds to wait before retrying a failed company page.
  RETRY_DELAY = 10

  # Walks every (category, city) pair of the catalog, imports each company
  # page found there and finally redirects to the root URL.
  #
  # Categories whose name has no matching StoType are skipped, because an
  # autoservice cannot be linked to a missing type.
  def parse_sto
    cities = load_cities

    load_categories.each do |category_path, type_name|
      sto_type = StoType.find_by_name(type_name.to_s)
      if sto_type.nil?
        puts "Unknown StoType #{type_name.inspect}, skipping #{category_path}"
        next
      end

      cities.each do |city_name, city_slug|
        puts city_slug
        listing_url = "#{CATALOG_ROOT}#{city_slug}#{category_path}"
        puts listing_url

        sto_links = collect_sto_links(listing_url)
        next if sto_links.empty?

        city = City.find_by_name(city_name.to_s)
        sto_links.each { |sto_url| import_with_retries(sto_url, sto_type, city) }
      end
    end

    redirect_to root_url
  end

  private

  # Shared HTTP client, created on first use and reused for every request.
  def agent
    @agent ||= Mechanize.new
  end

  # Downloads +url+ (http or file scheme) and parses the body with Nokogiri.
  #
  # @param url [String]
  # @return [Nokogiri::HTML::Document]
  def fetch_document(url)
    Nokogiri::HTML(agent.get(url).body)
  end

  # Reads the saved city index and maps the first word of every list entry to
  # its URL slug. A later entry with the same first word overwrites an earlier one.
  #
  # @return [Hash{String => String}] city name => slug
  def load_cities
    fetch_document(CITY_INDEX_URL).css('.content tr td ul li').each_with_object({}) do |item, cities|
      name = item.text.strip.split.first.to_s
      cities[name] = city_slug_for(item)
    end
  end

  # Derives the slug from the last link of a city entry by removing the
  # catalog root and the section path; falls back when there is no href.
  def city_slug_for(item)
    link = item.css('a').last
    href = link && link['href']
    return FALLBACK_CITY_SLUG if href.nil?

    href.sub(CATALOG_ROOT, '').sub(TYRES_SECTION_PATH, '')
  end

  # Reads the links in the first row of the category table on the catalog
  # front page, ignoring links whose href contains "finance".
  #
  # @return [Array<Array(String, String)>] pairs of [path after the catalog root, link text]
  def load_categories
    row = fetch_document("#{CATALOG_ROOT}/").at('.cat_categoryes_table_wrapper table.cat_categoryes tr')
    return [] if row.nil?

    row.css('a').each_with_object([]) do |link, categories|
      href = link['href'].to_s
      next if href.empty? || href.include?("finance")

      categories << [href.sub(CATALOG_ROOT, ''), link.text.strip]
    end
  end

  # Follows the "next page" links of a listing and gathers the company URLs.
  # Stops when the site redirects elsewhere, when a page has no items, when
  # there is no next link, or when the next link was already visited.
  #
  # @param url [String] first page of the listing
  # @return [Array<String>] company page URLs collected so far
  def collect_sto_links(url)
    links = []
    visited = []

    loop do
      visited << url
      page = agent.get(url)
      doc = Nokogiri::HTML(page.body)

      # A final URL that differs from the requested one means we were redirected.
      # NOTE(review): presumably the site redirects when the city has no such
      # category -- confirm.
      if page.uri.to_s != url || doc.css('ul.catalog_items_list li a').empty?
        puts "Нету в этом городе"
        puts page.uri
        break
      end

      doc.css('#tab-company ul.catalog_items_list li.cat_list_item').each do |item|
        anchor = item.css('.head a').first
        links << anchor['href'] if anchor && anchor['href']
      end

      next_link = doc.css('.b-pagination li.p-next a').first
      next_url = next_link && next_link['href']
      break if next_url.blank? || visited.include?(next_url)

      url = next_url
    end

    links
  end

  # Imports one company page, retrying a bounded number of times. Only
  # StandardError is rescued so that interrupts and exits still propagate.
  def import_with_retries(sto_url, sto_type, city)
    attempts = 0
    begin
      attempts += 1
      import_autoservice(sto_url, sto_type, city)
    rescue StandardError => e
      puts "Ощибка" + e.to_s
      if attempts < MAX_ATTEMPTS
        sleep RETRY_DELAY
        retry
      end
      puts "Giving up on #{sto_url} after #{attempts} attempts"
    end
  end

  # Scrapes a company page and either creates a new Autoservice or, when one
  # with the same name already exists, only attaches the service type to it.
  def import_autoservice(sto_url, sto_type, city)
    doc = fetch_document(sto_url)
    puts sto_type.name

    attributes = scrape_attributes(doc, sto_url)
    # Only the numbers are collected here; Phone records are built later and
    # solely for a new autoservice, so nothing is persisted for a page that
    # ends up skipped or retried.
    phone_numbers = doc.css('p.phones span.phone').map(&:text)
    cars, categories = scrape_service_table(doc)
    puts attributes

    existing = Autoservice.find_by_name(attributes[:name].to_s)
    if existing
      existing.sto_types << sto_type unless existing.sto_types.include?(sto_type)
      existing.save
    else
      create_autoservice(attributes, sto_type, city, phone_numbers, cars, categories)
    end
  end

  # Extracts the scalar attributes of a company page.
  #
  # @return [Hash] :name, :address, :description, :email and, when present, :site
  def scrape_attributes(doc, sto_url)
    attributes = {
      name: doc.css('.company_head h1').text,
      address: doc.css('p.address a').text,
      description: doc.css('#compdesc').text,
      email: ""
    }

    email_nodes, site_nodes = doc.css('.cc-col-2 p.url').partition { |node| node.text.include?("@") }
    attributes[:site] = site_nodes.last.text unless site_nodes.empty?
    # One browser session is enough no matter how many nodes mention an address.
    attributes[:email] = fetch_rendered_email(sto_url) unless email_nodes.empty?

    attributes
  end

  # Opens the page in PhantomJS and returns the text of the last element with
  # class "url". The browser is always closed, even when navigation fails.
  # NOTE(review): presumably the address is only readable after JavaScript has
  # run, which is why a real browser is used -- confirm.
  def fetch_rendered_email(sto_url)
    browser = Watir::Browser.new :phantomjs
    begin
      browser.goto sto_url
      browser.elements(class: "url").last.text
    ensure
      browser.close
    end
  end

  # Splits the rows of the ".tbl" table into the car list and a hash of
  # category name => service names.
  # NOTE(review): values are split on "," without stripping whitespace, exactly
  # as before; confirm whether stored names should be trimmed.
  #
  # @return [Array(Array<String>, Hash{String => Array<String>})]
  def scrape_service_table(doc)
    cars = []
    categories = {}

    doc.css('.tbl tr').each do |row|
      heading = row.css('th').text
      values = row.css('td').text.split(",")
      if heading == CAR_BRANDS_HEADER
        cars = values
      else
        categories[heading] = values
      end
    end

    [cars, categories]
  end

  # Builds a new Autoservice with all of its associations and saves it.
  # A failed save is reported instead of being silently ignored.
  def create_autoservice(attributes, sto_type, city, phone_numbers, cars, categories)
    autoservice = Autoservice.new(attributes)
    autoservice.sto_types << sto_type
    autoservice.city = city

    phone_numbers.each { |number| autoservice.phones << Phone.new(number: number) }
    cars.each { |car_name| autoservice.cars << Car.find_or_create_by(name: car_name.to_s) }

    categories.each do |category_name, service_names|
      category = StoCategory.find_or_create_by(name: category_name.to_s)
      autoservice.sto_categories << category

      service_names.each do |service_name|
        service_type = ServiceType.find_or_create_by(name: service_name.to_s)
        # The category is already persisted, so guard against linking the same
        # service type to it again for every autoservice that lists it.
        category.service_types << service_type unless category.service_types.include?(service_type)
        autoservice.service_types << service_type
      end
    end

    if autoservice.save
      puts autoservice.sto_types
    else
      puts "Could not save #{attributes[:name]}: #{autoservice.errors.full_messages.join(', ')}"
    end
  end
end
@wegorich
Copy link

wegorich commented Jun 1, 2015

Было бы неплохо на функции разбить все действия.

К примеру:

def parse_services

def parse_comments

def parse_cars
....

https://ru.wikipedia.org/wiki/%D0%97%D0%BE%D0%BB%D0%BE%D1%82%D0%BE%D0%B9_%D0%BC%D0%BE%D0%BB%D0%BE%D1%82%D0%BE%D0%BA

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment