Created
September 21, 2019 23:31
-
-
Save dirceu-jr/03a5fdddf4b3108217e70ccc25c9194b to your computer and use it in GitHub Desktop.
This is a very old (2009) Gist of mine. It is a Ruby-based web server (using Sinatra) that parsed rental listings from real estate agencies in Londrina ("Little London"), Brazil. It used the awesome Yahoo Query Language (YQL) to "pre-parse" the agencies' HTML pages.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'sinatra' | |
configure do | |
require 'memcache' | |
require 'typhoeus' | |
require 'iconv' | |
require 'json' | |
require 'uri' | |
require 'libxml' | |
require 'erb' | |
require 'geokit' | |
# address to lat/lng | |
include Geokit::Geocoders | |
# Percent-encodes every byte of +u+ that is not an RFC 2396 "unreserved"
# character (letters, digits and -_.!~*'()), so the value can be embedded
# in the YQL query string.
#
# This used to delegate to URI.escape, which is obsolete and was removed in
# Ruby 3.0; the encoding is done here directly with the same character set
# so the output is unchanged.
#
# u       - String to encode.
# Returns a new String containing only unreserved characters and %XX escapes.
def url_encode(u)
  u.gsub(/[^\-_.!~*'()a-zA-Z\d]/) do |char|
    # A multi-byte character is emitted as one %XX escape per byte.
    char.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
end
# Thin client for Yahoo's public YQL endpoint, declared with the Typhoeus
# remote-method DSL (http://github.com/pauldix/typhoeus/tree/master).
#
# YQL.search(:url => ..., :xpath => ..., :format => ...) substitutes the
# three values into the query path below. By default a successful response
# body is parsed as JSON; a failed one only prints the status code, so the
# failure handler's result is nil (the return value of puts).
class YQL
  include Typhoeus

  parse_json = lambda do |response|
    JSON.parse(response.body, :max_nesting => false)
  end

  report_failure = lambda do |response|
    puts "error code: #{response.code}"
  end

  remote_defaults :on_success => parse_json,
                  :on_failure => report_failure,
                  :base_uri => "http://query.yahooapis.com/v1/public"

  # select * from html where url=":url" and xpath=':xpath'
  search_path = "/yql?q=select%20*%20from%20html%20where%20url%3D%22:url%22%20and%20xpath%3D':xpath'&format=:format"
  define_remote_method :search, :path => search_path
end
end | |
# The scraping and geocoding requests are slow, so every route stores its
# finished listing array in the local memcached instance and reuses it.
CACHE = MemCache.new('127.0.0.1:11211')
# GET /imobiliariainglaterra  (http://www.imobiliariainglaterra.com.br)
#
# Fetches the agency's rental table through YQL, keeps the rows whose
# reference cell matches /L-CA/, geocodes each street and answers with
# JSONP: callback([{"geo":[lat,lng],"link":...}, ...]).
# The list is stored in memcached under "imobiliariainglaterra".
#
# The bedroom count (td[2]) and price (td[7]) were previously extracted but
# never emitted; that dead code is gone.
get '/imobiliariainglaterra' do
  imoveis = CACHE["imobiliariainglaterra"]
  if imoveis.nil?
    imoveis = []
    search = YQL.search(
      :url => url_encode('http://www.imobiliariainglaterra.com.br/aluguel.asp'),
      :xpath => url_encode('/html/body/table/tr'),
      :format => 'json'
    )
    # YQL.search is nil when the request failed (the failure handler only
    # logs); "results" is presumably null when the XPath matched nothing.
    results = search && search['query'] && search['query']['results']
    # A single matching row arrives as a Hash instead of an Array.
    rows = results.is_a?(Hash) ? [results['tr']].flatten.compact : []
    rows.each do |row|
      cells = row['td']
      next unless cells.is_a?(Array)
      ref = cells[0]
      next if ref.nil? || ref['font'].nil? || ref['font']['content'].nil?
      next unless ref['font']['content'] =~ /L-CA/

      rua = cells[1]
      next if rua.nil?
      # Drop the parenthesised suffix that follows the street name.
      rua = rua['font']['content'].to_s.split('(')[0] unless rua['font'].nil?

      link = cells[8]
      unless link.nil? || link['div'].nil? || link['div']['a'].nil?
        link = ["http://www.imobiliariainglaterra.com.br/", link['div']['a']['href']].join('')
      end

      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => link
      }
    end
    # Do not cache the empty list produced by a failed YQL request.
    CACHE["imobiliariainglaterra"] = imoveis unless search.nil?
  end

  # The callback name comes from the query string: restrict it to identifier
  # characters so it cannot inject script, and build one String so a missing
  # callback no longer puts nil into the response body.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback =~ /\A[\w$.]*\z/
  [callback, '(', imoveis.to_json, ')'].join
end
# GET /imobiliariaatual  (http://www.imobiliariaatual.com.br)
#
# Fetches the agency's rental table through YQL, keeps the rows whose
# reference cell matches /L-CA/, geocodes each street and answers with
# JSONP: callback([{"geo":[lat,lng],"link":...}, ...]).
# The list is stored in memcached under "imobiliariaatual".
#
# The bedroom count (td[2]) and price (td[7]) were previously extracted but
# never emitted; that dead code is gone.
get '/imobiliariaatual' do
  imoveis = CACHE["imobiliariaatual"]
  if imoveis.nil?
    imoveis = []
    search = YQL.search(
      :url => url_encode('http://www.imobiliariaatual.com.br/aluguel.asp'),
      :xpath => url_encode('/html/body/table/tr'),
      :format => 'json'
    )
    # YQL.search is nil when the request failed (the failure handler only
    # logs); "results" is presumably null when the XPath matched nothing.
    results = search && search['query'] && search['query']['results']
    # A single matching row arrives as a Hash instead of an Array.
    rows = results.is_a?(Hash) ? [results['tr']].flatten.compact : []
    rows.each do |row|
      cells = row['td']
      next unless cells.is_a?(Array)
      ref = cells[0]
      next if ref.nil? || ref['font'].nil? || ref['font']['content'].nil?
      next unless ref['font']['content'] =~ /L-CA/

      rua = cells[1]
      next if rua.nil?
      # Drop the parenthesised suffix that follows the street name.
      rua = rua['font']['content'].to_s.split('(')[0] unless rua['font'].nil?

      link = cells[8]
      unless link.nil? || link['div'].nil? || link['div']['a'].nil?
        link = ["http://www.imobiliariaatual.com.br/", link['div']['a']['href']].join('')
      end

      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => link
      }
    end
    # Do not cache the empty list produced by a failed YQL request.
    CACHE["imobiliariaatual"] = imoveis unless search.nil?
  end

  # The callback name comes from the query string: restrict it to identifier
  # characters so it cannot inject script, and build one String so a missing
  # callback no longer puts nil into the response body.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback =~ /\A[\w$.]*\z/
  [callback, '(', imoveis.to_json, ')'].join
end
# GET /imobiliariadelta  (http://www.imobiliariadelta.com)
#
# Reads the result tables of the agency's search page through YQL, follows
# each listing to its detail page to obtain the street, geocodes it and
# answers with JSONP: callback([{"geo":[lat,lng],"link":...}, ...]).
# The list is stored in memcached under "imobiliariadelta".
get '/imobiliariadelta' do
  imoveis = CACHE["imobiliariadelta"]
  if imoveis.nil?
    imoveis = []

    # Walks nested Hash/Array nodes and yields nil, instead of raising, as
    # soon as one level of the scraped structure is missing.
    dig = lambda do |node, *path|
      path.inject(node) do |current, key|
        if current.is_a?(Hash) || (current.is_a?(Array) && key.is_a?(Integer))
          current[key]
        end
      end
    end

    listing = YQL.search(
      :url => url_encode('http://www.imobiliariadelta.com/engine.php?id=1444&page=resultado&cd_negocio=1&cd_tipo=11611'),
      :xpath => url_encode('//table[@width="400"]'),
      :format => 'json'
    )
    # listing is nil when the request failed; a single match arrives as a
    # Hash instead of an Array.
    tables = [dig.call(listing, 'query', 'results', 'table')].flatten.compact
    tables.each do |table|
      href = dig.call(table, 'tr', 1, 'td', 1, 'table', 1, 'tr', 'td', 0,
                      'table', 'tr', 'td', 1, 'a', 'href')
      id_match = /cd_imovel=(.*)/.match(href.to_s)
      next if id_match.nil?

      url = "http://www.imobiliariadelta.com/detalhes_geral.php?id=1444&cd_imovel=#{id_match[1]}"
      # Kept in its own variable so it does not overwrite the listing response.
      detail = YQL.search(
        :url => url_encode(url),
        :xpath => url_encode('/html/body'),
        :format => 'xml',
        :on_success => lambda { |response| LibXML::XML::Parser.string(response.body).parse }
      )
      next if detail.nil?
      street_node = detail.find('//body/div/div/div/div/div[4]/p')[0]
      next if street_node.nil?

      rua = street_node.first.to_s.strip
      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => url
      }
    end
    # Do not cache the empty list produced by a failed YQL request.
    CACHE["imobiliariadelta"] = imoveis unless listing.nil?
  end

  # The callback name comes from the query string: restrict it to identifier
  # characters so it cannot inject script, and build one String so a missing
  # callback no longer puts nil into the response body.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback =~ /\A[\w$.]*\z/
  [callback, '(', imoveis.to_json, ')'].join
end
# GET /imobiliariaavenida  (http://www.imobiliariaavenida.com.br)
#
# Reads the rows of the agency's rental table through YQL, geocodes every
# row that has both a street and a price cell, and answers with JSONP:
# callback([{"geo":[lat,lng],"link":...}, ...]).
# The list is stored in memcached under "imobiliariaavenida".
get '/imobiliariaavenida' do
  imoveis = CACHE["imobiliariaavenida"]
  if imoveis.nil?
    imoveis = []

    # Walks nested Hash/Array nodes and yields nil, instead of raising, as
    # soon as one level of the scraped structure is missing.
    dig = lambda do |node, *path|
      path.inject(node) do |current, key|
        if current.is_a?(Hash) || (current.is_a?(Array) && key.is_a?(Integer))
          current[key]
        end
      end
    end

    search = YQL.search(
      :url => url_encode('http://www.imobiliariaavenida.com.br/corpo_todos.php?txcomercializacao=loca%E7%E3o&txtipo=casa'),
      :xpath => url_encode('//table[@width="95%"]/tr'),
      :format => 'json'
    )
    # search is nil when the request failed. "results" maps an element name
    # to its rows, so every value is flattened into one row list.
    results = dig.call(search, 'query', 'results')
    rows = results.is_a?(Hash) ? results.values.flatten.compact : []
    rows.each do |row|
      rua = dig.call(row, 'td', 0, 'div', 'a', 'font', 'content')
      valor = dig.call(row, 'td', 2, 'div', 'a', 'font', 'content')
      id_match = /\((.*)\)/.match(dig.call(row, 'td', 2, 'div', 'a', 'href').to_s)
      # Decide before geocoding: rows lacking a street, a price or a listing
      # id used to cost a remote geocoder call (or raise) before being dropped.
      next if rua.nil? || valor.nil? || id_match.nil?

      # NOTE(review): the first alternative of this pattern may originally
      # have been a non-breaking space - confirm against the raw file.
      rua = rua.to_s.gsub(/ |\n/, ' ')
      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => ['http://www.imobiliariaavenida.com.br/pop_detalhes.php?cdimovel=', id_match[1]].join('')
      }
    end
    # Do not cache the empty list produced by a failed YQL request.
    CACHE["imobiliariaavenida"] = imoveis unless search.nil?
  end

  # The callback name comes from the query string: restrict it to identifier
  # characters so it cannot inject script, and build one String so a missing
  # callback no longer puts nil into the response body.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback =~ /\A[\w$.]*\z/
  [callback, '(', imoveis.to_json, ')'].join
end
# GET /ihimoveis  (http://www.ihimoveis.com.br/)
#
# Fetches the "destaque_fundo" blocks of the agency's rental page as XML
# through YQL, extracts the street from each block's first link, geocodes
# it and answers with JSONP: callback([{"geo":[lat,lng],"link":...}, ...]).
# The list is stored in memcached under "ihimoveis".
#
# The bedroom count was previously extracted from the <strong> tags but
# never emitted; that dead code (which could raise on a missing tag) is gone.
get '/ihimoveis' do
  imoveis = CACHE["ihimoveis"]
  if imoveis.nil?
    imoveis = []
    search = YQL.search(
      :url => url_encode('http://www.ihimoveis.com.br/imoveis.php?situacao=locacao&tipo=3'),
      :xpath => url_encode('//div[@id="destaque_fundo"]'),
      :format => 'xml',
      # <br/> tags are stripped before the body is parsed.
      :on_success => lambda { |response| LibXML::XML::Parser.string(response.body.gsub(/<br\/>/, '')).parse }
    )
    # search is nil when the request failed (the failure handler only logs).
    blocks = search.nil? ? [] : search.find('//div[@id="destaque_fundo"]')
    blocks.each do |o|
      address_link = o.find('div/a')[0]
      detail_link = o.find('div[@id="destaques_valor"]/a')[0]
      next if address_link.nil? || detail_link.nil?

      # The street is the text after the last <strong> element, up to the
      # city name.
      rua = address_link.inner_xml.split('strong>')[-1].to_s.split('Londrina')[0].to_s.strip
      next if rua.empty?

      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => ["http://www.ihimoveis.com.br/", detail_link['href']].join('')
      }
    end
    # Do not cache the empty list produced by a failed YQL request.
    CACHE["ihimoveis"] = imoveis unless search.nil?
  end

  # The callback name comes from the query string: restrict it to identifier
  # characters so it cannot inject script, and build one String so a missing
  # callback no longer puts nil into the response body.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback =~ /\A[\w$.]*\z/
  [callback, '(', imoveis.to_json, ')'].join
end
# GET /imobiliariasenador  (http://www.imobiliariasenador.com.br)
#
# Reads the listing rows of the agency's rental page through YQL, takes the
# street from the nested table of each row and the listing id from that
# table's onclick attribute, geocodes the street and answers with JSONP:
# callback([{"geo":[lat,lng],"link":...}, ...]).
# The list is stored in memcached under "imobiliariasenador".
get '/imobiliariasenador' do
  imoveis = CACHE["imobiliariasenador"]
  if imoveis.nil?
    imoveis = []

    # Walks nested Hash/Array nodes and yields nil, instead of raising, as
    # soon as one level of the scraped structure is missing.
    dig = lambda do |node, *path|
      path.inject(node) do |current, key|
        if current.is_a?(Hash) || (current.is_a?(Array) && key.is_a?(Integer))
          current[key]
        end
      end
    end

    search = YQL.search(
      :url => url_encode('http://www.imobiliariasenador.com.br/corpo_todos.php?txcomercializacao=loca%E7%E3o&txtipo=casa'),
      :xpath => url_encode('/html/body/table/tr/td/div/form/table/tr/td/table/tr[2]/td/table/tr'),
      :format => 'json'
    )
    # search is nil when the request failed; a single matching row arrives
    # as a Hash instead of an Array.
    rows = [dig.call(search, 'query', 'results', 'tr')].flatten.compact
    rows.each do |row|
      table = dig.call(row, 'td', 'table')
      street = dig.call(table, 'tr', 1, 'td', 0, 'div', 'font', 'font', 'font', 'content')
      id_match = /\((.*)\)/.match(dig.call(table, 'onclick').to_s)
      next if street.nil? || id_match.nil?

      # NOTE(review): the character removed by the first gsub may originally
      # have been a non-breaking space - confirm against the raw file.
      rua = street.to_s.gsub(/ /, '').gsub(/\n/, ' ')
      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => ["http://www.imobiliariasenador.com.br/pop_detalhes.php?cdimovel=", id_match[1]].join('')
      }
    end
    # Do not cache the empty list produced by a failed YQL request.
    CACHE["imobiliariasenador"] = imoveis unless search.nil?
  end

  # The callback name comes from the query string: restrict it to identifier
  # characters so it cannot inject script, and build one String so a missing
  # callback no longer puts nil into the response body.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback =~ /\A[\w$.]*\z/
  [callback, '(', imoveis.to_json, ')'].join
end
# GET /imobiliariaperez  (listings hosted on http://www.sub100.com.br)
#
# Pages through the agency's quick-search results (PV=1, 2, ...) via YQL,
# follows each listing to its detail page for the street, geocodes it and
# answers with JSONP: callback([{"geo":[lat,lng],"link":...}, ...]).
# Paging stops when a request fails or a page carries fewer than 15 linked
# listings. The list is stored in memcached under "imobiliariaperez".
#
# The price was previously extracted from the detail page but never emitted;
# that dead code (which could raise on a missing <strong>) is gone.
get '/imobiliariaperez' do
  imoveis = CACHE["imobiliariaperez"]
  if imoveis.nil?
    imoveis = []

    # Walks nested Hash/Array nodes and yields nil, instead of raising, as
    # soon as one level of the scraped structure is missing.
    dig = lambda do |node, *path|
      path.inject(node) do |current, key|
        if current.is_a?(Hash) || (current.is_a?(Array) && key.is_a?(Integer))
          current[key]
        end
      end
    end

    fetched = false
    pagina = 1
    loop do
      listing = YQL.search(
        :url => url_encode("http://www.sub100.com.br/empresas/imob/imobiliariaperez/mostra_resultado_rapido.php?b_negocio=Locacao&b_tipo=CASAS&b_cidade=57&b_bairro=TODOS&b_dormitorios=TODOS&b_valores=TODOS&PV=#{pagina}"),
        :xpath => url_encode('//div[@class="AT_lista"]'),
        :format => 'json'
      )
      # A failed request yields nil and ends the paging, as before.
      break if listing.nil?
      fetched = true

      # A page past the last one has no "results"; it used to raise here
      # (e.g. when the listing count is an exact multiple of 15).
      entries = [dig.call(listing, 'query', 'results', 'div')].flatten.compact
      results = 0
      entries.each do |entry|
        url = dig.call(entry, 'div', 1, 'span', 'a', 'href')
        next if url.nil?
        # Count every linked listing so a broken detail page does not cut
        # the paging short.
        results += 1

        # Kept in its own variable so it does not overwrite the page response.
        detail = YQL.search(
          :url => url_encode("http://www.sub100.com.br/empresas/imob/imobiliariaperez/#{url}"),
          :xpath => url_encode('//body'),
          :format => 'xml',
          :on_success => lambda { |response| LibXML::XML::Parser.string(response.body.gsub(/<br\/>/, '')).parse }
        )
        next if detail.nil?
        street_node = detail.find('//td[@class="clCell2"]/p')[0]
        next if street_node.nil?

        # NOTE(review): the character removed here may originally have been
        # a non-breaking space - confirm against the raw file.
        rua = street_node.first.to_s.gsub(/ /, '').strip
        geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
        imoveis << {
          :geo => [geo.lat, geo.lng],
          :link => ["http://www.sub100.com.br/empresas/imob/imobiliariaperez/", url].join('')
        }
      end
      break if results < 15
      pagina += 1
    end
    # Do not cache the empty list produced when the very first request failed.
    CACHE["imobiliariaperez"] = imoveis if fetched
  end

  # The callback name comes from the query string: restrict it to identifier
  # characters so it cannot inject script, and build one String so a missing
  # callback no longer puts nil into the response body.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback =~ /\A[\w$.]*\z/
  [callback, '(', imoveis.to_json, ')'].join
end
# Landing page: renders the :index ERB template.
get('/') { erb :index }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment