Skip to content

Instantly share code, notes, and snippets.

@dirceu-jr
Created September 21, 2019 23:31
Show Gist options
  • Save dirceu-jr/03a5fdddf4b3108217e70ccc25c9194b to your computer and use it in GitHub Desktop.
Save dirceu-jr/03a5fdddf4b3108217e70ccc25c9194b to your computer and use it in GitHub Desktop.
This is a very old (2009) Gist of mine. It is a Ruby-based web server (using Sinatra) that parsed data from the websites of real estate agencies in Londrina ("little London"), Brazil. It used the awesome Yahoo Query Language (YQL) to "pre-parse" the pages.
require 'rubygems'
require 'sinatra'
configure do
require 'memcache'
require 'typhoeus'
require 'iconv'
require 'json'
require 'uri'
require 'libxml'
require 'erb'
require 'geokit'
# address to lat/lng
include Geokit::Geocoders
# Percent-encodes every character of +u+ that is not an RFC 2396 "unreserved"
# character (ASCII letters, digits and -_.!~*'()), so that whole URLs and XPath
# expressions can be embedded inside the YQL query string.
#
# This used to delegate to URI.escape, which was deprecated and then removed in
# Ruby 3.0; the same byte-wise encoding is performed here directly.
#
# u - the String to encode.
#
# Returns a new String in which each byte of every other character is written
# as %XX (uppercase hex).
def url_encode(u)
  u.gsub(/[^\-_.!~*'()a-zA-Z\d]/) do |unsafe|
    # A multi-byte character is emitted as one %XX group per byte.
    unsafe.unpack('C*').map { |byte| '%%%02X' % byte }.join
  end
end
# Small client for the public Yahoo Query Language (YQL) endpoint, declared with
# the Typhoeus remote-method DSL. Typhoeus runs HTTP requests in parallel while
# cleanly encapsulating handling logic.
# http://github.com/pauldix/typhoeus/tree/master
class YQL
  include Typhoeus

  # Default success handler: decode the response body as JSON with the nesting
  # depth check disabled.
  decode_json = lambda do |response|
    JSON.parse(response.body, :max_nesting => false)
  end

  # Default failure handler: only prints the HTTP status code. It evaluates to
  # nil (the value of puts).
  report_failure = lambda do |response|
    puts "error code: #{response.code}"
  end

  remote_defaults(
    :base_uri   => "http://query.yahooapis.com/v1/public",
    :on_success => decode_json,
    :on_failure => report_failure
  )

  # rails-like routes for HTTP methods.
  # Callers pass :url, :xpath and :format; presumably Typhoeus substitutes them
  # for the matching :placeholders in the path -- confirm against the Typhoeus
  # version in use.
  define_remote_method(
    :search,
    :path => "/yql?q=select%20*%20from%20html%20where%20url%3D%22:url%22%20and%20xpath%3D':xpath'&format=:format"
  )
end
end
# HTTP requests are expensive; memcached saves us from repeating them.
# Every scraping route below stores its result list in this client, which talks
# to a memcached server at 127.0.0.1:11211.
CACHE = MemCache::new('127.0.0.1:11211')
# http://www.imobiliariainglaterra.com.br
#
# GET /imobiliariainglaterra?callback=<js function>
#
# Responds with the JSONP text `<callback>(<json array>)`. Each array element is
# {:geo => [lat, lng], :link => url} for one row of the agency's rental table
# whose reference cell matches /L-CA/. On a cache miss the table is fetched
# through YQL, each street is geocoded with YahooGeocoder, and the list is
# stored in memcached under "imobiliariainglaterra".
#
# If the YQL response carries no 'tr' results the handler raises before anything
# is cached, so the next request retries the scrape.
get '/imobiliariainglaterra' do
  # Read memcached once: testing the key and then reading it again could hand
  # back nil if the entry disappeared between the two reads.
  imoveis = CACHE["imobiliariainglaterra"]
  if imoveis.nil?
    imoveis = []
    search = YQL.search(
      :url => url_encode('http://www.imobiliariainglaterra.com.br/aluguel.asp'),
      :xpath => url_encode('/html/body/table/tr'),
      :format => 'json'
    )
    search['query']['results']['tr'].each do |row|
      cells = row['td']
      next if cells.nil?

      # Cell 0 holds the listing reference; only "L-CA" references are kept.
      ref = cells[0]
      next if ref.nil? || ref['font'].nil? || ref['font']['content'].nil?
      next unless ref['font']['content'] =~ /L-CA/

      # Cell 1 holds the street. A row without it is skipped instead of
      # geocoding the raw cell structure.
      # (Cells 2 and 7 hold rooms and price; they are not part of the response.)
      street_cell = cells[1]
      next if street_cell.nil? || street_cell['font'].nil? || street_cell['font']['content'].nil?
      rua = street_cell['font']['content'].split('(')[0]

      # Cell 8 holds the relative detail link; emit nil when it is absent
      # rather than the raw cell structure.
      link_cell = cells[8]
      link = nil
      unless link_cell.nil? || link_cell['div'].nil?
        link = ["http://www.imobiliariainglaterra.com.br/", link_cell['div']['a']['href']].join('')
      end

      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => link
      }
    end
    CACHE["imobiliariainglaterra"] = imoveis
  end

  # params[:callback] is caller-controlled and echoed into the body: keep only
  # characters that can appear in a JavaScript callback reference. A missing
  # callback becomes an empty string instead of a nil body part.
  callback = params[:callback].to_s.gsub(/[^\w$.\[\]]/, '')
  # Declare the body as JavaScript rather than relying on the framework default.
  content_type 'application/javascript'
  [callback, '(', imoveis.to_json, ')']
end
# http://www.imobiliariaatual.com.br
#
# GET /imobiliariaatual?callback=<js function>
#
# Responds with the JSONP text `<callback>(<json array>)`. Each array element is
# {:geo => [lat, lng], :link => url} for one row of the agency's rental table
# whose reference cell matches /L-CA/. On a cache miss the table is fetched
# through YQL, each street is geocoded with YahooGeocoder, and the list is
# stored in memcached under "imobiliariaatual".
#
# If the YQL response carries no 'tr' results the handler raises before anything
# is cached, so the next request retries the scrape.
get '/imobiliariaatual' do
  # Read memcached once: testing the key and then reading it again could hand
  # back nil if the entry disappeared between the two reads.
  imoveis = CACHE["imobiliariaatual"]
  if imoveis.nil?
    imoveis = []
    search = YQL.search(
      :url => url_encode('http://www.imobiliariaatual.com.br/aluguel.asp'),
      :xpath => url_encode('/html/body/table/tr'),
      :format => 'json'
    )
    search['query']['results']['tr'].each do |row|
      cells = row['td']
      next if cells.nil?

      # Cell 0 holds the listing reference; only "L-CA" references are kept.
      ref = cells[0]
      next if ref.nil? || ref['font'].nil? || ref['font']['content'].nil?
      next unless ref['font']['content'] =~ /L-CA/

      # Cell 1 holds the street. A row without it is skipped instead of
      # geocoding the raw cell structure.
      # (Cells 2 and 7 hold rooms and price; they are not part of the response.)
      street_cell = cells[1]
      next if street_cell.nil? || street_cell['font'].nil? || street_cell['font']['content'].nil?
      rua = street_cell['font']['content'].split('(')[0]

      # Cell 8 holds the relative detail link; emit nil when it is absent
      # rather than the raw cell structure.
      link_cell = cells[8]
      link = nil
      unless link_cell.nil? || link_cell['div'].nil?
        link = ["http://www.imobiliariaatual.com.br/", link_cell['div']['a']['href']].join('')
      end

      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => link
      }
    end
    CACHE["imobiliariaatual"] = imoveis
  end

  # params[:callback] is caller-controlled and echoed into the body: keep only
  # characters that can appear in a JavaScript callback reference. A missing
  # callback becomes an empty string instead of a nil body part.
  callback = params[:callback].to_s.gsub(/[^\w$.\[\]]/, '')
  # Declare the body as JavaScript rather than relying on the framework default.
  content_type 'application/javascript'
  [callback, '(', imoveis.to_json, ')']
end
# http://www.imobiliariadelta.com
#
# GET /imobiliariadelta?callback=<js function>
#
# Responds with the JSONP text `<callback>(<json array>)`, one
# {:geo => [lat, lng], :link => url} element per listing. On a cache miss the
# result page is fetched through YQL as JSON; for every listing table the
# cd_imovel id is taken from its detail link, the detail page is fetched through
# YQL as XML, and the street found there is geocoded with YahooGeocoder. The
# list is stored in memcached under "imobiliariadelta".
#
# If the listing response carries no 'table' results the handler raises before
# anything is cached, so the next request retries the scrape.
get '/imobiliariadelta' do
  # Read memcached once: testing the key and then reading it again could hand
  # back nil if the entry disappeared between the two reads.
  imoveis = CACHE["imobiliariadelta"]
  if imoveis.nil?
    imoveis = []
    listing = YQL.search(
      :url => url_encode('http://www.imobiliariadelta.com/engine.php?id=1444&page=resultado&cd_negocio=1&cd_tipo=11611'),
      :xpath => url_encode('//table[@width="400"]'),
      :format => 'json'
    )
    listing['query']['results']['table'].each do |table|
      # Walk down the nested layout tables to the detail-page href.
      lvl1 = table['tr'][1]['td'][1]
      lvl2 = lvl1['table'] unless lvl1.nil?
      href = lvl2[1]['tr']['td'][0]['table']['tr']['td'][1]['a']['href'] unless lvl2.nil?
      next if href.nil?

      # An href without a cd_imovel parameter is skipped instead of raising on
      # the missing match.
      id_match = /cd_imovel=(.*)/.match(href)
      next if id_match.nil?

      url = "http://www.imobiliariadelta.com/detalhes_geral.php?id=1444&cd_imovel=#{id_match[1]}"
      detail = YQL.search(
        :url => url_encode(url),
        :xpath => url_encode('/html/body'),
        :format => 'xml',
        :on_success => lambda do |response|
          LibXML::XML::Parser.string(response.body).parse
        end
      )
      # One unreadable detail page should not abort the whole scrape.
      # NOTE(review): assumes a failed request surfaces here as nil -- confirm.
      next if detail.nil?
      street_node = detail.find('//body/div/div/div/div/div[4]/p')[0]
      next if street_node.nil?

      rua = street_node.first.to_s.strip
      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => url
      }
    end
    CACHE["imobiliariadelta"] = imoveis
  end

  # params[:callback] is caller-controlled and echoed into the body: keep only
  # characters that can appear in a JavaScript callback reference. A missing
  # callback becomes an empty string instead of a nil body part.
  callback = params[:callback].to_s.gsub(/[^\w$.\[\]]/, '')
  # Declare the body as JavaScript rather than relying on the framework default.
  content_type 'application/javascript'
  [callback, '(', imoveis.to_json, ')']
end
# http://www.imobiliariaavenida.com.br
#
# GET /imobiliariaavenida?callback=<js function>
#
# Responds with the JSONP text `<callback>(<json array>)`, one
# {:geo => [lat, lng], :link => url} element per table row that has both a
# street anchor (cell 0) and a price anchor (cell 2). On a cache miss the rows
# are fetched through YQL, each street is geocoded with YahooGeocoder, and the
# list is stored in memcached under "imobiliariaavenida".
#
# If the YQL response carries no results the handler raises before anything is
# cached, so the next request retries the scrape.
get '/imobiliariaavenida' do
  # Read memcached once: testing the key and then reading it again could hand
  # back nil if the entry disappeared between the two reads.
  imoveis = CACHE["imobiliariaavenida"]
  if imoveis.nil?
    imoveis = []
    # Returns the <a> entry nested under a cell's <div>, or nil when any level
    # is missing.
    anchor_of = lambda { |cell| cell && cell['div'] && cell['div']['a'] }

    search = YQL.search(
      :url => url_encode('http://www.imobiliariaavenida.com.br/corpo_todos.php?txcomercializacao=loca%E7%E3o&txtipo=casa'),
      :xpath => url_encode('//table[@width="95%"]/tr'),
      :format => 'json'
    )
    # 'results' is iterated as key/value pairs; the value is the list of rows.
    search['query']['results'].each do |_name, rows|
      rows.each do |row|
        cells = row['td']
        next if cells.nil?

        # Decide whether the row is usable BEFORE geocoding, so no geocoder
        # request is spent on rows that would be discarded anyway.
        street_anchor = anchor_of.call(cells[0])
        price_anchor = anchor_of.call(cells[2])
        next if street_anchor.nil? || price_anchor.nil?

        # The listing id is the parenthesised argument inside the price
        # anchor's href; skip the row when it is not there.
        id_match = /\((.*)\)/.match(price_anchor['href'].to_s)
        next if id_match.nil?

        # NOTE(review): as written this replaces a space with a space; the
        # first alternative may originally have been a non-breaking space --
        # confirm against the source page.
        rua = street_anchor['font']['content'].gsub(/ |\n/, ' ')
        geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
        imoveis << {
          :geo => [geo.lat, geo.lng],
          :link => ['http://www.imobiliariaavenida.com.br/pop_detalhes.php?cdimovel=', id_match[1]].join('')
        }
      end
    end
    CACHE["imobiliariaavenida"] = imoveis
  end

  # params[:callback] is caller-controlled and echoed into the body: keep only
  # characters that can appear in a JavaScript callback reference. A missing
  # callback becomes an empty string instead of a nil body part.
  callback = params[:callback].to_s.gsub(/[^\w$.\[\]]/, '')
  # Declare the body as JavaScript rather than relying on the framework default.
  content_type 'application/javascript'
  [callback, '(', imoveis.to_json, ')']
end
# http://www.ihimoveis.com.br/
#
# GET /ihimoveis?callback=<js function>
#
# Responds with the JSONP text `<callback>(<json array>)`, one
# {:geo => [lat, lng], :link => url} element per div#destaque_fundo block of the
# listing page. On a cache miss the page is fetched through YQL as XML (with
# <br/> tags removed before parsing), each street is geocoded with
# YahooGeocoder, and the list is stored in memcached under "ihimoveis".
#
# If the YQL call does not yield a parsed document the handler raises before
# anything is cached, so the next request retries the scrape.
get '/ihimoveis' do
  # Read memcached once: testing the key and then reading it again could hand
  # back nil if the entry disappeared between the two reads.
  imoveis = CACHE["ihimoveis"]
  if imoveis.nil?
    imoveis = []
    search = YQL.search(
      :url => url_encode('http://www.ihimoveis.com.br/imoveis.php?situacao=locacao&tipo=3'),
      :xpath => url_encode('//div[@id="destaque_fundo"]'),
      :format => 'xml',
      :on_success => lambda do |response|
        LibXML::XML::Parser.string(response.body.gsub(/<br\/>/, '')).parse
      end
    )
    search.find('//div[@id="destaque_fundo"]').each do |o|
      # The room count ("quarto") in the <strong> tags is not part of the
      # response, so it is no longer extracted.
      street_anchor = o.find('div/a')[0]
      link_anchor = o.find('div[@id="destaques_valor"]/a')[0]
      # Skip a block missing either anchor instead of raising on nil.
      next if street_anchor.nil? || link_anchor.nil?

      # The street is the text after the last <strong> element and before the
      # word "Londrina"; to_s keeps an empty fragment from raising.
      rua = street_anchor.inner_xml.split('strong>')[-1].to_s.split('Londrina')[0].to_s.strip
      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => ["http://www.ihimoveis.com.br/", link_anchor['href']].join('')
      }
    end
    CACHE["ihimoveis"] = imoveis
  end

  # params[:callback] is caller-controlled and echoed into the body: keep only
  # characters that can appear in a JavaScript callback reference. A missing
  # callback becomes an empty string instead of a nil body part.
  callback = params[:callback].to_s.gsub(/[^\w$.\[\]]/, '')
  # Declare the body as JavaScript rather than relying on the framework default.
  content_type 'application/javascript'
  [callback, '(', imoveis.to_json, ')']
end
# http://www.imobiliariasenador.com.br
#
# GET /imobiliariasenador?callback=<js function>
#
# Responds with the JSONP text `<callback>(<json array>)`, one
# {:geo => [lat, lng], :link => url} element per listing row. On a cache miss
# the rows are fetched through YQL, each street is geocoded with YahooGeocoder,
# and the list is stored in memcached under "imobiliariasenador".
#
# If the YQL response carries no 'tr' results the handler raises before anything
# is cached, so the next request retries the scrape.
get '/imobiliariasenador' do
  # Read memcached once: testing the key and then reading it again could hand
  # back nil if the entry disappeared between the two reads.
  imoveis = CACHE["imobiliariasenador"]
  if imoveis.nil?
    imoveis = []
    search = YQL.search(
      :url => url_encode('http://www.imobiliariasenador.com.br/corpo_todos.php?txcomercializacao=loca%E7%E3o&txtipo=casa'),
      :xpath => url_encode('/html/body/table/tr/td/div/form/table/tr/td/table/tr[2]/td/table/tr'),
      :format => 'json'
    )
    search['query']['results']['tr'].each do |row|
      listing = row['td']['table']

      # The listing id is the parenthesised argument of the table's onclick
      # handler; skip the row when it is not there instead of raising.
      id_match = /\((.*)\)/.match(listing['onclick'].to_s)
      next if id_match.nil?

      # NOTE(review): as written the first gsub removes every space from the
      # street before geocoding; it may originally have targeted a
      # non-breaking space -- confirm against the source page.
      rua = listing['tr'][1]['td'][0]['div']['font']['font']['font']['content'].gsub(/ /, '').gsub(/\n/, ' ')
      geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
      # (Cells 1 and 6 of the same inner row hold rooms and price; they are
      # not part of the response.)
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => ["http://www.imobiliariasenador.com.br/pop_detalhes.php?cdimovel=", id_match[1]].join('')
      }
    end
    CACHE["imobiliariasenador"] = imoveis
  end

  # params[:callback] is caller-controlled and echoed into the body: keep only
  # characters that can appear in a JavaScript callback reference. A missing
  # callback becomes an empty string instead of a nil body part.
  callback = params[:callback].to_s.gsub(/[^\w$.\[\]]/, '')
  # Declare the body as JavaScript rather than relying on the framework default.
  content_type 'application/javascript'
  [callback, '(', imoveis.to_json, ')']
end
# http://www.sub100.com.br
#
# GET /imobiliariaperez?callback=<js function>
#
# Responds with the JSONP text `<callback>(<json array>)`, one
# {:geo => [lat, lng], :link => url} element per listing. On a cache miss the
# paginated result list (PV=1, 2, ...) is walked through YQL; for every entry
# with a detail link the detail page is fetched through YQL as XML (with <br/>
# tags removed before parsing) and its street is geocoded with YahooGeocoder.
# Paging stops when a request returns nothing, when a page has no results, or
# when a page yields fewer than 15 detail links. The list is stored in
# memcached under "imobiliariaperez".
get '/imobiliariaperez' do
  # Read memcached once: testing the key and then reading it again could hand
  # back nil if the entry disappeared between the two reads.
  imoveis = CACHE["imobiliariaperez"]
  if imoveis.nil?
    imoveis = []
    pagina = 1
    loop do
      listing = YQL.search(
        :url => url_encode("http://www.sub100.com.br/empresas/imob/imobiliariaperez/mostra_resultado_rapido.php?b_negocio=Locacao&b_tipo=CASAS&b_cidade=57&b_bairro=TODOS&b_dormitorios=TODOS&b_valores=TODOS&PV=#{pagina}"),
        :xpath => url_encode('//div[@class="AT_lista"]'),
        :format => 'json'
      )
      # Same exit as the former `while search = ...` condition.
      break unless listing

      # A page past the last one has no results; treat that as the end of the
      # listing instead of raising on nil.
      results = listing['query']['results']
      break if results.nil?

      found = 0
      # Wrap-and-flatten so a lone entry is handled like a one-element list.
      # NOTE(review): presumably YQL returns a single match as a bare object
      # rather than an array -- confirm.
      [results['div']].flatten.compact.each do |item|
        blocks = item['div']
        next if blocks.nil? || blocks[1].nil? || blocks[1]['span'].nil? || blocks[1]['span']['a'].nil?
        url = blocks[1]['span']['a']['href']
        next if url.nil?
        # Count the entry as soon as its link is known so that a broken detail
        # page cannot end pagination early.
        found += 1

        detail = YQL.search(
          :url => url_encode("http://www.sub100.com.br/empresas/imob/imobiliariaperez/#{url}"),
          :xpath => url_encode('//body'),
          :format => 'xml',
          :on_success => lambda do |response|
            LibXML::XML::Parser.string(response.body.gsub(/<br\/>/, '')).parse
          end
        )
        # One unreadable detail page should not abort the whole scrape.
        # NOTE(review): assumes a failed request surfaces here as nil -- confirm.
        next if detail.nil?
        street_node = detail.find('//td[@class="clCell2"]/p')[0]
        next if street_node.nil?

        # (The price in td.clCell2/strong is not part of the response, so it
        # is no longer extracted.)
        rua = street_node.first.to_s.gsub(/&#xA0;/, '').strip
        geo = YahooGeocoder.geocode([rua, ' - Londrina'].join(''))
        imoveis << {
          :geo => [geo.lat, geo.lng],
          :link => ["http://www.sub100.com.br/empresas/imob/imobiliariaperez/", url].join('')
        }
      end

      # Fewer than 15 linked entries on a page means it was the last page.
      break if found < 15
      pagina += 1
    end
    CACHE["imobiliariaperez"] = imoveis
  end

  # params[:callback] is caller-controlled and echoed into the body: keep only
  # characters that can appear in a JavaScript callback reference. A missing
  # callback becomes an empty string instead of a nil body part.
  callback = params[:callback].to_s.gsub(/[^\w$.\[\]]/, '')
  # Declare the body as JavaScript rather than relying on the framework default.
  content_type 'application/javascript'
  [callback, '(', imoveis.to_json, ')']
end
# Landing page: renders the :index ERB template.
get('/') { erb :index }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment