Skip to content

Instantly share code, notes, and snippets.

@skateman
Created June 7, 2015 10:40
Show Gist options
  • Save skateman/d1c1bdccda7405f1c6ed to your computer and use it in GitHub Desktop.
Save skateman/d1c1bdccda7405f1c6ed to your computer and use it in GitHub Desktop.
Slovakian settlement DB scraper
require 'json'
require 'open-uri'
require 'nokogiri'
require 'ruby-progressbar'
# Falugiri: walks the paginated settlement list at BASEURL, turns every
# 'div.settlement-item' into a hash and dumps the collected records to db.json.
BASEURL = "http://adatbank.sk/telepulesek/szlovakia-telepuleseinek-listaja".freeze

# Title text: "<hu name> [<sk name>] /<type>/" -- the bracketed part is optional.
TITLE_PATTERN = %r{^(?<hu>[^/\[\]]+) (?:\[(?<sk>.+)\] )?/(?<type>.+)/$}

# Meta text: "Kód: <digits>, Járás: <district>, Kerület: <county>, Régió: <region>".
META_PATTERN = /^Kód: (?<code>\d+), Járás: (?<district>.+), Kerület: (?<county>.+), Régió: (?<region>.+)$/

# Inline map script: the two arguments of the first "new google.maps.LatLng(" call.
COORDS_PATTERN = /new google\.maps\.LatLng\((?<lat>\d*\.?\d*),\s?(?<lng>\d*\.?\d*)/

# Downloads +url+ and parses it into a Nokogiri HTML document.
#
# URI.open is used instead of Kernel#open: since Ruby 3.0 open-uri no longer
# makes Kernel#open accept URLs. The block form closes the stream after parsing.
#
# @param url [String] address of the page to fetch
# @return [Nokogiri::HTML::Document]
def fetch_page(url)
  URI.open(url) { |io| Nokogiri::HTML(io) }
end

# Extracts the coordinates from the settlement's inline script, if any.
#
# @param settlement [Nokogiri::XML::Node] one 'div.settlement-item' node
# @return [Hash] :lat and :lng as the strings found in the script, or nil
#   values when there is no script or it contains no LatLng call
def parse_coords(settlement)
  script = settlement.css('script').children.first
  found = script && COORDS_PATTERN.match(script.content)
  { lat: found && found[:lat], lng: found && found[:lng] }
end

# Builds the database record for a single settlement node.
#
# @param settlement [Nokogiri::XML::Node] one 'div.settlement-item' node
# @return [Hash, nil] the record, or nil when the title or meta text does not
#   have the expected format
def parse_settlement(settlement)
  # A missing node yields nil here; to_s turns that into "" so the patterns
  # below simply fail to match instead of raising.
  title_text = settlement.css('.settlement-title span').children.first&.content.to_s.strip
  meta_text = settlement.css('.settlement-content.entry').children.first&.content.to_s.strip

  title = TITLE_PATTERN.match(title_text)
  meta = META_PATTERN.match(meta_text)
  return nil unless title && meta

  {
    code: meta[:code],
    type: title[:type],
    # Only settlements that carry a bracketed second name get the hu/sk pair.
    title: title[:sk] ? { hu: title[:hu], sk: title[:sk] } : title[:hu],
    district: meta[:district],
    county: meta[:county],
    region: meta[:region],
    coords: parse_coords(settlement)
  }
end

db = []
doc = fetch_page(BASEURL)

# The text of the last pagination link is taken as the page count. The first
# page is already downloaded, so at least one page is always processed, even
# when the listing has no pagination links at all.
last_page_link = doc.css('a.page-numbers').last
pages_num = [last_page_link ? last_page_link.children.first.to_s.to_i : 1, 1].max

puts "Falugiri - a settlement database scraper based on Nokogiri"
bar = ProgressBar.create(
  total: pages_num,
  title: "Scraping",
  format: "%t %p%%: |%B| %E",
  progress_mark: '-'
)

1.upto(pages_num) do |page_no|
  # Page 1 was fetched above; the following ones live under /page/<n>.
  doc = fetch_page("#{BASEURL}/page/#{page_no}") if page_no > 1

  doc.css('div.settlement-item').each do |settlement|
    record = parse_settlement(settlement)
    if record
      db << record
    else
      # One malformed entry must not throw away everything scraped so far.
      warn "Skipping settlement with unexpected format on page #{page_no}"
    end
  end

  bar.increment
end

File.write('db.json', JSON.pretty_generate(db))
puts "Saved to db.json!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment