Skip to content

Instantly share code, notes, and snippets.

@burningtree
Created April 8, 2013 03:44
Show Gist options
  • Select an option

  • Save burningtree/5334104 to your computer and use it in GitHub Desktop.

Select an option

Save burningtree/5334104 to your computer and use it in GitHub Desktop.
#!/usr/local/bin/ruby
# encoding: utf-8
require 'rubygems'
require 'open-uri'
require 'digest/sha1'
require 'mysql'
require 'iconv'
require 'nokogiri'
require 'unicode'
# Open the MySQL connection used for every insert below and ask for UTF-8 on
# it twice: through the client charset option and with an explicit SET NAMES.
# NOTE(review): the charset option is set after the connection is already
# established; whether it still takes effect at that point depends on the
# mysql binding in use — confirm. The SET NAMES query covers it either way.
db = Mysql.connect('localhost','user','password','db').tap do |conn|
  conn.options(Mysql::SET_CHARSET_NAME, 'utf8')
  conn.query('SET NAMES utf8')
end
# Returns a copy of +str+ in which every single quote, double quote,
# backslash and NUL byte is preceded by a backslash, so the text can be placed
# inside a quoted SQL string literal.
#
# @param str [String] raw text
# @return [String] new string with the special characters backslash-escaped
def addslashes(str)
  # Block form: each matched character is re-emitted with one leading backslash.
  str.gsub(/['"\\\x0]/) { |special| '\\' + special }
end
# Returns the page at +url+ parsed as an HTML document, keeping the raw
# response body in an on-disk cache so each URL is downloaded at most once.
#
# The cache file is cache/<SHA1 hex digest of url>, relative to the current
# working directory; the directory is created on first use.
#
# @param url [String] absolute http(s)/ftp URL of the page to fetch
# @return [Nokogiri::HTML::Document] the cached body, read as UTF-8 and parsed
# @raise [StandardError] whatever open-uri raises when the download fails; in
#   that case nothing is written to the cache
def get_url(url)
  cache_dir = 'cache/'
  filename = cache_dir + Digest::SHA1.hexdigest(url)
  unless File.exist?(filename)
    # Download BEFORE touching the cache: creating the file first would leave
    # an empty entry behind on a failed fetch, and later runs would treat that
    # empty file as a valid cached page.
    # URI#read comes from open-uri and works on both old and current Rubies,
    # unlike Kernel#open, which no longer accepts URLs on Ruby 3.0+.
    body = URI.parse(url).read
    Dir.mkdir(cache_dir) unless File.directory?(cache_dir)
    # Binary mode stores the response bytes verbatim; the block form closes
    # the handle even if the write raises.
    File.open(filename, 'wb') { |f| f.write(body) }
    #`iconv -f latin2 -t utf-8 #{filename} > #{filename}.conv`
  end
  html = File.open(filename, 'r:UTF-8') { |f| f.read }
  Nokogiri::HTML(html)
end
# Scrapes per-candidate results from volby.cz (kv2010) and stores one row per
# candidate in the `ei_kv2010` MySQL table.
#
# Traversal, one get_url page per level:
#   1. the kv22 overview page -> one table row per region
#   2. the region page        -> one table row per municipality ("obce")
#   3. the municipality page  -> one link per party ("strana")
#   4. the party page         -> one table row per candidate
# After each party page the number of processed candidate rows is compared
# with the count parsed from the page's last <h3>; a mismatch aborts the run.

# Tokens recognised as titles when they follow the first two words of a
# candidate's name. Built once instead of once per candidate row.
TITLES_ALLOW = [
  'PhDr.', 'Bc.', 'Ing.', 'Mgr.', 'JUDr.', 'Ph.D.', 'MBA', 'Ph.D.', 'RNDr.', 'Doc.', 'CSc.', 'DrSc.',
  'MUDr.', 'MgA.', 'D.', 'DSc.', 'CSc', 'doc.', 'Prof.', 'PaedDr.', 'ThLic.', 'prof.', 'ak.arch.', 'ak.mal.',
  'akad.mal.', 'Akad.soch.', 'B.A.', 'BBA', 'BBs.', 'Bc.', 'Bc.et', 'BcA.', 'BSc.', 'CIM,DMS', 'CSc.', 'D.S',
  'dipl.mngm.', 'dipl.um.', 'dipl.um.,DiS.', 'diplom.ekonom', 'DiS', 'DiS.', 'Bc.,', 'CSc.,', 'Dis.',
  'doc.,Ing.', 'Doc.Ing.', 'doc.MUDr', 'doc.PhDr.', 'Doc.RNDr.', 'Dr.', 'Dr.Ing.', 'ek.', 'et', 'FESC',
  'ing.', 'Ing.,', 'Ing.arch.', 'JUDr.,', 'LL.M.', 'M.A.', 'M.EM', 'M.Sc.', 'Mag.phil.', 'MBA.', 'Mgr.,',
  'Mgr.at', 'Mgr.Bc.', 'Mgr.et', 'Mgr.Ing.', 'MIB', 'MPA', 'MSc.', 'MSc.Ph.D.', 'MUDr.,', 'MVDr.',
  'Paed.Dr.', 'Ph.', 'Ph.D', 'Ph.D.,', 'PharmDr.', 'PharmDr.MUDr.', 'PhD.', 'PhDr.,', 'Prof.Ing.',
  'Prof.MUDr.', 'prof.RNDr.', 'prom.biol.', 'RNDr.,', 'RSDr.', 'RSDr.,', 'soch.', 'Th.D.', 'ThDr.', 'um.',
  'doc.MUDr.', 'Th.Mgr.', 'doc.MUDr.', 'ING.', 'Dipl.', 'FICS.', 'Doc.MUDr.', 'dipl.', 'JUDr.et', 'Ing.Bc.',
  'SCS.Lic.', 'doc.Ing.', 'Ph.D.,MBA', 'CSc.,dr.h.c.', 'PaeDr.', 'MSc.,', 'MUDr.Bc.', 'Ing.et.Ing.',
  'Ph.Dr.', 'doc.Mgr.', 'et.', 'Dipl.-Kfm.', 'SCS.Lic.', 'doc.Ing.', 'diplomov.ekonom', 'MBA,', 'Lic.',
  # 'Doc.PhDr.' and 'arch.' used to be written without a comma between them,
  # which silently merged them into the single entry 'Doc.PhDr.arch.'.
  'MgrA.', 'JUDr.PhDr.', 'inž.', 'ekonom', 'Plk.', 'Prim.', 'Doc.PhDr.', 'arch.', 'ak.', 'AKAD.ARCH.,',
  'Doc.MUDr.', 'dipl.', 'Pharm.', 'ml.', 'PhD', 'MgA.,Mgr.', 'Ing.Bc.', 'JUDr.et', 'Doc.Dr.', 'PhDr.Mgr.',
  'ThMgr.', 'Pharm.Dr.', 'Ing.Mgr.', 'Ing.,Ph.D.', 'M.B.A.', 'Mgr.A.', 'arch.', 'Dipl.Ing.', 'Dipl.um.',
  'diplom.', 'diplom.umělkyně', 'Ing', 'Ing.,Mgr.', 'Ing.Ing.', 'LL.A.', 'mal.', 'malíř', 'MDDr.', 'MgA',
  'PHDr.', 'prof.PhDr.', 'Ak.', 'Akad.', 'Ba', 'Doc.,', 'Doc.,mudr.', 'Doc.phdr.', 'Npor.',
].freeze

base_url = 'http://volby.cz/pls/kv2010/'
url = base_url+'kv22?xjazyk=CZ&xid=1&xv=21'
doc = get_url(url)

doc.css('table tr').each do |row|
  region_cells = row.css('td')
  next unless region_cells[0] # rows without any <td> are skipped

  region_name = region_cells[1].content
  region_url = base_url + region_cells[2].css('a')[0].attribute('href').content
  region = get_url(region_url)
  region_kraj = region.css('h3')[1].content.match(/Kraj: (.+)/)[1]
  print "#{region_name} / #{region_kraj}\n"

  region.css('table tr').each do |rr|
    td = rr.css('td')
    next if td.size < 1

    obce_name = td[1].content
    strany_url = base_url + td[0].css('a')[0].attribute('href').content
    print obce_name+"\n => "
    strany = get_url(strany_url)

    strany.css('.tabulka1 table tr').each do |srr|
      links = srr.css('a')
      next if links.size < 1

      obce_url = base_url + links[0].attribute('href').content
      next if obce_url.match(/xstrana=0/)

      strana = links[0].content
      obce = get_url(obce_url)
      cands_size = 0

      obce.css('table tr').each do |orr|
        tds = orr.css('td')
        next if tds.size < 1

        xobec = obce_url.match(/xobec=(\d+)/)[1]
        if tds.size == 14
          # 14-cell layout: the row itself carries obvod and the party columns.
          candidate = {
            :obvod => tds[0].content,
            :party_pos => tds[1].content,
            :party_kand => tds[2].content,
            :pos => tds[3].content,
            :name => tds[4].content,
            :age => tds[5].content,
            :party_navrh => tds[6].content,
            :party_prisl => tds[7].content,
            :work => tds[8].content,
            :home => tds[9].content,
            :votes => tds[10].content.gsub(/\302\240/,''),
            :votes_perc => tds[11].content.gsub(/,/,'.').to_f,
            :elected_pos => tds[12].content,
            :obce => obce_name,
            :obce_id => xobec,
            :region => region_name,
            :kraj => region_kraj,
          }
        elsif tds.size == 11
          # 11-cell layout: no obvod/party cells, so obvod is stored as 0 and
          # the party name is taken from the link text that led to this page.
          candidate = {
            :obvod => 0,
            :pos => tds[0].content,
            :name => tds[1].content,
            :age => tds[2].content,
            :party_navrh => tds[3].content,
            :party_prisl => tds[4].content,
            :work => tds[5].content,
            :home => tds[6].content,
            :votes => tds[7].content.gsub(/\302\240/,''),
            :votes_perc => tds[8].content.gsub(/,/,'.').to_f,
            :elected_pos => tds[9].content,
            :obce => obce_name,
            :obce_id => xobec,
            :region => region_name,
            :kraj => region_kraj,
            :party_kand => strana,
          }
        else
          # Unknown row layout: previously `candidate` stayed nil here and the
          # next statement crashed with NoMethodError. Skip the row instead;
          # if it was a real candidate, the count check below still aborts.
          next
        end
        candidate[:region_nuts] = obce_url.match(/xnumnuts=(\d+)/)[1]

        name_expl = candidate[:name].split(' ')
        if name_expl.size > 2
          # Only the tokens after the first two words are tested against the
          # title list; every recognised title is removed from the name.
          titles = name_expl[2..-1].select { |token| TITLES_ALLOW.include?(token) }
          titles.each { |token| name_expl.delete(token) }
          candidate[:title] = titles.join(';')
        end

        # The last remaining word is stored as the first name, everything
        # before it as the surname.
        candidate[:firstname] = Unicode::capitalize(Unicode::downcase(name_expl.last))
        name_expl.delete_at(name_expl.size - 1)
        candidate[:surname] = name_expl.map { |word| Unicode::capitalize(Unicode::downcase(word)) }.join(" ")
        cands_size += 1

        # Every value is escaped at the point where the statement is built, not
        # just :name as before, because all text fields come from scraped pages
        # and may contain quotes.
        columns = candidate.keys.map { |key| "`#{key}`" }.join(',')
        values = candidate.values.map { |val| "'" + addslashes(val.to_s) + "'" }.join(',')
        q = "insert into ei_kv2010 (#{columns}) values (#{values})"
        begin
          db.query(q)
          print '.'
        rescue StandardError => e
          # StandardError rather than Exception, so interrupts and exit
          # requests are not swallowed by this handler.
          if e.to_s.match(/Duplicate entry/)
            print 'd'
          else
            p e
            exit
          end
        end
      end

      # Sanity check: the page's last <h3> carries the expected candidate count.
      volby_size = obce.css('h3').last.content.match(/:\302\240\302\240(\d+)/)[1].to_i
      if volby_size != cands_size
        print "chyba! nesouhlasi pocet importovanych zastupitelu!! #{volby_size} vs #{cands_size} "
        exit
      end
    end
    puts ''
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment