Skip to content

Instantly share code, notes, and snippets.

@javierarce
Created May 4, 2011 11:23
Show Gist options
  • Save javierarce/955087 to your computer and use it in GitHub Desktop.
Save javierarce/955087 to your computer and use it in GitHub Desktop.
INE scraper
require 'rubygems'
require "pp"
require 'capybara'
require 'capybara/dsl'
# Point Capybara at the remote INE site through the Selenium driver, with its
# own app server switched off.
Capybara.configure do |config|
  config.run_server = false
  config.current_driver = :selenium
  config.app_host = 'http://www.ine.es'
  # NOTE(review): Capybara documents this wait in seconds, so 10000 is close to
  # three hours -- confirm that is intended rather than milliseconds.
  config.default_wait_time = 10000
end
# Namespace for the www.ine.es scraping script.
module Ine
  # Walks the 52 per-province tables on www.ine.es with a Capybara-driven
  # browser and appends every municipality row it finds to one timestamped
  # CSV file in the current directory.
  class Scraper
    # Current Capybara keeps the browser helpers (visit, select, within, all,
    # page, click_on) in Capybara::DSL; including the bare Capybara module is
    # the deprecated pre-1.0 spelling. Use the DSL module whenever it exists
    # and only fall back to the old form on releases that predate it.
    include(defined?(Capybara::DSL) ? Capybara::DSL : Capybara)

    # Province names in table order: index 0 goes with file code "01",
    # index 51 with "52". Only used for the progress output.
    PROVINCIAS = [
      'Álava', 'Albacete', 'Alicante', 'Almería', 'Ávila', 'Badajoz',
      'Islas Baleares', 'Barcelona', 'Burgos', 'Cáceres', 'Cádiz',
      'Castellón', 'Ciudad Real', 'Córdoba', 'A Coruña', 'Cuenca', 'Girona',
      'Granada', 'Guadalajara', 'Guipúzcoa', 'Huelva', 'Huesca', 'Jaén',
      'León', 'Lleida', 'Rioja(La)', 'Lugo', 'Madrid', 'Málaga', 'Murcia',
      'Navarra', 'Ourense', 'Asturias', 'Palencia', 'Las Palmas',
      'Pontevedra', 'Salamanca', 'Santa Cruz de Tenerife', 'Cantabria',
      'Segovia', 'Sevilla', 'Soria', 'Tarragona', 'Teruel', 'Toledo',
      'Valencia', 'Valladolid', 'Vizcaya', 'Zamora', 'Zaragoza', 'Ceuta',
      'Melilla'
    ].freeze

    # Header row, kept byte-for-byte (including the space before
    # "municipio_id") so new files stay comparable with earlier runs.
    # The numeric column names are presumably age-band labels -- confirm
    # against the INE table.
    CSV_HEADER = "id,provincia_id, municipio_id,nombre,total,2,7,12,17,22,27,32,37,42,47,52,57,62,67,72,77,82,87"

    # A data row as rendered text: a five-digit code, one separator
    # character, the name, then whitespace followed by the figures (the
    # first of which must start with a digit).
    ROW_PATTERN = /^(\d\d\d\d\d).(.*?)\s(\d.*)/

    # Runs the whole scrape.
    #
    # Reads the year from ARGV[0] (default 2009), creates
    # "edad_<year>_<unix time>.csv" with the header row, then visits each
    # province table in turn and appends its rows to that file. Progress is
    # printed to stdout.
    def init
      year = ARGV[0] || 2009
      puts "Analizando #{year}"
      puts "---------------"
      filename = "edad_#{year}_#{Time.now.to_i}.csv"
      File.open(filename, 'w') { |f| f.write(CSV_HEADER + "\n") }

      PROVINCIAS.each_with_index do |nombre, index|
        # The table file names use a zero-padded two-digit province code.
        codigo = format('%02d', index + 1)
        puts "#{codigo} #{nombre}"

        rows = fetch_rows(year, codigo)
        File.open(filename, 'a') do |f|
          rows.each do |row_text|
            line = csv_line(row_text)
            f.write(line) if line
          end
        end
      end
    end

    private

    # Opens the table for one province, selects "Ambos sexos" plus every
    # entry of the two remaining selectors, submits the query and returns the
    # text of each <tr> of the result table as an array of strings.
    def fetch_rows(year, codigo)
      visit "http://www.ine.es/jaxi/tabla.do?path=/t20/e245/p05/a#{year}/l0/&file=000#{codigo}001.px&type=pcaxis&L=0"
      select("Ambos sexos", :from => "cri1")
      # todos(...) is a function defined by the page itself; it is called
      # here for the cri2 and cri3 selectors.
      page.execute_script("todos(true,'cri2','sel_2')")
      page.execute_script("todos(true,'cri3','sel_3')")
      click_on('consulsele')

      rows = []
      within("div#capaTabla td.tableData table") do
        all("tr").each { |tr| rows << tr.text }
      end
      rows
    end

    # Converts the text of one table row into a CSV line (newline included),
    # or returns nil when the row does not look like a municipality row
    # (headers, totals, blank rows).
    #
    # The name is quoted, with embedded double quotes doubled, because a name
    # containing a comma would otherwise shift every following column.
    def csv_line(row_text)
      cols = ROW_PATTERN.match(row_text)
      return nil unless cols

      code = cols[1]
      provincia_id = code[0, 2]  # first two digits of the five-digit code
      municipio_id = code[2..-1] # remaining three digits
      nombre = cols[2].gsub('"', '""')
      figures = cols[3].split(" ").join(",")

      "\"#{code}\",\"#{provincia_id}\",\"#{municipio_id}\",\"#{nombre}\",#{figures}\n"
    end
  end
end
# Script entry point: build a scraper and start the crawl immediately.
Ine::Scraper.new.init
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment