Created
October 24, 2012 14:11
-
-
Save jahan-paisley/3946269 to your computer and use it in GitHub Desktop.
Extract census geographic units of Iran from Interior Ministry Website
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
require 'nokogiri'
require 'open-uri'
# NOTE(review): no breakpoint remains in this script; the require is kept only
# so the file's dependency list is unchanged.
require 'debugger'
require 'net/http'

# Fetches PAGE_URL, extracts the text of every table cell matched by
# CELL_SELECTOR and writes it to OUTPUT_PATH, one cell per line. After each
# page the form inputs found on that page are posted back to the same URL to
# request further results.

PAGE_URL = 'http://portal2.moi.ir/Portal/Home/Default.aspx?CategoryID=8f931308-c67e-4cf4-a5e7-3c1bbb1a6f32'
CELL_SELECTOR = '#WebPart_989ee239_8ce2_413d_b7e0_45c9b24dcde0 table.dg tr td'
FORM_INPUT_SELECTORS = ['input[name*=__]', 'input[name*=_]', 'input[id*=__]'].freeze
OUTPUT_PATH = 'testfile.txt'

# The block form of File.open closes the file even if a request or the parser
# raises part-way through.
File.open(OUTPUT_PATH, 'wb') do |file|
  uri = URI.parse(PAGE_URL)
  http = Net::HTTP.new(uri.host, uri.port)
  response = http.request(Net::HTTP::Get.new(uri.request_uri))
  previous_cells = nil

  while response.code == '200'
    # Re-parse every response: the CSS queries below need a Nokogiri document,
    # not the raw body string.
    doc = Nokogiri::HTML(response.body)
    cells = doc.css(CELL_SELECTOR).map(&:content)

    # Stop when a page yields nothing new; otherwise the loop would run for as
    # long as the server keeps answering 200.
    break if cells.empty? || cells == previous_cells

    cells.each { |cell| file.puts(cell) }
    previous_cells = cells

    # Echo the page's form inputs back to the server; inputs without a name
    # cannot be submitted and are skipped.
    form_data = {}
    doc.css(*FORM_INPUT_SELECTORS).each do |input|
      form_data[input['name']] = input['value'] if input['name']
    end
    # NOTE(review): the postback argument is always '1'; confirm that this is
    # what makes the portal advance to the next page.
    form_data['__EVENTARGUMENT'] = '1'

    post_request = Net::HTTP::Post.new(uri.request_uri)
    post_request.set_form_data(form_data)
    response = http.request(post_request)
  end

  # Report an abnormal stop instead of ending silently.
  warn "Stopped: server answered HTTP #{response.code}" unless response.code == '200'
end

####
# Alternative: search for nodes by XPath.
#doc.xpath('//h3/a').each do |link|
#puts link.content
#end
####
# Alternative: mix CSS and XPath selectors in a single search.
#doc.search('h3.r a.l', '//h3/a').each do |link|
#puts link.content
#end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment