mejibyte · November 24, 2010 00:07 · nhocki · Nov 24, 2010 · mejibyte · Nov 25, 2010
diff --git a/gistfile1.rb b/gistfile1.rb
 # -*- coding: utf-8 -*-

 # This script will download the meaning of all names found on http://www.misabueso.com/nombres/
 # Date: July 20, 2009, at 2:14AM
 # Author: Andrés Mejía <[email protected]>

 # The format of the output file is as follows.
 # For each name fetched, there are two lines:
 # Line 1: The name itself
 # Line 2: A valid HTML chunk containing the meaning of the name and some other stuff

 require 'rubygems'
 require 'hpricot'
 require 'open-uri'
 require 'iconv'

 # Downloads +url+ and parses the response into an Hpricot document.
 #
 # The body is transcoded to UTF-8 from the charset reported by the opened
 # resource before it is handed to Hpricot. When anything goes wrong the error
 # is printed and the download is attempted again after 30 minutes. Retries are
 # unbounded, so this method only returns once a fetch and parse succeed.
 #
 # url - String address of the page to fetch.
 #
 # Returns whatever Hpricot() produces for the transcoded body.
 def element_from_url(url)
  loop do
    begin
      # Block form of open: the handle is released even when transcoding or
      # parsing raises, instead of being left open until garbage collection.
      return open(url) { |f| Hpricot(Iconv.conv('UTF-8', f.charset, f.read)) }
    rescue StandardError => e
      # StandardError rather than Exception, so Interrupt (Ctrl-C) and
      # SystemExit still stop the script instead of scheduling a retry.
      print "Exception caught: "
      puts e.inspect
      puts "Retrying in 1800 seconds..."
      sleep(1800)
    end
  end
 end

 # Fetches the page for one name and appends its meaning to +output_file+.
 #
 # Two lines are written per name: the name itself, then a single line of HTML
 # wrapping the page's header list and its analysis section. If either section
 # cannot be located, nothing is written and an error line is printed instead.
 #
 # name        - String name as shown in the index link.
 # url         - String address of the name's page.
 # output_file - IO-like object that receives the two lines through puts.
 #
 # Always pauses 20 seconds before returning (presumably to throttle requests).
 def do_name(name, url, output_file)
  puts "  About to fetch '#{url}'..."
  doc = element_from_url(url)
  puts "   received."

  # .first is nil when a selector matches nothing, so each node is checked
  # before inner_html is called on it; otherwise the error branch below could
  # never run and a missing section would abort the whole crawl.
  header_node = doc.search("html > body > div#contenido > div.incont > ul.li1").first
  content_node = doc.search("html > body > div#contenido > div.incont > div > div.r3 > div.inr3").first

  header = nil
  content = nil
  if header_node
    # Collapse newline/tab runs into one space and strip the <li> wrappers.
    header = header_node.inner_html.gsub(/[\n\t\r]+/, ' ').gsub(/<li>/, '').gsub(/<\/li>/, '')
  end
  if content_node
    # NOTE: this rewrites every literal "h6" (tags and plain text alike) to "h3".
    content = content_node.inner_html.gsub(/[\n\t\r]+/, ' ').gsub(/h6/, "h3")
  end

  if content && header
    output_file.puts name
    output_file.puts "<div class=\"name_meaning\">#{header}</div> <div class=\"name_analysis\">#{content}</div>"
  else
    puts "[ERROR] Problems fetching. Name = #{name}, content = #{content}, header = #{header}"
  end
  sleep(20)
 end

 # Processes every name listed on the index page for +letter+.
 #
 # The index page for the upcased letter is fetched, and each link whose href
 # looks like an individual name page is passed to do_name. Results go to
 # "<letter>.txt" in the current directory, which is truncated on every run.
 #
 # letter - String letter; upcased for the URL, used as given for the file name.
 #
 # Returns nil.
 def do_names_that_start_with(letter)
  url = "http://www.misabueso.com/nombres/nombre_#{letter.upcase}.html"
  puts "About to fetch '#{url}'..."
  doc = element_from_url(url)
  puts " received."

  # Block form so the file is flushed and closed even if do_name raises.
  File.open(letter + ".txt", "w") do |output_file|
    doc.search("a").each do |link| # every anchor on the index page
      uri = link["href"]
      # A regexp literal keeps the dot before "html" literal; inside a
      # double-quoted string "\." collapses to "." and matches any character.
      # Requiring at least two characters after "nombre_" skips the
      # single-letter index pages such as nombre_A.html.
      next unless uri && uri =~ /nombre_.{2,}\.html/
      do_name(link.inner_html, "http://www.misabueso.com/nombres/" + uri, output_file)
    end
  end
  nil
 end


 # Runs the per-letter download for each letter from "A" through "Z", in order.
 def do_everything
  "A".upto("Z") { |initial| do_names_that_start_with(initial) }
 end

 # Script entry point: kick off the full A-Z download as soon as this file is loaded.
 do_everything
	# -*- coding: utf-8 -*-

	# This script will download the meaning of all names found on http://www.misabueso.com/nombres/
	# Date: July 20, 2009, at 2:14AM
	# Author: Andrés Mejía <[email protected]>

	# The format of the output file is as follows.
	# For each name fetched, there are two lines:
	# Line 1: The name itself
	# Line 2: A valid HTML chunk containing the meaning of the name and some other stuff

	require 'rubygems'
	require 'hpricot'
	require 'open-uri'
	require 'iconv'

	# Downloads +url+ and parses the response into an Hpricot document.
	#
	# The body is transcoded to UTF-8 from the charset reported by the opened
	# resource before it is handed to Hpricot. When anything goes wrong the error
	# is printed and the download is attempted again after 30 minutes. Retries are
	# unbounded, so this method only returns once a fetch and parse succeed.
	#
	# url - String address of the page to fetch.
	#
	# Returns whatever Hpricot() produces for the transcoded body.
	def element_from_url(url)
	  loop do
	    begin
	      # Block form of open: the handle is released even when transcoding or
	      # parsing raises, instead of being left open until garbage collection.
	      return open(url) { |f| Hpricot(Iconv.conv('UTF-8', f.charset, f.read)) }
	    rescue StandardError => e
	      # StandardError rather than Exception, so Interrupt (Ctrl-C) and
	      # SystemExit still stop the script instead of scheduling a retry.
	      print "Exception caught: "
	      puts e.inspect
	      puts "Retrying in 1800 seconds..."
	      sleep(1800)
	    end
	  end
	end

	# Fetches the page for one name and appends its meaning to +output_file+.
	#
	# Two lines are written per name: the name itself, then a single line of HTML
	# wrapping the page's header list and its analysis section. If either section
	# cannot be located, nothing is written and an error line is printed instead.
	#
	# name        - String name as shown in the index link.
	# url         - String address of the name's page.
	# output_file - IO-like object that receives the two lines through puts.
	#
	# Always pauses 20 seconds before returning (presumably to throttle requests).
	def do_name(name, url, output_file)
	  puts " About to fetch '#{url}'..."
	  doc = element_from_url(url)
	  puts " received."

	  # .first is nil when a selector matches nothing, so each node is checked
	  # before inner_html is called on it; otherwise the error branch below could
	  # never run and a missing section would abort the whole crawl.
	  header_node = doc.search("html > body > div#contenido > div.incont > ul.li1").first
	  content_node = doc.search("html > body > div#contenido > div.incont > div > div.r3 > div.inr3").first

	  header = nil
	  content = nil
	  if header_node
	    # Collapse newline/tab runs into one space and strip the <li> wrappers.
	    header = header_node.inner_html.gsub(/[\n\t\r]+/, ' ').gsub(/<li>/, '').gsub(/<\/li>/, '')
	  end
	  if content_node
	    # NOTE: this rewrites every literal "h6" (tags and plain text alike) to "h3".
	    content = content_node.inner_html.gsub(/[\n\t\r]+/, ' ').gsub(/h6/, "h3")
	  end

	  if content && header
	    output_file.puts name
	    output_file.puts "<div class=\"name_meaning\">#{header}</div> <div class=\"name_analysis\">#{content}</div>"
	  else
	    puts "[ERROR] Problems fetching. Name = #{name}, content = #{content}, header = #{header}"
	  end
	  sleep(20)
	end

	# Processes every name listed on the index page for +letter+.
	#
	# The index page for the upcased letter is fetched, and each link whose href
	# looks like an individual name page is passed to do_name. Results go to
	# "<letter>.txt" in the current directory, which is truncated on every run.
	#
	# letter - String letter; upcased for the URL, used as given for the file name.
	#
	# Returns nil.
	def do_names_that_start_with(letter)
	  url = "http://www.misabueso.com/nombres/nombre_#{letter.upcase}.html"
	  puts "About to fetch '#{url}'..."
	  doc = element_from_url(url)
	  puts " received."

	  # Block form so the file is flushed and closed even if do_name raises.
	  File.open(letter + ".txt", "w") do |output_file|
	    doc.search("a").each do |link| # every anchor on the index page
	      uri = link["href"]
	      # A regexp literal keeps the dot before "html" literal; inside a
	      # double-quoted string "\." collapses to "." and matches any character.
	      # Requiring at least two characters after "nombre_" skips the
	      # single-letter index pages such as nombre_A.html.
	      next unless uri && uri =~ /nombre_.{2,}\.html/
	      do_name(link.inner_html, "http://www.misabueso.com/nombres/" + uri, output_file)
	    end
	  end
	  nil
	end


	# Runs the per-letter download for each letter from "A" through "Z", in order.
	def do_everything
	  # The block parameter is delimited by plain pipes; the backslash-escaped
	  # form (\|letter\|) is a rendering artifact and is not valid Ruby.
	  "A".upto "Z" do |letter|
	    do_names_that_start_with(letter)
	  end
	end

	# Script entry point: kick off the full A-Z download as soon as this file is loaded.
	do_everything
No results found