afiore · May 16, 2010 17:12
diff --git a/Rakefile b/Rakefile
 # Wikievents: 
 #
 #  Retrieves all of English Wikipedia's "day of the year" pages (http://en.wikipedia.org/wiki/Category:Days_of_the_year).
 #  Once the pages have been downloaded to disk, the script scrapes all events, births, deaths, and observances from each page and populates a MySQL table
 #  called events in the wikievents database.
 #
 #
 #  
 #  USAGE:
 #   - set your mysql connection parameters in task :db_connect   
 #   - run "rake wiki:download_events"
 #   - run "rake wiki:import_events"
 # 
 #  
 #  Author: Andrea Fiore/ 
 #  Mailto: and{AT}inventati.org
 #  
 #
 # coding: utf-8


 require 'net/http'
 require 'fileutils'
 require 'rubygems'
 require 'hpricot'
 require 'mysql'


 # Scrapes one downloaded "day of the year" page and stores its entries in
 # the events table through the global $db connection.
 #
 # path - page name in "Month_Day" form (e.g. "May_16"); the HTML is read
 #        from events/<path>.
 #
 # For each of the Events, Births, Deaths and Holidays_and_observances
 # sections, every <li> found under the element following the section
 # heading becomes one row. Text before the first en dash is stored as the
 # year, the rest as the event; entries without an en dash get a NULL year.
 # Sections that cannot be located are reported and skipped.
 def wiki_parse(path)
   puts "Scraping events from #{path}"

   # Block form closes the file handle as soon as the page is parsed.
   doc = File.open("events/#{path}") { |f| Hpricot(f) }
   month, day = path.split('_', 2)
   event_count = 0

   # Built from its code point so the separator does not depend on the
   # source file's encoding or on byte-wise regexp behaviour.
   en_dash = [0x2013].pack('U')

   # One prepared statement is enough for every row of the page.
   sql = $db.prepare("INSERT INTO events (event, day, month, year, type) VALUES (?,?,?,?,?);")
   begin
     ['Events','Births','Deaths','Holidays_and_observances'].each do |selector|
       elm = doc.search("##{selector}").first
       list = elm && elm.parent && elm.parent.next_sibling
       unless list
         puts "No #{selector} section found in #{path}"
         next
       end

       list.search('li').each do |li|
         text = li.to_plain_text
         next if text.nil? || text.strip.empty?

         head, tail = text.split(en_dash, 2)
         if tail
           year, event = head.strip, tail.strip
         else
           # No dash (typical for observances): the whole text is the event.
           year, event = nil, head.strip
         end

         sql.execute(event, day.to_i, month, year, selector.downcase)
         event_count += 1
       end
     end
   ensure
     sql.close
   end

   # Reported once per page, after all sections have been processed.
   puts "#{event_count} Events scraped."
 end

 # Downloads one Wikipedia page and saves the response body to events/<path>.
 #
 # path     - page name appended to /wiki/ (e.g. "May_16").
 # max_exec - accepted for interface compatibility; not used by this method.
 # retries  - how many additional attempts to make after a timeout or a
 #            socket/system-call error before the error is re-raised.
 #
 # Only a 200 response is written to disk; any other status is reported on
 # stdout and the page is skipped.
 def wiki_getpage(path,max_exec=5,retries=2)
   attempts_left = retries
   puts "GET /wiki/#{path}"

   begin
     res = Net::HTTP.start('en.wikipedia.org', 80) do |http|
       http.read_timeout = 4
       http.get("/wiki/#{path}")
     end
   rescue Timeout::Error, SocketError, SystemCallError => e
     # Bounded retry: transient network failures should not abort a run
     # that fetches hundreds of pages.
     raise if attempts_left <= 0
     attempts_left -= 1
     puts "retrying #{path} after #{e.class}"
     retry
   end

   if res.code.to_i == 200
     File.open("events/#{path}", 'w') { |f| f.write(res.body) }
     puts "got #{path}"
   else
     puts "skipped #{path}: HTTP #{res.code}"
   end
 end

 # Yields (month_name, day_number) for every day of a leap year:
 # February runs to the 29th, April/June/September/November to the 30th,
 # and every other month to the 31st.
 def calendar_iterate()
   thirty_day_months = ['November', 'April', 'June', 'September']

   Date::MONTHNAMES.each do |month_name|
     # Index 0 of MONTHNAMES is nil so that month numbers line up.
     next if month_name.nil?

     final_day =
       if month_name == 'February'
         29
       elsif thirty_day_months.include?(month_name)
         30
       else
         31
       end

     1.upto(final_day) { |day_number| yield(month_name, day_number) }
   end
 end


 # Makes sure the events/ directory used for downloaded pages exists.
 task :mkdir_events do 
    # mkdir_p is a no-op when the directory is already there, so no separate
    # existence check (File.exists? is gone from current Rubies) is needed.
    FileUtils.mkdir_p('events')
 end

 # Opens the MySQL connection and stores it in the global $db, then drops
 # and recreates the events table, so any previously imported rows are lost.
 # On any Mysql::Error the error is printed and the process exits with
 # status 1.
 task :db_connect do 
  begin
    $db = Mysql.connect('localhost', 'wikieventsuser', 'wikieventspassword', 'wikievents')
    $db.query("DROP TABLE IF EXISTS events;")
    # "<<-" accepts an indented terminator; a plain "<<EOD" only ends at an
    # EOD in column 0.
    # NOTE(review): MySQL's YEAR type covers 1901-2155 only, so most
    # historical years will not fit this column - confirm and consider an
    # integer or text column instead.
    $db.query <<-EOD

      CREATE TABLE events( 
        eid INT NOT NULL AUTO_INCREMENT,          
        PRIMARY KEY(eid), 
        event VARCHAR(255), 
        day INT(2), 
        month VARCHAR(255),         
        year YEAR,
        type VARCHAR(255)
      ); 
    
    EOD
  rescue Mysql::Error => e
    puts "Oh noes! We could not connect to our database.. "
    # The rescue also covers the DROP/CREATE statements, so show the cause.
    puts e.message
    exit 1
  end

 end

 # Public entry points: wiki:download_events and wiki:import_events.
 namespace :wiki do
   desc "Retrieve all Wikipedia's entry in the category Days of the year."
   task :download_events => 'mkdir_events' do
     # One request per calendar day; pages are named "Month_Day".
     calendar_iterate do |month, day|
       wiki_getpage("#{month}_#{day}")
     end
   end

   desc "Scrape events from the downloaded pages and importes them into a local database table."
   task :import_events => [:db_connect] do
     calendar_iterate do |month, day|
       wiki_parse("#{month}_#{day}")
       sleep(0.1)
     end
   end
 end
	# Wikievents:
	#
	# Retrieves all of English Wikipedia's "day of the year" pages (http://en.wikipedia.org/wiki/Category:Days_of_the_year).
	# Once the pages have been downloaded to disk, the script scrapes all events, births, deaths, and observances from each page and populates a MySQL table
	# called events in the wikievents database.
	#
	#
	#
	# USAGE:
	# - set your mysql connection parameters in task :db_connect
	# - run "rake wiki:download_events"
	# - run "rake wiki:import_events"
	#
	#
	# Author: Andrea Fiore/
	# Mailto: and{AT}inventati.org
	#
	#
	# coding: utf-8


	require 'net/http'
	require 'fileutils'
	require 'rubygems'
	require 'hpricot'
	require 'mysql'


	# Scrapes one downloaded "day of the year" page and stores its entries in
	# the events table through the global $db connection.
	#
	# path - page name in "Month_Day" form (e.g. "May_16"); the HTML is read
	#        from events/<path>.
	#
	# For each of the Events, Births, Deaths and Holidays_and_observances
	# sections, every <li> found under the element following the section
	# heading becomes one row. Text before the first en dash is stored as the
	# year, the rest as the event; entries without an en dash get a NULL year.
	# Sections that cannot be located are reported and skipped.
	def wiki_parse(path)
	  puts "Scraping events from #{path}"

	  # Block form closes the file handle as soon as the page is parsed.
	  doc = File.open("events/#{path}") { |f| Hpricot(f) }
	  month, day = path.split('_', 2)
	  event_count = 0

	  # Built from its code point so the separator does not depend on the
	  # source file's encoding or on byte-wise regexp behaviour.
	  en_dash = [0x2013].pack('U')

	  # One prepared statement is enough for every row of the page.
	  sql = $db.prepare("INSERT INTO events (event, day, month, year, type) VALUES (?,?,?,?,?);")
	  begin
	    ['Events','Births','Deaths','Holidays_and_observances'].each do |selector|
	      elm = doc.search("##{selector}").first
	      list = elm && elm.parent && elm.parent.next_sibling
	      unless list
	        puts "No #{selector} section found in #{path}"
	        next
	      end

	      list.search('li').each do |li|
	        text = li.to_plain_text
	        next if text.nil? || text.strip.empty?

	        head, tail = text.split(en_dash, 2)
	        if tail
	          year, event = head.strip, tail.strip
	        else
	          # No dash (typical for observances): the whole text is the event.
	          year, event = nil, head.strip
	        end

	        sql.execute(event, day.to_i, month, year, selector.downcase)
	        event_count += 1
	      end
	    end
	  ensure
	    sql.close
	  end

	  # Reported once per page, after all sections have been processed.
	  puts "#{event_count} Events scraped."
	end

	# Downloads one Wikipedia page and saves the response body to events/<path>.
	#
	# path     - page name appended to /wiki/ (e.g. "May_16").
	# max_exec - accepted for interface compatibility; not used by this method.
	# retries  - how many additional attempts to make after a timeout or a
	#            socket/system-call error before the error is re-raised.
	#
	# Only a 200 response is written to disk; any other status is reported on
	# stdout and the page is skipped.
	def wiki_getpage(path,max_exec=5,retries=2)
	  attempts_left = retries
	  puts "GET /wiki/#{path}"

	  begin
	    res = Net::HTTP.start('en.wikipedia.org', 80) do |http|
	      http.read_timeout = 4
	      http.get("/wiki/#{path}")
	    end
	  rescue Timeout::Error, SocketError, SystemCallError => e
	    # Bounded retry: transient network failures should not abort a run
	    # that fetches hundreds of pages.
	    raise if attempts_left <= 0
	    attempts_left -= 1
	    puts "retrying #{path} after #{e.class}"
	    retry
	  end

	  if res.code.to_i == 200
	    File.open("events/#{path}", 'w') { |f| f.write(res.body) }
	    puts "got #{path}"
	  else
	    puts "skipped #{path}: HTTP #{res.code}"
	  end
	end

	# Yields (month_name, day_number) for every day of a leap year:
	# February runs to the 29th, April/June/September/November to the 30th,
	# and every other month to the 31st.
	def calendar_iterate()
	  thirty_day_months = ['November', 'April', 'June', 'September']

	  Date::MONTHNAMES.each do |month_name|
	    # Index 0 of MONTHNAMES is nil so that month numbers line up.
	    next if month_name.nil?

	    final_day =
	      if month_name == 'February'
	        29
	      elsif thirty_day_months.include?(month_name)
	        30
	      else
	        31
	      end

	    1.upto(final_day) { |day_number| yield(month_name, day_number) }
	  end
	end


	# Makes sure the events/ directory used for downloaded pages exists.
	task :mkdir_events do
	  # mkdir_p is a no-op when the directory is already there, so no separate
	  # existence check (File.exists? is gone from current Rubies) is needed.
	  FileUtils.mkdir_p('events')
	end

	# Opens the MySQL connection and stores it in the global $db, then drops
	# and recreates the events table, so any previously imported rows are lost.
	# On any Mysql::Error the error is printed and the process exits with
	# status 1.
	task :db_connect do
	  begin
	    $db = Mysql.connect('localhost', 'wikieventsuser', 'wikieventspassword', 'wikievents')
	    $db.query("DROP TABLE IF EXISTS events;")
	    # "<<-" accepts an indented terminator; a plain "<<EOD" only ends at an
	    # EOD in column 0.
	    # NOTE(review): MySQL's YEAR type covers 1901-2155 only, so most
	    # historical years will not fit this column - confirm and consider an
	    # integer or text column instead.
	    $db.query <<-EOD

	      CREATE TABLE events(
	        eid INT NOT NULL AUTO_INCREMENT,
	        PRIMARY KEY(eid),
	        event VARCHAR(255),
	        day INT(2),
	        month VARCHAR(255),
	        year YEAR,
	        type VARCHAR(255)
	      );

	    EOD
	  rescue Mysql::Error => e
	    puts "Oh noes! We could not connect to our database.. "
	    # The rescue also covers the DROP/CREATE statements, so show the cause.
	    puts e.message
	    exit 1
	  end

	end

	# Public entry points: wiki:download_events and wiki:import_events.
	namespace :wiki do
	  desc "Retrieve all Wikipedia's entry in the category Days of the year."
	  task :download_events => 'mkdir_events' do
	    # One request per calendar day; pages are named "Month_Day".
	    calendar_iterate { |month, day| wiki_getpage([month, day].join('_')) }
	  end

	  desc "Scrape events from the downloaded pages and importes them into a local database table."
	  task :import_events => [:db_connect] do
	    calendar_iterate { |month, day| wiki_parse([month, day].join('_')); sleep(0.1) }
	  end
	end