Skip to content

Instantly share code, notes, and snippets.

@afiore
Created May 16, 2010 17:12
Show Gist options
  • Save afiore/403011 to your computer and use it in GitHub Desktop.
Save afiore/403011 to your computer and use it in GitHub Desktop.
Retrieves events from wikipedia's Day Of the Year section
# Wikievents:
#
# Retrieves all of English Wikipedia's "Day of the Year" pages (http://en.wikipedia.org/wiki/Category:Days_of_the_year).
# Once the pages have been downloaded to disk, the script scrapes all events, births, deaths, and observances from each page
# and populates a table called events in a MySQL database called wikievents.
#
#
#
# USAGE:
# - set your mysql connection parameters in task :db_connect
# - run "rake wiki:download_events"
# - run "rake wiki:import_events"
#
#
# Author: Andrea Fiore/
# Mailto: and{AT}inventati.org
#
#
# coding: utf-8
require 'date'
require 'fileutils'
require 'net/http'
require 'rubygems'
require 'hpricot'
require 'mysql'
# Scrapes one downloaded "day of the year" page and stores its entries.
#
# Reads events/<path>, parses it with Hpricot and, for each of the Events,
# Births, Deaths and Holidays_and_observances sections, inserts one row per
# list item into the events table through the global $db connection.
#
# path - file name inside events/, formatted "<Month>_<day>" (for example
#        "January_1"); the month and day columns are derived from it.
#
# A list item is expected to look like "<year> – <text>" (en dash). Items
# without a dash are stored with a NULL year. A running total is printed
# after every section; a section that cannot be located is reported and
# skipped instead of aborting the whole import.
def wiki_parse(path)
  puts "Scraping events from #{path}"

  # The block form closes the file handle as soon as parsing is done.
  doc = File.open("events/#{path}") { |file| Hpricot(file) }
  month, day = path.split('_', 2)

  # Prepare the statement once and reuse it for every row of this page.
  insert = $db.prepare("INSERT INTO events (event, day, month, year, type) VALUES (?,?,?,?,?);")
  event_count = 0

  ['Events', 'Births', 'Deaths', 'Holidays_and_observances'].each do |selector|
    heading = doc.search("##{selector}").first
    list = heading && heading.parent && heading.parent.next_sibling
    unless list
      puts "No ##{selector} section found in #{path}, skipping."
      next
    end

    list.search('li').each do |li|
      # Split on the first en dash (U+2013). After reverse, the text is first
      # and the year second, so a dash-less item yields a nil year.
      event, year = li.to_plain_text.split(/\u2013\s?/, 2).reverse
      next if event.nil? || event.strip.empty?

      year = year.strip if year
      insert.execute(event.strip, day.to_i, month, year, selector.downcase)
      event_count += 1
    end
    puts "#{event_count} Events scraped."
  end
ensure
  insert.close if insert
end
# Downloads one English Wikipedia article and saves it as events/<path>.
#
# path     - article name as it appears after /wiki/ (for example "January_1").
# max_exec - seconds allowed for opening the connection and for each read.
# retries  - number of additional attempts made after a timeout before the
#            timeout error is re-raised to the caller.
#
# The page is fetched over HTTPS. Only a 200 response is written to disk; any
# other status is reported on stdout and the file is left untouched.
# The events/ directory must already exist.
def wiki_getpage(path, max_exec = 5, retries = 2)
  puts "GET /wiki/#{path}"
  attempts_left = retries

  begin
    options = { :use_ssl => true, :open_timeout => max_exec, :read_timeout => max_exec }
    res = Net::HTTP.start('en.wikipedia.org', 443, options) do |http|
      http.get("/wiki/#{path}")
    end
  rescue Timeout::Error => e
    # Net::OpenTimeout and Net::ReadTimeout both descend from Timeout::Error.
    raise if attempts_left <= 0

    attempts_left -= 1
    puts "Timed out fetching #{path} (#{e.class}), retrying"
    retry
  end

  if res.code.to_i == 200
    # Binary mode stores the response bytes exactly as received.
    File.open("events/#{path}", 'wb') { |f| f.write(res.body) }
    puts "got #{path}"
  else
    puts "Skipped #{path}: HTTP #{res.code} #{res.message}"
  end
end
# Yields every day of a leap-year calendar as (month_name, day_number).
#
# Month names are taken from Date::MONTHNAMES ("January" .. "December").
# February runs to the 29th, so 366 pairs are produced in calendar order.
#
# Without a block an Enumerator over the same [month, day] pairs is returned.
def calendar_iterate
  return enum_for(:calendar_iterate) unless block_given?

  Date::MONTHNAMES.each do |month|
    next unless month # Date::MONTHNAMES[0] is nil so that indexes match month numbers

    last_day =
      if month == 'February'
        29
      elsif ['April', 'June', 'September', 'November'].include?(month)
        30
      else
        31
      end

    (1..last_day).each { |day| yield(month, day) }
  end
end
# Makes sure the events/ directory, where downloaded pages are stored, exists.
# mkdir_p is a no-op when the directory is already there, so no existence
# check is needed (File.exists? is no longer available on current Ruby).
task :mkdir_events do
  FileUtils.mkdir_p('events')
end
# Opens the global MySQL connection ($db) and recreates the events table.
#
# WARNING: an existing events table is dropped, so every import starts from
# an empty table. Edit the Mysql.connect arguments (host, user, password,
# database) to match your setup.
#
# On any Mysql::Error the driver's message is printed and the process exits
# with status 1.
task :db_connect do
  begin
    $db = Mysql.connect('localhost', 'wikieventsuser', 'wikieventspassword', 'wikievents')
    $db.query("DROP TABLE IF EXISTS events;")
    # year is SMALLINT rather than YEAR: MySQL's YEAR type only covers
    # 1901-2155 and cannot represent earlier years.
    # event is TEXT so that entries longer than 255 characters are not cut off.
    $db.query <<-SQL
      CREATE TABLE events(
        eid INT NOT NULL AUTO_INCREMENT,
        PRIMARY KEY(eid),
        event TEXT,
        day INT(2),
        month VARCHAR(255),
        year SMALLINT,
        type VARCHAR(255)
      );
    SQL
  rescue Mysql::Error => e
    puts "Oh noes! We could not connect to our database.. "
    # The failure may also come from DROP/CREATE, so show the real cause.
    puts "MySQL said: #{e.message}"
    exit 1
  end
end
# Rake entry points, run as "rake wiki:download_events" and
# "rake wiki:import_events".
namespace :wiki do
  # Fetches the page of every calendar day into events/ (created first by
  # the mkdir_events prerequisite).
  desc "Retrieve all Wikipedia's entry in the category Days of the year."
  task :download_events => 'mkdir_events' do
    calendar_iterate do |month, day|
      wiki_getpage("#{month}_#{day}")
    end
  end

  # Parses every previously downloaded page and inserts its entries into the
  # database prepared by the db_connect prerequisite, pausing briefly after
  # each page.
  desc "Scrape events from the downloaded pages and importes them into a local database table."
  task :import_events => [:db_connect] do
    calendar_iterate do |month, day|
      wiki_parse("#{month}_#{day}")
      sleep(0.1)
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment