Created
May 16, 2010 17:12
-
-
Save afiore/403011 to your computer and use it in GitHub Desktop.
Retrieves events from wikipedia's Day Of the Year section
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Wikievents: | |
# | |
# Retrieves every "Day of the year" page from the English Wikipedia (http://en.wikipedia.org/wiki/Category:Days_of_the_year).
# Once the pages have been downloaded to disk, the script scrapes all events, births, deaths, and observances from each page and populates a MySQL table
# called events (in the wikievents database).
# | |
# | |
# | |
# USAGE: | |
# - set your mysql connection parameters in task :db_connect | |
# - run "rake wiki:download_events" | |
# - run "rake wiki:import_events" | |
# | |
# | |
# Author: Andrea Fiore/ | |
# Mailto: and{AT}inventati.org | |
# | |
# | |
# coding: utf-8 | |
require 'date'
require 'fileutils'
require 'net/http'
require 'net/https'
require 'rubygems'
require 'hpricot'
require 'mysql'
# Scrapes one downloaded "Month_Day" page and inserts every list item found
# under the Events, Births, Deaths and Holidays_and_observances headings into
# the events table through the global $db connection.
#
# path - page name such as "January_1". The page is read from events/<path>,
#        and the name is split on the first underscore to obtain the month
#        name and the day number stored with each row.
#
# Prints a progress line on entry and the running total after each section.
def wiki_parse(path)
  puts "Scraping events from #{path}"
  # File.read releases the file handle as soon as the page is in memory.
  doc = Hpricot(File.read("events/#{path}"))
  month, day = path.split('_', 2)
  # One prepared statement is reused for every row of the page.
  insert = $db.prepare("INSERT INTO events (event, day, month, year, type) VALUES (?,?,?,?,?);")
  event_count = 0
  begin
    ['Events', 'Births', 'Deaths', 'Holidays_and_observances'].each do |selector|
      heading = doc.search("##{selector}").first
      # The list of entries is expected right after the heading's parent node.
      list = heading && heading.parent && heading.parent.next_sibling
      if list.nil?
        puts "No #{selector} section found in #{path}, skipping."
        next
      end
      list.search('li').each do |li|
        # "1066 – Something happened" => year "1066 ", event "Something happened".
        # Items without a dash keep their whole text as the event and a nil year.
        event, year = li.to_plain_text.split(/[–]\s?/, 2).reverse
        next if event.nil? # empty list item, nothing to store
        # Drop an en dash left at the start of a line; a plain literal works on
        # UTF-8 strings, unlike the raw \200\223 byte escapes used before.
        insert.execute(event.gsub(/^–/, ''), day.to_i, month, year, selector.downcase)
        event_count += 1
      end
      puts "#{event_count} Events scraped."
    end
  ensure
    insert.close
  end
end
# Downloads one Wikipedia page and stores its body in events/<path>.
#
# path     - page name such as "January_1"; used both in the request path
#            (/wiki/<path>) and as the output file name.
# max_exec - accepted for backward compatibility; currently unused.
# retries  - number of additional attempts made when the request times out.
#
# Only a 200 response is written to disk; any other status is reported and
# skipped. A timeout that survives every retry is re-raised to the caller.
def wiki_getpage(path, max_exec=5, retries=2)
  puts "GET /wiki/#{path}"
  # Wikipedia redirects plain HTTP to HTTPS, and only 200 responses are saved,
  # so the request has to go over TLS to ever store a page.
  http = Net::HTTP.new('en.wikipedia.org', 443)
  http.use_ssl = true
  http.read_timeout = 4

  attempts_left = retries
  begin
    res = http.start { |conn| conn.get("/wiki/#{path}") }
  rescue Timeout::Error
    raise if attempts_left <= 0
    attempts_left -= 1
    puts "timed out fetching #{path}, retrying"
    retry
  end

  if res.code.to_i == 200
    File.open("events/#{path}", 'w') do |f|
      f.write(res.body)
    end
    puts "got #{path}"
  else
    puts "skipped #{path} (HTTP #{res.code})"
  end
end
# Yields every (month name, day number) pair of a leap-year calendar, in
# order from ("January", 1) to ("December", 31).
#
# February runs to the 29th, the four 30-day months stop at 30, and every
# other month runs to 31, giving 366 pairs in total.
#
# Returns an Enumerator over the same pairs when called without a block.
def calendar_iterate
  return enum_for(:calendar_iterate) unless block_given?

  Date::MONTHNAMES.each do |month|
    next unless month # index 0 of MONTHNAMES is a nil placeholder

    last_day =
      case month
      when 'February' then 29
      when 'April', 'June', 'September', 'November' then 30
      else 31
      end
    (1..last_day).each { |day| yield(month, day) }
  end
end
# Makes sure the events/ directory, where downloaded pages are stored, exists.
task :mkdir_events do
  # mkdir_p does nothing when the directory is already present, so no
  # existence check is needed (File.exists? was removed in Ruby 3.2).
  FileUtils.mkdir_p('events')
end
# Opens the MySQL connection used by the import and stores it in the global
# $db, then drops and recreates the events table so each run starts empty.
#
# Edit the Mysql.connect arguments (host, user, password, database) to match
# your setup. Any Mysql::Error raised while connecting or while running the
# DROP/CREATE statements is reported and the process exits with status 1.
task :db_connect do
  begin
    $db = Mysql.connect('localhost', 'wikieventsuser', 'wikieventspassword', 'wikievents')
    $db.query("DROP TABLE IF EXISTS events;")
    # NOTE(review): the YEAR column type may not cover the historical years
    # found on these pages -- confirm against the MySQL documentation.
    $db.query <<-EOD
      CREATE TABLE events(
      eid INT NOT NULL AUTO_INCREMENT,
      PRIMARY KEY(eid),
      event VARCHAR(255),
      day INT(2),
      month VARCHAR(255),
      year YEAR,
      type VARCHAR(255)
      );
    EOD
  rescue Mysql::Error => e
    puts "Oh noes! We could not connect to our database.. "
    # Show the server's own message so a failed DROP/CREATE is not mistaken
    # for a connection problem.
    puts "MySQL reported: #{e.message}"
    exit 1
  end
end
namespace :wiki do
  # Download step: fetches one page per calendar day once the mkdir_events
  # prerequisite has run.
  desc "Retrieve all Wikipedia's entry in the category Days of the year."
  task :download_events => 'mkdir_events' do
    calendar_iterate do |month, day|
      wiki_getpage("#{month}_#{day}")
    end
  end

  # Import step: parses each saved page into the database opened by
  # db_connect, pausing a tenth of a second between pages.
  desc "Scrape events from the downloaded pages and importes them into a local database table."
  task :import_events => [:db_connect] do
    calendar_iterate do |month, day|
      wiki_parse("#{month}_#{day}")
      sleep(0.1)
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment