Created
August 1, 2013 21:08
-
-
Save evandonovan/6135344 to your computer and use it in GitHub Desktop.
Web crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Task class
require 'mysql' | |
# Crawl task that records the meta description of each organization's
# website. Exposes the two constants the main script looks up on a task
# module: FIELDS and START_JOBS.
module OrgSocGraph
  # One output column; the CSV writer only calls #name on it.
  Column = Struct.new(:name)

  # Output file name => columns written after the leading source URL column.
  FIELDS = {
    "orgs.csv" => [Column.new(:description)]
  }

  # Leaf job, one per organization website. The URL is supplied at
  # construction time and read through the accessor inherited from
  # BaseJobWithURL (redefining #url here as `url` would recurse forever).
  class OrgJob < BaseJobWithURL
    # Stores [url, meta description] for this site in the "orgs.csv" data set.
    #
    # doc        - parsed Nokogiri document for the site.
    # data_store - collector responding to add_item(file, row).
    # fields     - unused here; kept for the shared job interface.
    def execute(doc, data_store, fields)
      # BaseJob#document yields nil when the page could not be fetched;
      # presumably that nil reaches us unchanged - TODO confirm in JobExecutor.
      return if doc.nil?

      meta = doc.at_css("meta[name='description']")
      # Keep the attribute text rather than the <meta> node itself, whose
      # string form is markup with no text content.
      description = meta && meta["content"]
      data_store.add_item("orgs.csv", [url, description])
    end
  end

  # Root job: its own page is not scraped; it exists to spawn one OrgJob per
  # organization website found in the database.
  class RootJob < BaseJob
    def url
      "http://www.example.com"
    end

    # Returns an Array of OrgJob, one for each non-empty website_url.
    #
    # doc - ignored; the children come from MySQL, not from the page.
    def get_children(doc)
      dbh = Mysql.real_connect("hostname", "dbuser", "password", "database")
      begin
        # limit to 10 for test run
        res = dbh.query('SELECT website_url FROM tbl_organizations WHERE website_url != "" LIMIT 10')
        jobs = []
        # each_hash yields rows keyed by column name; collect explicitly so
        # the method returns the jobs rather than the result set.
        res.each_hash { |row| jobs << OrgJob.new(row["website_url"]) }
        jobs
      ensure
        dbh.close
      end
    end
  end

  START_JOBS = [RootJob.new]
end
## Job Class
# Base class for crawl jobs. Subclasses must provide #url and may override
# #execute (to record data) and #get_children (to spawn follow-up jobs).
class BaseJob
  # Fetches #url and parses it as HTML.
  #
  # Returns the parsed Nokogiri document, or nil when the URL is invalid,
  # cannot be fetched, or cannot be parsed (a message is printed instead of
  # raising, so one bad site does not abort the crawl).
  def document
    # Go through URI#open (provided by open-uri) instead of Kernel#open:
    # Kernel#open treats a string starting with "|" as a command to run,
    # which is unsafe for URLs that come from external data. The block form
    # also closes the underlying handle once parsing is done.
    URI.parse(url).open { |page| Nokogiri::HTML(page) }
  rescue StandardError => e
    puts "problem opening uri (#{e.class}: #{e.message})"
    nil
  end

  # Hook: process the fetched document. The default does nothing.
  def execute(doc, data_store, fields)
  end

  # Hook: jobs to run after this one. The default has no children.
  def get_children(doc)
    []
  end
end
# A job whose target URL is handed in at construction time and exposed
# through a readable and writable #url attribute.
class BaseJobWithURL < BaseJob
  attr_reader :url
  attr_writer :url

  # url - the address this job should fetch.
  def initialize(url)
    self.url = url
  end
end
## main Ruby script (main.rb)
# for compatibility with 1.8.x require rubygems | |
require 'rubygems' | |
require 'open-uri' | |
# 1.8.x requires <= 1.5.0 of Nokogiri | |
require 'nokogiri' | |
require 'csv' | |
require 'mechanize' | |
Dir[File.dirname(__FILE__) + '/lib/*.rb'].each {|file| require file } | |
Dir[File.dirname(__FILE__) + '/tasks/*.rb'].each {|file| require file } | |
# For every task module named on the command line: run its START_JOBS, then
# write each collected data set to output/<file> as CSV.
ARGV.each do |mod|
  # Resolve the module by constant lookup instead of eval, so an argument can
  # only name a constant and never execute arbitrary code. Walking the "::"
  # segments keeps nested names (A::B) working on old Rubies too.
  task = mod.split('::').reject { |part| part.empty? }.inject(Object) do |scope, const_name|
    scope.const_get(const_name)
  end
  jobs = task.const_get(:START_JOBS)
  fields = task.const_get(:FIELDS)

  je = JobExecutor.new(fields)
  je.add_jobs(jobs)
  je.run

  fields.each_pair do |file, columns|
    CSV.open("output/#{file}", "wb") do |csv|
      # Header row: the source URL column followed by the declared columns.
      csv << ['source_url'] + columns.map { |c| c.name.to_s }
      je.data_store.get_items(file).each do |record|
        csv << record.map { |r| HTMLCleaning::clean(r.to_s, :convert_to_plain_text => true) }
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment