Skip to content

Instantly share code, notes, and snippets.

@evandonovan
Created August 1, 2013 21:08
Show Gist options
  • Save evandonovan/6135344 to your computer and use it in GitHub Desktop.
Save evandonovan/6135344 to your computer and use it in GitHub Desktop.
Web crawler
## Task class
require 'mysql'
# Crawl task that records the meta description of organization websites whose
# URLs are read from the tbl_organizations MySQL table.
module OrgSocGraph
  # Output columns, keyed by CSV file name. Each column object only has to
  # respond to #name; the main script reads that to build the CSV header.
  FIELDS = {
    "orgs.csv" => [
      Class.new(Object) do
        def name
          :description
        end
      end.new
    ]
  }

  # Jobs the crawl starts from.
  # NOTE(review): this array holds the anonymous job *class*, not an instance
  # (there is no `.new` after the closing `end`), whereas the child jobs
  # returned by get_children are instances. JobExecutor is not visible here,
  # so this is left as-is -- confirm which form JobExecutor#add_jobs expects.
  START_JOBS = [
    Class.new(BaseJob) do
      # The parent job has nothing to scrape itself; it only fans out to the
      # per-organization child jobs.
      def url
        "http://www.example.com"
      end

      # Builds one child job per organization website found in the database.
      #
      # doc - parsed document of the parent URL (unused).
      #
      # Returns an Array of job instances, one per row.
      def get_children(doc)
        # Defined once, outside the row loop. The URL comes from the
        # accessor inherited from BaseJobWithURL; redefining #url here as
        # `def url; url; end` would recurse forever.
        org_job_class = Class.new(BaseJobWithURL) do
          # Stores [url, meta description] for the crawled page. The
          # description is nil when the page could not be fetched (doc is
          # nil) or has no <meta name="description"> tag.
          def execute(doc, data_store, fields)
            meta = doc && doc.css("meta[name='description']").first
            # Keep the attribute text, not the node: the node's to_s is the
            # raw <meta ...> markup, which has no plain-text content.
            description = meta && meta["content"]
            data_store.add_item("orgs.csv", [self.url, description])
          end
        end

        # The mysql gem's top-level constant is `Mysql` (not `MySQL`).
        dbh = Mysql.real_connect("hostname", "dbuser", "password", "database")
        begin
          # limit to 10 for test run
          res = dbh.query('SELECT website_url FROM tbl_organizations WHERE website_url != "" LIMIT 10')
          children = []
          # each_hash yields rows keyed by column name, which the
          # row["website_url"] lookup below requires.
          res.each_hash do |row|
            children << org_job_class.new(row["website_url"])
          end
          children
        ensure
          # Release the connection even if the query raises.
          dbh.close
        end
      end
    end
  ]
end
## Job Class
# Base class for crawl jobs. Subclasses supply #url and override #execute
# and/or #get_children; the defaults here do nothing.
class BaseJob
  # Fetches and parses the page at #url.
  #
  # Returns the parsed Nokogiri document, or nil when the URL is invalid,
  # cannot be fetched, or cannot be parsed (the problem is printed to stdout).
  def document
    target = nil
    begin
      target = url
      # Go through URI rather than Kernel#open: Kernel#open treats a string
      # starting with "|" as a shell command and anything else non-URL as a
      # local path, and it stopped fetching URLs in Ruby 3.0. The block form
      # also closes the underlying IO once parsing is done.
      URI.parse(target).open { |io| Nokogiri::HTML(io) }
    rescue StandardError => e
      puts "problem opening uri #{target.inspect}: #{e.class}: #{e.message}"
      nil
    end
  end

  # Hook: extract data from +doc+ into +data_store+. No-op by default.
  def execute(doc, data_store, fields)
  end

  # Hook: return follow-up jobs discovered from +doc+. None by default.
  def get_children(doc)
    []
  end
end
# A job whose target URL is handed in at construction time instead of being
# hard-coded in a #url method.
class BaseJobWithURL < BaseJob
  # Address this job operates on; readable and writable.
  attr_reader :url
  attr_writer :url

  # url - the address to assign to this job.
  def initialize(url)
    super()
    @url = url
  end
end
## main Ruby script (main.rb)
# for compatibility with 1.8.x require rubygems
require 'rubygems'
require 'open-uri'
# 1.8.x requires <= 1.5.0 of Nokogiri
require 'nokogiri'
require 'csv'
require 'mechanize'
Dir[File.dirname(__FILE__) + '/lib/*.rb'].each {|file| require file }
Dir[File.dirname(__FILE__) + '/tasks/*.rb'].each {|file| require file }
# Looks up a (possibly namespaced) constant such as "OrgSocGraph" or
# "Tasks::OrgSocGraph" by name. Unlike eval, this can only ever resolve a
# constant; it never executes code passed on the command line.
#
# name - String constant path; a leading "::" is tolerated.
#
# Returns the constant's value. Raises NameError if it does not exist or the
# name is not a valid constant name.
def resolve_task_module(name)
  parts = name.to_s.split("::").reject { |part| part.empty? }
  parts.inject(Object) { |scope, part| scope.const_get(part) }
end

# Each command-line argument names a task module that defines START_JOBS and
# FIELDS. Run its jobs, then write one CSV per entry in FIELDS.
ARGV.each do |mod|
  task = resolve_task_module(mod)
  jobs = task.const_get(:START_JOBS)
  fields = task.const_get(:FIELDS)

  je = JobExecutor.new(fields)
  je.add_jobs(jobs)
  je.run

  # CSV.open does not create missing directories.
  Dir.mkdir("output") unless File.directory?("output")

  fields.each_pair do |file, columns|
    CSV.open("output/#{file}", "wb") do |csv|
      # Header: the source URL followed by the task's declared columns.
      csv << ['source_url'] + columns.map { |c| c.name.to_s }
      je.data_store.get_items(file).each do |record|
        csv << record.map { |r| HTMLCleaning::clean(r.to_s, :convert_to_plain_text => true) }
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment