Skip to content

Instantly share code, notes, and snippets.

@dhoss
Created June 26, 2013 19:04
Show Gist options
  • Save dhoss/5870441 to your computer and use it in GitHub Desktop.
simple web scraper
#!/usr/bin/env ruby
require 'rubygems'
require 'roo'
require 'csv'
require 'mechanize'
require 'date'
# 1. Read in excel file
# 2. Parse and grab columns of data needed
# 3. Push into an array of URLs, with an associated campaign id
# 4. Iterate through pages, parse, and grab pertinent data
# 5. Append campaign ID and totals for a day to a CSV file
# 6. Transmit CSV somewhere.
class ImpressionData
  # Simple web scraper: reads campaign URLs from an Excel spreadsheet,
  # logs in once with Mechanize, scrapes an impressions total for each
  # campaign page, and can append the collected rows to a CSV file.
  attr_accessor :mech,
                :csv_output,
                :url,
                :username,
                :password,
                :impression_data,
                :spreadsheet,
                :xls

  # Entry point: parse the spreadsheet, build the campaign/URL list,
  # then scrape each page. Prints intermediate results for inspection.
  def run
    usage if @spreadsheet.nil?
    @xls = parse_xls
    @urls = populate_urls
    pp @urls
    raw_impression_data
    pp @impression_data
  end

  # Print usage help and terminate the process.
  def usage
    puts "usage: get_impression_data.rb excel.xls"
    exit
  end

  # Automatically instantiate parameters from new: +init+ is a hash of
  # attribute-name => value pairs (e.g. :spreadsheet, :username, :password,
  # :csv_output), each stored as the matching instance variable.
  def initialize(init)
    init.each_pair do |key, val|
      instance_variable_set('@' + key.to_s, val)
    end
    @impression_data = Array.new
    @mech = Mechanize.new
  end

  # Log in via the page's 'login' form, then visit every URL in @urls and
  # collect [campaign_id, impressions_text] pairs into @impression_data.
  # Per-page failures are logged and skipped so one bad page does not
  # abort the whole run.
  def raw_impression_data
    @mech.get('http://....') do |p|
      begin
        logged_in_page = p.form_with(:name => 'login') do |f|
          f.username = @username
          f.password = @password
        end.submit
      # BUGFIX: rescue StandardError, not Exception — rescuing Exception
      # also swallows SignalException/SystemExit and masks fatal errors.
      rescue StandardError => e
        # BUGFIX: message was missing its closing parenthesis.
        puts "Already logged in (error: #{e})"
      ensure
        @urls.each do |page|
          begin
            # The second "td.last" cell holds the impressions total
            # (site-specific markup; page[1] is the campaign URL).
            row = @mech.get(page[1]).search("td.last")[1].text.strip
            @impression_data.push([page[0], row])
          rescue StandardError => e
            puts "Error retrieving data from #{page}: #{e}"
          end
        end
      end
    end
  end

  # Write the scraped rows to @csv_output.
  #
  # BUGFIX: @impression_data holds two-element Arrays ([campaign_id, total]),
  # so the old data[:hour]/data[:impressions]/... symbol indexing raised
  # TypeError ("no implicit conversion of Symbol into Integer") on every
  # row. Write each row array directly instead.
  # NOTE(review): the header names four columns (hour, impressions, clicks,
  # ctr) that the scraper never collects — confirm the intended schema.
  def save_data_to_csv
    CSV.open(@csv_output, 'wb') do |csv|
      csv << ["hour", "impressions", "clicks", "ctr"]
      @impression_data.each do |data|
        csv << data
      end
    end
  end

  # Keep only spreadsheet rows whose URL entry (index 1) is non-empty.
  # Returns the filtered array of [campaign_id, url] pairs.
  def populate_urls
    urls = Array.new
    @xls.each do |x|
      unless x[1].empty?
        urls.push(x)
      end
    end
    return urls
  end

  # Read the first sheet of @spreadsheet; for each data row (row 1 is the
  # header, hence 2.upto) join the non-nil query-string fragments found in
  # columns K..U into one URL and pair it with the campaign id in column A.
  # Returns an array of [campaign_id, url] pairs.
  def parse_xls
    xls = Array.new
    xl = Roo::Excel.new(@spreadsheet)
    xl.default_sheet = xl.sheets.first
    2.upto(xl.last_row).each do |row|
      url = Array.new
      ("K".."U").each do |l|
        r = xl.cell(row, l)
        url.push(r) if r
      end
      xls.push([
        xl.cell(row, 'A'),
        build_url(url)
      ])
    end
    return xls
  end

  # Join URL fragments with "&"; returns nil when given nil.
  def build_url(url)
    return url ? url.join("&").to_s : nil
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment