Created
June 26, 2013 19:04
-
-
Save dhoss/5870441 to your computer and use it in GitHub Desktop.
simple web scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'roo' | |
require 'csv' | |
require 'mechanize' | |
require 'date' | |
# 1. Read in the Excel file
# 2. Parse and grab the columns of data needed
# 3. Push into an array of URLs, each with an associated campaign id
# 4. Iterate through the pages, parse, and grab the pertinent data
# 5. Append the campaign ID and totals for a day to a CSV file
# 6. Transmit the CSV somewhere.
# Scrapes hourly impression counts for a set of ad campaigns.
#
# Workflow:
#   1. Parse the input spreadsheet (campaign id in column A, URL query
#      fragments in columns K..U of the first sheet).
#   2. Build one stats URL per campaign.
#   3. Log in with Mechanize, fetch each page, and scrape the impression
#      total from the second "td.last" cell.
#   4. Optionally dump the collected [campaign_id, impressions] pairs to CSV.
class ImpressionData
  attr_accessor :mech,
                :csv_output,
                :url,
                :username,
                :password,
                :impression_data,
                :spreadsheet,
                :xls

  # Entry point: validates input, parses the spreadsheet, builds the URL
  # list, then scrapes each page. Prints intermediate results for debugging.
  def run
    usage if @spreadsheet.nil?
    @xls = parse_xls
    @urls = populate_urls
    pp @urls
    raw_impression_data
    pp @impression_data
  end

  # Prints usage information and terminates the process.
  def usage
    puts "usage: get_impression_data.rb excel.xls"
    exit
  end

  # Accepts a Hash of options (e.g. :spreadsheet, :username, :password,
  # :csv_output) and assigns each pair to a same-named instance variable.
  def initialize(init)
    init.each_pair do |key, val|
      instance_variable_set("@#{key}", val)
    end
    @impression_data = []
    @mech = Mechanize.new
  end

  # Logs in (when a login form is present) and scrapes every campaign page,
  # accumulating [campaign_id, impression_total] pairs into @impression_data.
  # NOTE(review): the login URL is a placeholder ('http://....') — confirm
  # the real endpoint before use.
  def raw_impression_data
    @mech.get('http://....') do |p|
      begin
        # If we are already logged in there is no form named "login";
        # form_with returns nil and .submit raises, which we treat as
        # "already logged in".
        # BUG FIX: was `rescue Exception` (also swallows signals and exit).
        p.form_with(:name => 'login') do |f|
          f.username = @username
          f.password = @password
        end.submit
      rescue StandardError => e
        # BUG FIX: message was missing its closing parenthesis.
        puts "Already logged in (error: #{e})"
      ensure
        @urls.each do |page|
          begin
            # page => [campaign_id, url]; the impression total lives in the
            # second "td.last" cell of the stats table.
            row = @mech.get(page[1]).search("td.last")[1].text.strip
            @impression_data.push([page[0], row])
          rescue StandardError => e
            puts "Error retrieving data from #{page}: #{e}"
          end
        end
      end
    end
  end

  # Writes the scraped data to the CSV file named by @csv_output.
  # BUG FIX: rows are [campaign_id, impressions] Arrays, so the original
  # data[:hour]/data[:impressions] Symbol lookups raised TypeError; the
  # header now matches the two columns actually collected.
  def save_data_to_csv
    CSV.open(@csv_output, 'wb') do |csv|
      csv << ["campaign_id", "impressions"]
      @impression_data.each { |data| csv << data }
    end
  end

  # Returns the subset of parsed [campaign_id, url] rows whose url is
  # non-empty (campaigns with no K..U cells produce an empty url string).
  def populate_urls
    @xls.reject { |row| row[1].empty? }
  end

  # Reads the first sheet of the Excel file. For every data row (row 1 is
  # assumed to be a header), collects the non-blank cells from columns K..U
  # and returns [[campaign_id, url], ...].
  def parse_xls
    xl = Roo::Excel.new(@spreadsheet)
    xl.default_sheet = xl.sheets.first
    (2..xl.last_row).map do |row|
      fragments = ("K".."U").map { |col| xl.cell(row, col) }.select { |v| v }
      [xl.cell(row, 'A'), build_url(fragments)]
    end
  end

  # Joins the collected query fragments with "&"; returns nil when given nil.
  def build_url(url)
    url ? url.join("&") : nil
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment