Created
December 12, 2012 19:03
-
-
Save arjunvenkat/4270598 to your computer and use it in GitHub Desktop.
Scraper that saves apprenticeship info from the Department of Labor site.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace :apprenticeships do
  desc "scrapes DOL site for apprenticeship data by state"
  # Walks the paginated apprenticeship listing for each state, submits every
  # per-occupation form on each listing page, and writes one CSV per state
  # under doc/appren_files/ with one line per sponsor table row.
  task :scrape => :environment do
    require 'mechanize'
    require 'open-uri'
    require 'csv'
    require 'fileutils'

    # Listing pages are requested 20 results at a time (MaxRows in the URL).
    page_size = 20
    # Sponsor table cells are consumed in groups of five: name, address, city,
    # state (per the CSV header below) plus a fifth cell that is discarded.
    cells_per_row = 5
    kept_cells = 4

    # Builds the listing URL for a state and a 1-based starting result row.
    listing_url = lambda do |state, start_row|
      "http://oa.doleta.gov/bat.cfm?startrow=#{start_row}&curpage=1&MaxRows=#{page_size}&state=#{state}&county=all&sel=all"
    end

    # Normalises a city/state cell: collapses every whitespace run to a single
    # space and trims the ends, so multi-word city names survive intact.
    # Returns nil for a blank cell so the CSV field is left empty.
    clean_cell = lambda do |text|
      cleaned = text.gsub(/[[:space:]]+/, ' ').strip
      cleaned.empty? ? nil : cleaned
    end

    # states = ["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
    states = ["WY"]

    states.each do |state|
      puts "State: #{state}"
      apprenticeships_in_state = []
      start_row = 1
      agent = Mechanize.new

      # The first listing page carries the total result count for the state.
      page = agent.get(listing_url.call(state, start_row))
      num_apprenticeships_in_state = page.search('span.boldred strong').text.to_i
      puts "Apprenticeships in state: #{num_apprenticeships_in_state}"

      while start_row <= num_apprenticeships_in_state
        url = listing_url.call(state, start_row)
        puts "Current page: #{url}"
        page = agent.get(url)

        # Skip the first form (the search form at the top of the page) and the
        # last form on the page; the original loop excluded both.
        # NOTE(review): the purpose of the last form is not visible here.
        occupation_forms = Array(page.forms[1...-1])

        occupation_forms.each do |form|
          detail_page = agent.submit(form)
          occupation = detail_page.search('div#content p.boldred strong').text.strip
          cell_texts = detail_page.search('table tr td').map(&:text)

          rows = cell_texts.each_slice(cells_per_row).map do |row_cells|
            fields = row_cells.first(kept_cells).each_with_index.map do |text, index|
              # Indexes 2 and 3 are the city and state columns.
              index >= 2 ? clean_cell.call(text) : text
            end
            [occupation, *fields, url]
          end
          # Keep a record of the occupation even when no sponsor cells exist.
          rows << [occupation, url] if rows.empty?

          apprenticeships_in_state.concat(rows)
          puts "apprenticeship stored"
        end

        # Clamp so the last page does not report more rows than exist.
        stored_so_far = [start_row + page_size - 1, num_apprenticeships_in_state].min
        puts "#{stored_so_far} of #{num_apprenticeships_in_state} apprenticeships stored for #{state}"
        start_row += page_size
      end

      puts "Completed apprenticeships for #{state}"

      # Make sure the output directory exists before opening the CSV.
      output_dir = Rails.root + "doc/appren_files"
      FileUtils.mkdir_p(output_dir)
      CSV.open(output_dir + "#{state}_apprenticeships.csv", "wb") do |csv|
        csv << ["Occupation", "Sponsor Name", "Sponsor Address", "Sponsor City", "Sponsor State", "Original Page"]
        apprenticeships_in_state.each do |apprenticeship|
          csv << apprenticeship
        end
      end
    end

    puts "finished scraping apprenticeships"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment