Created
August 3, 2012 19:30
-
-
Save bycoffe/3250702 to your computer and use it in GitHub Desktop.
A script for scraping the FCC's website and finding political file submissions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'
require 'nokogiri'
require 'open-uri'
# Crawls one station's political-file pages on stations.fcc.gov and reports
# every PDF link it encounters.
#
# Starting from the station's `browse-%3e2012` page, the crawler follows every
# political-files "browse" link for the same call sign, visiting each
# normalized URL once and yielding each distinct PDF URL once.
class PublicFileCrawler
  # Absolute URLs of PDFs served from the site's /collect/files path.
  PDF_PATTERN = %r{https://stations\.fcc\.gov/collect/files.*?\.pdf}.freeze

  # Seconds to pause after each visited page before following its links.
  CRAWL_DELAY = 0.5

  # @param params [Hash] options; `:call_sign` is the station identifier used
  #   to build the start URL and to recognise links worth following.
  def initialize(params = {})
    @call_sign = params[:call_sign]
    @url = "https://stations.fcc.gov/station-profile/#{@call_sign}/political-files/browse-%3e2012"
    @checked = {}
    @found = {}
  end

  # Fetches +start_url+ and recursively visits every not-yet-checked browse
  # link found on it, yielding a match for each new PDF on the visited pages.
  # PDFs on +start_url+ itself are not reported by this call.
  #
  # @param start_url [String] page whose links should be followed
  # @yieldparam match [Hash] `{ pdf_url: String, hierarchy: Array<String> }`
  def find_links(start_url, &block)
    follow_links(fetch(start_url), &block)
  end

  # Fetches +url+ and yields a match for every PDF on it that has not been
  # reported before.
  #
  # @param url [String] page to scan for PDF links
  # @yieldparam match [Hash] `{ pdf_url: String, hierarchy: Array<String> }`,
  #   where hierarchy is the text of the page's `.browser-path a` elements
  def find_pdfs(url, &block)
    report_pdfs(fetch(url), &block)
  end

  # Runs the crawl from the station's start URL, yielding every match found.
  #
  # @yieldparam match [Hash] `{ pdf_url: String, hierarchy: Array<String> }`
  def crawl(&block)
    find_links(@url, &block)
  end

  private

  # Downloads +url+ and returns the response body. Uses URI.open because
  # open-uri no longer patches Kernel#open for URLs on Ruby 3.0+; the block
  # form closes the underlying stream once the body has been read.
  def fetch(url)
    URI.open(url, &:read)
  end

  # Regexp capturing this station's political-files browse URLs up to the
  # closing double quote of the attribute they appear in.
  def link_pattern
    @link_pattern ||= begin
      call_sign = Regexp.escape(@call_sign.to_s)
      %r{(https://stations\.fcc\.gov/station-profile/#{call_sign}/political-files/browse.*?)"}
    end
  end

  # Visits each unchecked browse link in +page+: reports its PDFs, pauses,
  # then recurses into its own links. Each linked page is downloaded once and
  # the same body is reused for both the PDF scan and the link scan.
  def follow_links(page, &block)
    page.scan(link_pattern).flatten.each do |url|
      # Normalize so the same folder spelled with ">" or "%3E" is visited once.
      url = url.downcase.gsub('>', '%3e')
      next if @checked.key?(url)

      @checked[url] = true
      linked_page = fetch(url)
      report_pdfs(linked_page, &block)
      sleep(CRAWL_DELAY)
      follow_links(linked_page, &block)
    end
  end

  # Yields a match hash for every PDF URL in +page+ not seen on earlier pages.
  def report_pdfs(page)
    hierarchy = Nokogiri::HTML(page).css('.browser-path a').map(&:content)
    page.scan(PDF_PATTERN).each do |pdf|
      next if @found.key?(pdf)

      @found[pdf] = true
      yield({ pdf_url: pdf, hierarchy: hierarchy })
    end
  end
end
# Command-line entry point: crawl the station named by the last argument and
# write one CSV row per PDF to standard output (the PDF URL followed by one
# column per hierarchy element).
call_sign = ARGV[-1]
unless call_sign
  # A missing argument is an error: report on stderr and exit non-zero so
  # callers and pipelines do not mistake the usage text for CSV output.
  warn 'Usage: ruby public_file_crawler.rb call_sign'
  warn 'e.g.: ruby public_file_crawler.rb wcpo-tv'
  exit 1
end

# One writer is shared by the header and every row.
output = CSV.new($stdout)
output << %w[pdf_url hierarchy]

PublicFileCrawler.new(call_sign: call_sign).crawl do |match|
  output << [match[:pdf_url], *match[:hierarchy]]
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment