Created
May 2, 2012 13:09
-
-
Save jdar/2576428 to your computer and use it in GitHub Desktop.
scrape flashcards
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Rakefile
require 'open-uri' | |
require 'nokogiri' | |
require 'fileutils' | |
# Scrapes the <tr> rows found under a container <div> and writes them out as
# pipe-delimited lines in files under vocab/, at most 20 rows per file.
#
# Arguments (rake passes them as strings; omitted ones are nil):
#   page               - URL, local path, or glob (first match is used). Required.
#   table_container_id - id of the <div> wrapping the rows; defaults to
#                        "words-container".
#   header_rows        - number of leading rows to discard; defaults to 0.
#
# Each output line is "<child 0 text>|<child 4 text>|<child 6 text as integer>",
# where "child N" is the N-th child node of the <tr>. Files are named
# vocab/<unix time>_<index of the file's first row>_<word>, with <word> taken
# from the last whitespace-separated token of the first row's child 4 text,
# stripped of non-word characters.
#
# Raises ArgumentError when no page is given.
desc "scrape a url or file for table rows. lightly hard-coded for WOLD"
task :scrape, :page, :table_container_id, :header_rows do |_task, args|
  page = args[:page].to_s
  raise ArgumentError, "usage: rake scrape[page,table_container_id,header_rows]" if page.empty?

  # A glob resolves to its first match; when nothing matches, the literal
  # value is tried as a path, as before.
  page = Dir[page].first || page if page.include?("*")

  # Kernel#open no longer fetches URLs on Ruby 3.0+ and can spawn a subprocess
  # for strings starting with "|", so remote pages go through URI.open and
  # everything else through File.open. The block forms close the handle as
  # soon as the document has been parsed.
  document =
    if page.match?(%r{\A(?:https?|ftp)://}i)
      URI.open(page) { |io| Nokogiri::HTML(io) }
    else
      File.open(page) { |io| Nokogiri::HTML(io) }
    end

  container_id = args[:table_container_id] || "words-container"
  rows = document.search("//div[@id='#{container_id}']//tr")
  header_count = [args[:header_rows].to_i, 0].max

  entries = []
  rows.drop(header_count).each do |row|
    kids = row.children
    cells = [kids[0], kids[4], kids[6]]
    # Rows without all three expected child nodes cannot be turned into a
    # line; skip them instead of failing on nil.
    next if cells.any?(&:nil?)

    fields = cells.first(2).map { |cell| cell.text.strip }
    fields << cells.last.text.to_i.to_s
    entries << fields
  end

  FileUtils.mkdir_p("vocab")
  stamp = Time.now.to_i
  entries.each_slice(20).with_index do |group, slice_index|
    # to_s covers an empty second field, where split(...).last is nil.
    word = group.first[1].split(/\s/).last.to_s.gsub(/\W/, "")
    File.open("vocab/#{stamp}_#{slice_index * 20}_#{word}", "w") do |f|
      group.each { |fields| f.puts(fields.join("|")) }
    end
  end

  puts "parsed: #{entries.length}"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment