Last active
April 4, 2016 18:56
-
-
Save troyericg/9a9354dee3f0a345cd4d81ce29d7c97a to your computer and use it in GitHub Desktop.
ruby script for downloading public/published google spreadsheets as csvs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'fileutils' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'csv' | |
############### | |
# GET PUBLIC: # A script for downloading/parsing public google spreadsheets | |
############### | |
## The key to your public spreadsheet | |
DOC_KEY = "<KEY-GOES-HERE/>" | |
## Add folder name here specify download path | |
DIR_PATH = "gp_output/" # don't forget trailing slash | |
## What each sheet of your spreadsheet will be prepended with | |
SHEET_PREFIX = "gp_sheet-" | |
## Amount of time your script will wait before trying to parse downloaded file. Adjust depending on your connection speed | |
SLEEP_AMOUNT = 20 | |
## Temporary file to be created and deleted | |
SRC_FILENAME = "gp_output_raw_tmp.html" | |
SRC_FILEPATH = "#{DIR_PATH}#{SRC_FILENAME}" | |
DOC_PATH = "https://docs.google.com/spreadsheet/pub?key=#{DOC_KEY}&single=true&ndplr=1&output=csv" | |
def init | |
puts ":: Starting parsing process ::" | |
puts ":: URL = #{DOC_PATH}" | |
puts ":: FILE PATH = #{SRC_FILEPATH}" | |
puts ":: FILE PREFIX = #{SHEET_PREFIX}" | |
puts "----------------" | |
puts | |
get_doc | |
end | |
def get_doc | |
puts ":: Downloading html ::" | |
# MAKE FOLDER IF IT DOESN'T ALREADY EXIST | |
dir_name = DIR_PATH.split("/")[0] unless DIR_PATH[-1, 1] != "/" | |
Dir.mkdir(dir_name) unless File.exists?(dir_name) | |
cmd = "wget -O #{SRC_FILEPATH} #{DOC_PATH}" | |
system cmd | |
sleep SLEEP_AMOUNT | |
puts "(Okay, file should be downloaded by now, time to start parsing...)" | |
puts | |
parse | |
end | |
def parse | |
puts ":: Parsing downloaded html ::" | |
doc = Nokogiri::HTML(open(SRC_FILEPATH)) | |
tables = doc.css('table') | |
sections = [] | |
tables.each_with_index do |table,i| | |
section = { | |
'name' => "#{SHEET_PREFIX}#{i}", | |
'rows' => table.css('tbody tr') | |
} | |
sections << section | |
end | |
make_csvs(sections) | |
end | |
def make_csvs(sections) | |
sections.each do |sect| | |
section_fname = sect['name'] | |
section_file = "#{DIR_PATH}#{section_fname}.csv" | |
unless File.exists?(section_file) | |
puts "Making #{section_fname}... " | |
csv = CSV.open(section_file, 'w',{:col_sep => ",", :quote_char => '\'', :force_quotes => true}) | |
sect["rows"].each do |row| | |
tarray = [] | |
row.css('td').each do |cell| | |
tarray << cell.text | |
end | |
csv << tarray | |
end | |
csv.close | |
end | |
puts "|_ Generated #{section_fname}.csv!" | |
end | |
puts | |
puts "(deleting source file now...)" | |
system "rm #{SRC_FILEPATH}" | |
puts | |
puts "All done!" | |
end | |
# START ME UP | |
init |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment