Created
October 27, 2020 00:18
-
-
Save Snarp/d324a3130a12f8d8d9fd1be0bc985336 to your computer and use it in GitHub Desktop.
Given a Google Drive folder URL, attempts to extract a list of Google Doc names and IDs.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'yaml' | |
require 'faraday' | |
require 'nokogiri' | |
# REVIEW: May not work for directories with larger numbers of files.
# REVIEW: Formatting will likely be different for non-Google-Docs files.
# REVIEW: This approach is brittle: it scrapes inline page JavaScript and evals it.
# Fetches a Google Drive folder page and extracts the title and ID of each
# entry from the inline `window['_DRIVE_ivd']` script embedded in the HTML.
#
# @param gdrive_folder_url [String] URL of the Drive folder page to fetch.
# @param output_fname [String, nil] when given, the result is also written to
#   this path as YAML.
# @return [Array<Hash>, nil] one `{ title:, id: }` hash per entry, an empty
#   array when the listing is absent, or nil when no script tag containing
#   `window['_DRIVE_ivd']` is found.
def scrape_folder_contents(gdrive_folder_url, output_fname=nil)
  html = Faraday.get(gdrive_folder_url).body
  marker = "window['_DRIVE_ivd']"

  script = Nokogiri::HTML(html).css('script').find do |node|
    node.inner_html.include?(marker)
  end
  return nil unless script

  js = script.inner_html.strip

  # Clean off cruft to get raw array describing folder contents.
  js.sub!("window['_DRIVE_ivd'] = '", '')
  js.sub!("';if (window['_DRIVE_ivdc']) {window['_DRIVE_ivdc']();}", '')

  # Clean up array formatting so Ruby can evaluate it. The patterns below are
  # single-quoted, so they match a literal backslash followed by `n` or `/`.
  js = convert_hex_chars(js)
  js.gsub!('\n', ' ')
  js.gsub!('\/', '/')

  # Remove all null values, since `null` is not a Ruby literal.
  # REVIEW: May need to skip this when trying to parse non-Google-Docs files
  js.gsub!(',null', '')

  # SECURITY: `eval` executes text taken from a remote page as Ruby code.
  # Anything Ruby would interpret inside that text (for example `#{...}` in a
  # double-quoted string) runs with this process's privileges.
  # TODO(review): the payload is presumably parseable with JSON.parse, which
  # would also make the null-stripping above unnecessary -- verify against
  # real responses before switching.
  #
  # Evaluating the data gives an array whose first item is an array of
  # subarrays, each describing one document within the GDrive folder.
  data = eval(js)

  # A missing listing yields an empty result instead of a NoMethodError.
  entries = data.first || []
  folder_contents = entries.map do |entry|
    {
      title: entry[2], # The 3rd element in each doc array is its title
      id: entry[0],    # The 1st is the doc's ID
    }
  end

  File.write(output_fname, folder_contents.to_yaml) if output_fname
  folder_contents
end
# Replaces every `\xNN` escape (a literal backslash, `x`, and two hex digits)
# in +str+ with the character for that code point, modifying +str+ in place.
#
# The substitution is done in a single left-to-right pass, so text produced by
# one escape is never re-read as the start of another (e.g. `\x5cx41` becomes
# `\x41`, not `A`). Sequences whose two characters are not hex digits are left
# untouched.
#
# @param str [String] the string to decode; mutated when it contains escapes.
# @return [String] the same +str+ object.
def convert_hex_chars(str)
  escape = /\\x(\h\h)/
  # Skip the mutating call when nothing matches so that a frozen string
  # without escapes still passes through unchanged.
  return str unless str.match?(escape)

  # Block form: the replacement text is inserted verbatim, with no backslash
  # interpretation of the decoded character.
  str.gsub!(escape) { hex_to_unicode(Regexp.last_match(1)) }
  str
end
# Converts a hex code such as `5b` (optionally prefixed with a literal `\x`)
# into the corresponding single-character UTF-8 string.
#
# @param str [String] hex digits, with or without a literal `\x` in front.
# @return [String] the UTF-8 character for that code point.
def hex_to_unicode(str='\x5b')
  hex_digits = str.sub('\x', '') # single-quoted: a literal backslash + "x"
  codepoint = hex_digits.hex
  codepoint.chr('UTF-8')
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment