Snarp · October 27, 2020 00:18
diff --git a/scrape_google_drive_folder_contents.rb b/scrape_google_drive_folder_contents.rb
 require 'yaml'
 require 'faraday'
 require 'nokogiri'

 # REVIEW: May not work for directories with larger numbers of files.
 # REVIEW: Formatting will likely be different for non-Google-Docs files.
 # REVIEW: This is stupid.
 # Fetches a Google Drive folder page and lists the documents found in it.
 #
 # The page is expected to contain a <script> tag that assigns a JS string
 # to window['_DRIVE_ivd']. That string is extracted, rewritten into text
 # Ruby can evaluate, and each document's ID and title are read from the
 # resulting array.
 #
 # @param gdrive_folder_url [String] URL of the Drive folder page to fetch
 # @param output_fname [String, nil] when given, the result is also written
 #   to this path as YAML
 # @return [Array<Hash>, nil] one `{ title:, id: }` hash per document, or
 #   nil when no script tag mentioning window['_DRIVE_ivd'] is found
 def scrape_folder_contents(gdrive_folder_url, output_fname=nil)
   html = Faraday.get(gdrive_folder_url).body
   doc  = Nokogiri::HTML(html)

   # Locate the first script tag that carries the embedded folder listing.
   listing_script = doc.css('script').find do |script|
     script.inner_html.include?("window['_DRIVE_ivd']")
   end
   return nil unless listing_script

   js = listing_script.inner_html.strip

   # Clean off cruft to get raw array describing folder contents.
   js.sub!("window['_DRIVE_ivd'] = '", '')
   js.sub!("';if (window['_DRIVE_ivdc']) {window['_DRIVE_ivdc']();}", '')

   # Clean up array formatting so Ruby can evaluate it. The single-quoted
   # patterns below match the literal two-character sequences \n and \/,
   # not an actual newline.
   js = convert_hex_chars(js)
   js.gsub!('\n', ' ')
   js.gsub!('\/', '/')

   # Remove all null values, since `null` is not something Ruby can evaluate.
   # REVIEW: May need to skip this when trying to parse non-Google-Docs files
   # NOTE(review): this is a plain text replacement, so it shifts the
   # position of every later element and would also alter a title that
   # happens to contain ",null".
   js.gsub!(',null', '')

   # SECURITY: eval executes text downloaded from the network as Ruby code.
   # Anything able to influence the page body can run arbitrary code here.
   # Left in place deliberately (swapping parsers changes what is accepted);
   # consider a data-only parser such as JSON.parse before using this on
   # input you do not fully trust.
   #
   # Evaluating the data contained in this Javascript gives an array of 3
   # items, the first of which will be an array of subarrays, each describing
   # one document within the GDrive folder.
   data    = eval(js)
   doc_arr = data.first

   folder_contents = doc_arr.map do |subarr|
     {
       title: subarr[2], # The 3rd element in each doc array is its title
       id:    subarr[0], # The 1st is the doc's ID
     }
   end

   File.write(output_fname, folder_contents.to_yaml) if output_fname
   folder_contents
 end

 # Replaces every literal `\xHH` escape (backslash, "x", two hex digits) in
 # +str+ with the UTF-8 character whose code point is HH.
 #
 # The string is rewritten in a single pass, so text produced by decoding
 # one escape (e.g. the backslash from `\x5c`) is never re-read as the start
 # of another escape. Sequences whose two characters are not hex digits are
 # left untouched.
 #
 # @param str [String] text containing `\xHH` escapes; modified in place
 # @return [String] the same (mutated) string object
 def convert_hex_chars(str)
   # gsub! returns nil when nothing matched, so return str explicitly.
   str.gsub!(/\\x(\h\h)/) { Regexp.last_match(1).hex.chr('UTF-8') }
   str
 end

 # Turns a hexadecimal code (optionally written with a leading `\x`) into
 # the corresponding character, encoded as UTF-8.
 #
 # @param str [String] hex digits such as "5b", or an escape such as '\x5b'
 # @return [String] the single character with that code point
 def hex_to_unicode(str='\x5b')
   digits     = str.sub('\x', '') # drop the first literal backslash-x, if any
   code_point = digits.hex
   code_point.chr('UTF-8')
 end
	require 'yaml'
	require 'faraday'
	require 'nokogiri'

	# REVIEW: May not work for directories with larger numbers of files.
	# REVIEW: Formatting will likely be different for non-Google-Docs files.
	# REVIEW: This is stupid.
	# Fetches a Google Drive folder page and lists the documents found in it.
	#
	# The page is expected to contain a <script> tag that assigns a JS string
	# to window['_DRIVE_ivd']. That string is extracted, rewritten into text
	# Ruby can evaluate, and each document's ID and title are read from the
	# resulting array.
	#
	# @param gdrive_folder_url [String] URL of the Drive folder page to fetch
	# @param output_fname [String, nil] when given, the result is also written
	#   to this path as YAML
	# @return [Array<Hash>, nil] one `{ title:, id: }` hash per document, or
	#   nil when no script tag mentioning window['_DRIVE_ivd'] is found
	def scrape_folder_contents(gdrive_folder_url, output_fname=nil)
	  html = Faraday.get(gdrive_folder_url).body
	  doc  = Nokogiri::HTML(html)

	  # Locate the first script tag that carries the embedded folder listing.
	  listing_script = doc.css('script').find do |script|
	    script.inner_html.include?("window['_DRIVE_ivd']")
	  end
	  return nil unless listing_script

	  js = listing_script.inner_html.strip

	  # Clean off cruft to get raw array describing folder contents.
	  js.sub!("window['_DRIVE_ivd'] = '", '')
	  js.sub!("';if (window['_DRIVE_ivdc']) {window['_DRIVE_ivdc']();}", '')

	  # Clean up array formatting so Ruby can evaluate it. The single-quoted
	  # patterns below match the literal two-character sequences \n and \/,
	  # not an actual newline.
	  js = convert_hex_chars(js)
	  js.gsub!('\n', ' ')
	  js.gsub!('\/', '/')

	  # Remove all null values, since `null` is not something Ruby can evaluate.
	  # REVIEW: May need to skip this when trying to parse non-Google-Docs files
	  # NOTE(review): this is a plain text replacement, so it shifts the
	  # position of every later element and would also alter a title that
	  # happens to contain ",null".
	  js.gsub!(',null', '')

	  # SECURITY: eval executes text downloaded from the network as Ruby code.
	  # Anything able to influence the page body can run arbitrary code here.
	  # Left in place deliberately (swapping parsers changes what is accepted);
	  # consider a data-only parser such as JSON.parse before using this on
	  # input you do not fully trust.
	  #
	  # Evaluating the data contained in this Javascript gives an array of 3
	  # items, the first of which will be an array of subarrays, each describing
	  # one document within the GDrive folder.
	  data    = eval(js)
	  doc_arr = data.first

	  folder_contents = doc_arr.map do |subarr|
	    {
	      title: subarr[2], # The 3rd element in each doc array is its title
	      id:    subarr[0], # The 1st is the doc's ID
	    }
	  end

	  File.write(output_fname, folder_contents.to_yaml) if output_fname
	  folder_contents
	end

	# Replaces every literal `\xHH` escape (backslash, "x", two hex digits) in
	# +str+ with the UTF-8 character whose code point is HH.
	#
	# The string is rewritten in a single pass, so text produced by decoding
	# one escape (e.g. the backslash from `\x5c`) is never re-read as the start
	# of another escape. Sequences whose two characters are not hex digits are
	# left untouched.
	#
	# @param str [String] text containing `\xHH` escapes; modified in place
	# @return [String] the same (mutated) string object
	def convert_hex_chars(str)
	  # gsub! returns nil when nothing matched, so return str explicitly.
	  str.gsub!(/\\x(\h\h)/) { Regexp.last_match(1).hex.chr('UTF-8') }
	  str
	end

	# Turns a hexadecimal code (optionally written with a leading `\x`) into
	# the corresponding character, encoded as UTF-8.
	#
	# @param str [String] hex digits such as "5b", or an escape such as '\x5b'
	# @return [String] the single character with that code point
	def hex_to_unicode(str='\x5b')
	  digits     = str.sub('\x', '') # drop the first literal backslash-x, if any
	  code_point = digits.hex
	  code_point.chr('UTF-8')
	end