Skip to content

Instantly share code, notes, and snippets.

@jenningsanderson
Created October 29, 2015 02:56
Show Gist options
  • Save jenningsanderson/bcd632427913f54f1c99 to your computer and use it in GitHub Desktop.
Save jenningsanderson/bcd632427913f54f1c99 to your computer and use it in GitHub Desktop.

### Getting Files in Parallel from GNIP

require 'parallel'
require 'json'
require 'open-uri'

# Load the GNIP job description; its "urlList" entry holds the URLs to fetch.
gnip_results_path = "/big_data/sandy/vuln_neighborhood_contextual/gnip_results.json"
results = JSON.parse(File.read(gnip_results_path))
# Bare expression: shows the number of URLs when evaluated interactively.
results["urlList"].count

# Marker searched for in every URL; the local file name starts at this token.
uuid = 'hpr4y8xh4'


$logfile = "gnip_log.txt"

# Appends +string+ followed by a newline to the file named by $logfile.
# The file is opened in append mode on every call (and created if missing),
# so existing content is never truncated.
#
# @param string [#to_s] the message to record
def log(string)
  File.write($logfile, "#{string}\n", mode: 'a')
end


# Directory that receives the downloaded .gz files. It must be assigned
# before it is scanned below (the original order raised NameError).
directory = "/big_data/sandy/vuln_neighborhood_contextual/gnip"

# Names of the .gz files already present in the download directory; the
# download loop uses this list to skip work that was finished earlier.
# Non-destructive `select` always returns an Array, whereas `select!`
# returns nil when no element is removed.
completed = Dir.entries(directory).select { |x| x.end_with? ".gz" }

# Scratch directory for temporary files.
d = "/big_data/sandy/tmp/"
# Only create it when missing: Dir.mkdir raises Errno::EEXIST for an existing
# directory, which made every re-run of the script fail at this line.
Dir.mkdir(d) unless Dir.exist?(d)
# Ruby's Dir.tmpdir (used by Tempfile) honors TMPDIR; presumably this keeps
# open-uri's temporary download buffers on the big data volume.
ENV["TMPDIR"] = d

# Number of worker processes to spread the downloads over.
processors = 24

# Cut the URL list into consecutive slices, one per worker. The "+ 1" keeps
# the slice size at least 1 and yields at most `processors` slices.
url_list = results["urlList"]
slice_size = url_list.count / processors + 1
files = url_list.each_slice(slice_size).to_a

puts "Splitting #{url_list.count} over #{processors} processors."

# Start every run with an empty log file.
File.write($logfile, '')

# Download every URL, one worker process per slice of `files`.
#
# For each URL the local file name is derived from the URL path, files that
# are already listed in `completed` are skipped, and every outcome is appended
# to the shared log via `log`.
Parallel.each_with_index(files, :in_processes => processors) do |slice, _pidx|
  slice.each do |url|
    # Declared up front so the rescue clause can always mention it.
    name = nil
    begin
      # Take the URL from the job uuid up to and including ".gz" and flatten
      # the path separators to build the receiving file name. `tr` always
      # returns a String; `gsub!` returned nil when there was no "/" to replace.
      name = url[url.index(uuid)..(url.index(".gz?") + 2)].tr("/", "_")

      if completed.include? name
        log("ALREADY DONE: " + name)
        next
      end

      target = directory + "/" + name
      # Download into a ".part" file and rename on success, so an interrupted
      # transfer never leaves a truncated ".gz" that a later run would treat
      # as already completed.
      partial = target + ".part"
      File.open(partial, "wb") do |new_file|
        # URI#open is provided by open-uri; Kernel#open no longer accepts
        # URLs on Ruby 3.0+.
        URI.parse(url).open('rb') do |read_file|
          # Stream the body instead of loading the whole archive into memory.
          IO.copy_stream(read_file, new_file)
        end
      end
      File.rename(partial, target)
      log("SUCCESS: " + name + " " + url)
    rescue => e
      # name may still be nil when the URL could not be parsed.
      log("ERROR: " + name.to_s + e.to_s + " \n " + e.backtrace.to_s)
      # Terminates this worker process on the first failure.
      # NOTE(review): with the parallel gem this presumably aborts the whole
      # run - confirm that fail-fast is intended.
      exit(1)
    end
  end
end
"done"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment