Last active
December 5, 2017 03:08
-
-
Save seven1m/5241bb427621392a924274c0e39c1585 to your computer and use it in GitHub Desktop.
download all images for twitter archive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# gem install http
# unzip archive.zip -d archive
# cd archive
# ruby archive.rb
require 'digest'
require 'fileutils'

require 'http'
# Downloads every remote media file referenced by an unzipped Twitter archive
# into ./media and rewrites the references to point at the local copies.
# Run from the archive root (the directory holding index.html and data/).

# Directory (relative to the archive root) where downloaded assets are stored.
MEDIA_DIR = 'media'

# Matches a double-quoted http(s) URL ending in a known media extension.
# Capture 1 is the URL without its extension; capture 2 is the extension
# including the leading dot.
MEDIA_URL_PATTERN = /"(http[^"]+)(\.(ico|png|gif|jpg|jpeg|mov|mp4|mpg|mpeg))"/i

# Replaces each quoted media URL in +data+ with a quoted local asset path.
#
# Yields the full URL and the target asset path for every match; the block
# reports whether the asset is available locally. When the block returns a
# falsy value the original text is kept verbatim, so a failed download never
# erases the reference from the file.
#
# @param data [String] file contents to scan (not modified)
# @param media_dir [String] directory prefix used for local asset paths
# @yieldparam url [String] full, unescaped URL including the extension
# @yieldparam asset_path [String] local path the asset should live at
# @yieldreturn [Boolean] whether the asset now exists at +asset_path+
# @return [String] a new string with the successfully localized references
def localize_media_urls(data, media_dir: MEDIA_DIR)
  data.gsub(MEDIA_URL_PATTERN) do
    # Hold on to the MatchData: the nested gsub below overwrites $~.
    match = Regexp.last_match
    ext = match[2]
    # The archive's JS/JSON escapes slashes as "\/"; undo that to get a real URL.
    url = match[1].gsub(%r{\\/}, '/')
    # File name is the MD5 of the extension-less URL plus the extension.
    asset_path = File.join(media_dir, Digest::MD5.hexdigest(url) + ext)
    yield(url + ext, asset_path) ? %("#{asset_path}") : match[0]
  end
end

# Ensures the asset at +url+ exists at +asset_path+, downloading it if needed.
# Failures are reported on stdout and never raised, so one bad URL does not
# abort the whole run.
#
# @param url [String] full URL of the asset
# @param asset_path [String] where to store the downloaded bytes
# @return [Boolean] true when the file is present locally afterwards
def fetch_media(url, asset_path)
  return true if File.exist?(asset_path)

  response = HTTP.follow.get(url)
  unless response.status.success?
    # Do not save error pages as if they were the media file.
    puts "#{url} could not be downloaded (HTTP #{response.status.code})"
    return false
  end

  # Binary write: media payloads must not go through newline/encoding translation.
  File.binwrite(asset_path, response.to_s)
  true
rescue HTTP::Error
  puts "#{url} could not be downloaded"
  false
end

FileUtils.mkdir_p(MEDIA_DIR)

# Skip files that are missing (e.g. no index.html) instead of aborting.
paths = (Dir['data/**/*.js'] + ['index.html']).select { |path| File.file?(path) }
paths.each_with_index do |path, index|
  puts "#{index + 1} of #{paths.size}"
  data = File.read(path)
  localized = localize_media_urls(data) do |url, asset_path|
    print '.'
    fetch_media(url, asset_path)
  end
  File.write(path, localized) unless localized == data
  puts
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment