Skip to content

Instantly share code, notes, and snippets.

@takafumir
Last active October 20, 2023 01:00
Show Gist options
  • Save takafumir/f633bb4024bd3f70902b to your computer and use it in GitHub Desktop.
Save takafumir/f633bb4024bd3f70902b to your computer and use it in GitHub Desktop.
Twitpic downloader with Ruby
# Twitpic downloader with Ruby
#
# This tool enables you to save all your twitpic full-size images.
# Confirmed this tool working with Ruby 2.1.2.
#
# Usage
# $ mkdir work_dir
# $ ruby twitpic_downloader.rb user_name work_dir
#
# MIT License
# Copyright (c) 2014 Takafumi Yamano
require 'date'
require 'open-uri'
# Prepare for saving images.
#
# Command-line arguments:
#   ARGV[0] - twitpic user name whose photos are downloaded (required)
#   ARGV[1] - existing working directory; "images" and "html" subdirectories
#             are created inside it when missing (required)
USER_NAME = ARGV[0].to_s
WORK_DIR = ARGV[1].to_s
# When this is not 1, the script stops before saving the full-size images.
IMG_SAVE = 1
# Common prefix of every file name this script writes.
PREFIX = "twitpic-#{USER_NAME}"

# `abort` prints the message to stderr and exits with status 1, so a failed
# invocation is distinguishable from a successful one (plain `exit` returns 0).
abort "Error: You must supply your twitpic USER_NAME." if USER_NAME.empty?

# Dir.exist? replaces Dir.exists?, which was removed in Ruby 3.2.
abort "Error: You must create the WORK_DIR beforehand." unless Dir.exist?(WORK_DIR)

%w[images html].each do |subdir|
  path = "#{WORK_DIR}/#{subdir}"
  Dir.mkdir(path) unless Dir.exist?(path)
end
# Download the user's twitpic listing pages, one HTML file per page.
#
# Pages are cached in WORK_DIR/html: a page that is already on disk is not
# fetched again. Paging stops at the first page whose HTML does not contain
# the text "Next".
page = 1
loop do
  puts "page: #{page}"
  input_url = "http://twitpic.com/photos/#{USER_NAME}?page=#{page}"
  output_file = "#{WORK_DIR}/html/#{PREFIX}-page-#{page}.html"
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  unless File.exist?(output_file)
    puts "download html: #{input_url}"
    # Fetch first, write second: creating the file before the request would
    # leave an empty cache file behind when the request fails, and later runs
    # would mistake it for a completed (and final) page.
    # URI.open (open-uri, Ruby 2.5+) is used because Kernel#open no longer
    # opens URLs since Ruby 3.0.
    html = URI.open(input_url, &:read)
    File.write(output_file, html)
  end
  break unless File.read(output_file).include?("Next")
  page += 1
end
# Collect the image ids linked from every cached listing page.
# The resulting list is de-duplicated, sorted, and excludes the "sopapipa" link.
image_ids = Dir.glob("#{WORK_DIR}/html/#{PREFIX}-page-*").flat_map do |listing_path|
  File.read(listing_path).scan(/<a href="\/([a-zA-Z0-9]+)">/).flatten
end
image_ids = image_ids.uniq.reject { |image_id| image_id == "sopapipa" }.sort
# Download the "full" HTML page of every image id.
#
# Pages are cached in WORK_DIR/html: an id whose page is already on disk is
# not fetched again.
image_ids.each_with_index do |id, index|
  puts "#{index+1}: #{id}"
  full_url = "http://twitpic.com/#{id}/full"
  full_file = "#{WORK_DIR}/html/#{PREFIX}-#{id}-full.html"
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  next if File.exist?(full_file)

  puts "download full url: #{full_url}"
  # Fetch first, write second: creating the file before the request would
  # leave an empty cache file behind when the request fails, and later runs
  # would then never retry this id.
  # URI.open (open-uri, Ruby 2.5+) is used because Kernel#open no longer
  # opens URLs since Ruby 3.0.
  html = URI.open(full_url, &:read)
  File.write(full_file, html)
end
# Map every image id to the first https <img> source found in its cached
# "full" page; the value is nil when the page has no such source.
full_image_urls = image_ids.each_with_object({}) do |image_id, urls_by_id|
  page_html = File.read("#{WORK_DIR}/html/#{PREFIX}-#{image_id}-full.html")
  img_sources = page_html.scan(/<img src="([^"]*)"/).flatten
  https_sources = img_sources.map { |src| src[/(https:\/\/[^"]*)/, 1] }.compact
  urls_by_id[image_id] = https_sources[0]
end
# Download the full-size images into WORK_DIR/images.
#
# Nothing is saved unless IMG_SAVE is 1. Ids without a URL are skipped, and so
# are images whose file already exists. A failed download is reported on
# stderr and the script continues with the next image.
unless IMG_SAVE == 1
  puts "Warning: Didn't save full size images yet."
  puts "Warning: Change IMG_SAVE to 1 in order to save full images."
  exit
end
full_image_urls.each_with_index do |(id, url), index|
  puts "#{index+1}: #{id}"
  next if url.to_s.empty?

  # The extension is taken from URLs ending in ".<letters>?<digits>".
  # NOTE(review): for any other URL shape this is nil and the file name ends
  # with a bare "." -- confirm whether other shapes can occur.
  extension = url.scan(/\.([a-zA-Z]+)\?[0-9]+\z/).flatten[0]
  full_image_file = "#{WORK_DIR}/images/#{PREFIX}-#{id}-full.#{extension}"
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  next if File.exist?(full_image_file)

  puts "save full image: #{url}"
  begin
    # Download first, write second: opening the target file before the request
    # would leave a zero-byte image behind on failure, which later runs would
    # skip as "already saved".
    # URI.open (open-uri, Ruby 2.5+) is used because Kernel#open no longer
    # opens URLs since Ruby 3.0.
    image_data = URI.open(url, 'rb', &:read)
    File.binwrite(full_image_file, image_data)
  rescue StandardError => e
    # Best effort: report the failure instead of hiding it, and remove any
    # partially written file so the image is retried on the next run.
    warn "Warning: failed to save #{url} (#{e.class}: #{e.message})"
    File.delete(full_image_file) if File.exist?(full_image_file)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment