-
-
Save henrik/812918 to your computer and use it in GitHub Desktop.
# Google Art Project fullsize image downloader. | |
# By Henrik Nyh <http://henrik.nyh.se> 2011-02-05 under the MIT license. | |
# Requires Ruby and ImageMagick. | |
# | |
# NOTE: | |
# I'm afraid this script no longer works! See the Gist comments. | |
# | |
# Usage e.g.: | |
# ruby google_art_project.rb http://www.googleartproject.com/museums/tate/portrait-of-william-style-of-langley-174 | |
# | |
# You can specify multiple URLs on the command line, separated by space. | |
# Or you can specify no URLs on the command line and instead list them at the end of this file, one on each line, | |
# with "__END__" before the list. | |
# | |
# On OS X, it sets "Downloaded from" metadata and reveals in Finder. | |
# | |
# Can reportedly run on Windows as well, with Ruby from http://www.ruby-lang.org/en/downloads/ | |
# and ImageMagick from http://www.imagemagick.org/script/binary-releases.php#windows | |
# Note that you may need to edit the TEMP_DIRECTORY/OUTPUT_DIRECTORY below. | |
require "open-uri" | |
require "fileutils" | |
require "rbconfig" | |
module Kernel | |
def windows? | |
Config::CONFIG['host_os'].match(/mswin|windows|mingw/i) | |
end | |
end | |
class GAPDownloader | |
# Set this to "jpg" or "tif". | |
# jpg is a lot smaller but destructively compressed. | |
OUTPUT_EXTENSION = "jpg" | |
if windows? | |
# Case-sensitive. Use forward slashes, or double-escape backslashes. | |
TEMP_DIRECTORY = "C:/WINDOWS/Temp" | |
OUTPUT_DIRECTORY = TEMP_DIRECTORY | |
else | |
TEMP_DIRECTORY = "/tmp" | |
OUTPUT_DIRECTORY = "#{ENV['HOME']}/Downloads" | |
FileUtils.mkdir_p OUTPUT_DIRECTORY | |
end | |
# You can lower this if you get ridiculously high-res images otherwise. | |
MAX_ZOOM_ALLOWED = 10 | |
class RuntimeError < StandardError; end | |
def initialize(url) | |
ensure_image_magick! | |
@url = url | |
verify_url! | |
end | |
def download | |
get_image_id | |
determine_zoom | |
get_tiles | |
stitch_tiles | |
trim | |
set_metadata | |
done | |
end | |
private | |
def ensure_image_magick! | |
if !windows? && `which montage`.empty? | |
error "You must have ImageMagick installed. Could not find 'montage' in your PATH." | |
end | |
end | |
def verify_url! | |
unless @url.to_s.match(%r{\Ahttp://www\.googleartproject\.com/}) | |
error "Please specify a Google Art Project URL." | |
end | |
end | |
def get_image_id | |
@html = open(@url).read | |
# Reportedly the data-thumbnail can change in the middle of a long download session, but | |
# the encodedInfospotId will not. So if we key local files by the InfospotId, we can | |
# check for them if download fails and we start over. Also makes for more human names. | |
# If I run into it myself, I may adapt the code to auto-resolve a changed data-thumbnail. | |
@thumb_id = @html[/data-thumbnail="(.+?)"/, 1] | |
@perma_id = @html[/data-encodedInfospotId="(.+?)"/, 1] | |
unless @thumb_id && @perma_id | |
error "Couldn't find an image at this URL, sorry!" | |
end | |
end | |
def determine_zoom | |
0.upto(MAX_ZOOM_ALLOWED) do |zoom| | |
open(tile_url(0, 0, zoom)) | |
@max_zoom = zoom | |
end | |
rescue OpenURI::HTTPError => e | |
raise unless e.message == "404 Not Found" | |
end | |
def get_tiles | |
@max_x = 999 | |
@max_y = 999 | |
0.upto(@max_y) do |y| | |
0.upto(@max_x) do |x| | |
url = tile_url(x, y, @max_zoom) | |
path = tile_path(x, y) | |
if File.exists?(path) | |
puts "Skipping #{url} (already downloaded)..." | |
next | |
end | |
begin | |
data = open(url) # Raises at 404. | |
puts "Getting #{url}..." | |
File.open(path, "wb") { |f| f.print data.read } | |
rescue OpenURI::HTTPError => e | |
raise unless e.message == "404 Not Found" | |
if y.zero? | |
# Found max x. Start on next row. | |
@max_x = x - 1 | |
break | |
else | |
# Found max y. We have all tiles, so bail. | |
@max_y = y - 1 | |
return | |
end | |
end | |
end | |
end | |
end | |
def stitch_tiles | |
# `montage` is ImageMagick. | |
# We first stitch together the tiles of each row, then stitch all rows. | |
# Stitching the full image all at once can get extremely inefficient for large images. | |
tiles_wide = @max_x + 1 | |
tiles_high = @max_y + 1 | |
puts "Stitching #{tiles_wide} x #{tiles_high} = #{tiles_wide*tiles_high} tiles..." | |
0.upto(@max_y) do |y| | |
tiles = (0..@max_x).map { |x| tile_path(x, y) }.join(' ') | |
`montage #{tiles} -geometry +0+0 -tile #{tiles_wide}x1 #{row_path(y)}` | |
end | |
tiles = (0..@max_y).map { |y| row_path(y) }.join(' ') | |
`montage #{tiles} -geometry +0+0 -tile 1x#{tiles_high} #{full_path}` | |
end | |
def trim | |
# Trim the black blocks that may appear on right and bottom. | |
# We first add a black border to ensure no other color is trimmed, as described on | |
# http://www.imagemagick.org/Usage/crop/#trim | |
`convert #{full_path} -bordercolor black -border 1x1 -trim #{full_path}` | |
end | |
def set_metadata | |
# 300 DPI instead of 72 DPI; more sane for printing. | |
`convert #{full_path} -density 300 #{full_path}` | |
if !windows? && !`which xattr`.empty? | |
# Set "Downloaded from" Finder metadata, like Safari does. | |
system('xattr', '-w', 'com.apple.metadata:kMDItemWhereFroms', @url, full_path) | |
end | |
end | |
def done | |
puts "Done: #{full_path}" | |
# Reveal in Finder if on OS X. | |
unless windows? | |
`which osascript && osascript -e 'tell app "Finder"' -e 'reveal POSIX file "#{full_path}"' -e 'activate' -e 'end'` | |
end | |
end | |
def error(message) | |
raise GAPDownloader::RuntimeError, "#{message} (#{@url})" | |
end | |
def tile_url(x, y, zoom) | |
# The subdomain can seemingly be anything from lh3 to lh6. | |
"http://lh5.ggpht.com/#{@thumb_id}=x#{x}-y#{y}-z#{zoom}" | |
end | |
def tile_path(x, y) | |
File.join(TEMP_DIRECTORY, "gap-#{@perma_id}-tile-#{x}-#{y}.jpg") | |
end | |
def row_path(y) | |
File.join(TEMP_DIRECTORY, "gap-#{@perma_id}-row-#{@max_zoom}-#{y}.#{OUTPUT_EXTENSION}") | |
end | |
def full_path | |
File.join(OUTPUT_DIRECTORY, "#{@perma_id}.#{OUTPUT_EXTENSION}") | |
end | |
end | |
if __FILE__ == $0 | |
urls = ARGV.any? ? ARGV : (defined?(DATA) ? DATA.read.strip.split("\n") : []) | |
puts "Error: No URLs given!" if urls.empty? | |
urls.each do |url| | |
begin | |
GAPDownloader.new(url).download | |
rescue GAPDownloader::RuntimeError => e | |
puts "Error: #{e.message}" | |
end | |
end | |
end |
Again, this script no longer works. Emelyanenko Kirill mailed me about a script that is supposed to work, though: https://github.com/EmelyanenkoK/GAPDownloader Haven't tried it myself.
Hi henrik,
I try your code on Mac Lion and I get error urllib2.HTTPError: HTTP Error 403: Forbidden:
'''felixmatoMacBook-Pro:EmelyanenkoK-GAPDownloader-41f7003 felix$ python extractionGoogleArtProject_Unix.py
(http://www.googleartproject.com/collection/moma-the-museum-of-modern-art/artwork/the-starry-night-vincent-van-gogh/320268/)
BZ/7D6G3mWJ+JNvVDnZV/jcxS+E=
(http://lh6.ggpht.com/JFp6OJr9g8KABxRwCpABRdfc0Od2SQsguNwtn0qhvOkxkeFYZhQGrg=x0-y0-z4-tBZ_7D6G3mWJ_JNvVDnZV_jcxS_E)
Traceback (most recent call last):
File "extractionGoogleArtProject_Unix.py", line 290, in
telechargerOeuvre(i[0], i[1], i[2])
File "extractionGoogleArtProject_Unix.py", line 255, in telechargerOeuvre
telechargerTableau(urlImage, normaliserNomFichier(nomFichierImage), zoom)
File "extractionGoogleArtProject_Unix.py", line 215, in telechargerTableau
telechargerTousFragments(urlImage, xMax, yMax, zoom)
File "extractionGoogleArtProject_Unix.py", line 138, in telechargerTousFragments
telechargerFragment(urlImage, cheminFragment, x, y, zoom)
File "extractionGoogleArtProject_Unix.py", line 122, in telechargerFragment
contenuFragment = getContenuUrl(urlFragment, refererGoogleArtProject)
File "extractionGoogleArtProject_Unix.py", line 32, in getContenuUrl
return urllib2.urlopen(requete).read()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 400, in open
response = meth(req, response)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 438, in error
return self._call_chain(_args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 372, in _call_chain
result = func(_args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 521, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
Hi henrik,
I try your code on Mac Lion and get error (urllib2.HTTPError: HTTP Error 403: Forbidden).
Detail:
felixmatoMacBook-Pro:EmelyanenkoK-GAPDownloader-41f7003 felix$ python extractionGoogleArtProject_Unix.py
(http://www.googleartproject.com/collection/moma-the-museum-of-modern-art/artwork/the-starry-night-vincent-van-gogh/320268/)
BZ/7D6G3mWJ+JNvVDnZV/jcxS+E=
(http://lh6.ggpht.com/JFp6OJr9g8KABxRwCpABRdfc0Od2SQsguNwtn0qhvOkxkeFYZhQGrg=x0-y0-z4-tBZ_7D6G3mWJ_JNvVDnZV_jcxS_E)
Traceback (most recent call last):
File "extractionGoogleArtProject_Unix.py", line 290, in
telechargerOeuvre(i[0], i[1], i[2])
File "extractionGoogleArtProject_Unix.py", line 255, in telechargerOeuvre
telechargerTableau(urlImage, normaliserNomFichier(nomFichierImage), zoom)
File "extractionGoogleArtProject_Unix.py", line 215, in telechargerTableau
telechargerTousFragments(urlImage, xMax, yMax, zoom)
File "extractionGoogleArtProject_Unix.py", line 138, in telechargerTousFragments
telechargerFragment(urlImage, cheminFragment, x, y, zoom)
File "extractionGoogleArtProject_Unix.py", line 122, in telechargerFragment
contenuFragment = getContenuUrl(urlFragment, refererGoogleArtProject)
File "extractionGoogleArtProject_Unix.py", line 32, in getContenuUrl
return urllib2.urlopen(requete).read()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 400, in open
response = meth(req, response)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 438, in error
return self._call_chain(_args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 372, in _call_chain
result = func(_args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 521, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
Hi henrik, I try your new code on Mac Lion and get urllib2.HTTPError: HTTP Error 403: Forbidden.
Someone suggested this solution: http://www.student.tugraz.at/kollmitzer/gap_howto.html
or try gigafineart.heroku.com
gigafineart.heroku.com works, but does anybody know how they do it?
I'm kind of make it work, see a demo: https://www.dropbox.com/s/gih93ye4v1y6lsu/the_starry_night.jpg?dl=0
I am currently getting black canvases when downloading, and have seen comments of others getting the same
NOWADAYS THIS SCRIPT DOES NOT WORK FOR ME. WHAT IS THE REASON?
I AM IN WINDOWS XP AND HERE IS THE ERROR MESSAGE: COULDN`T FIND AND IMAGE AT THIS URL SORRY