-
-
Save henrik/812918 to your computer and use it in GitHub Desktop.
# Google Art Project fullsize image downloader.
# By Henrik Nyh <http://henrik.nyh.se> 2011-02-05 under the MIT license.
# Requires Ruby and ImageMagick.
#
# NOTE:
# I'm afraid this script no longer works! See the Gist comments.
#
# Usage e.g.:
# ruby google_art_project.rb http://www.googleartproject.com/museums/tate/portrait-of-william-style-of-langley-174
#
# You can specify multiple URLs on the command line, separated by spaces.
# Or you can specify no URLs on the command line and instead list them at the end of this file, one on each line,
# with "__END__" before the list.
#
# On OS X, it sets "Downloaded from" metadata and reveals the file in Finder.
#
# Can reportedly run on Windows as well, with Ruby from http://www.ruby-lang.org/en/downloads/
# and ImageMagick from http://www.imagemagick.org/script/binary-releases.php#windows
# Note that you may need to edit the TEMP_DIRECTORY/OUTPUT_DIRECTORY below.
require "open-uri" | |
require "fileutils" | |
require "rbconfig" | |
module Kernel
  # Tells whether the running Ruby was built for a Windows host.
  #
  # Reads the host OS from RbConfig (the old top-level `Config` constant was
  # deprecated and later removed from Ruby, so referencing it raises NameError).
  #
  # Returns true when the host OS name contains "mswin", "windows" or "mingw"
  # (case-insensitive), false otherwise.
  def windows?
    !!(RbConfig::CONFIG["host_os"] =~ /mswin|windows|mingw/i)
  end
end
# Downloads the full-size image behind a Google Art Project page.
#
# The page HTML is scraped for a thumbnail id and a permanent id; tiles are
# then fetched at the highest zoom level that answers, cached in
# TEMP_DIRECTORY, stitched row by row with ImageMagick's `montage`, trimmed
# with `convert`, and written to OUTPUT_DIRECTORY.
#
# Usage:
#   GAPDownloader.new(url).download
#
# Raises GAPDownloader::RuntimeError for expected failures (bad URL, missing
# ImageMagick, no image found, failed ImageMagick command). HTTP errors other
# than 404 propagate as OpenURI::HTTPError.
class GAPDownloader
  # Set this to "jpg" or "tif".
  # jpg is a lot smaller but destructively compressed.
  OUTPUT_EXTENSION = "jpg"

  if windows?
    # Case-sensitive. Use forward slashes, or double-escape backslashes.
    TEMP_DIRECTORY = "C:/WINDOWS/Temp"
    OUTPUT_DIRECTORY = TEMP_DIRECTORY
  else
    TEMP_DIRECTORY = "/tmp"
    OUTPUT_DIRECTORY = "#{ENV['HOME']}/Downloads"
    FileUtils.mkdir_p OUTPUT_DIRECTORY
  end

  # You can lower this if you get ridiculously high-res images otherwise.
  MAX_ZOOM_ALLOWED = 10

  # Starting upper bound for tile columns/rows; the real bounds are discovered
  # by probing until a tile request answers 404.
  MAX_TILE_INDEX = 999

  class RuntimeError < StandardError; end

  # url - String address of a Google Art Project artwork page.
  #
  # Raises GAPDownloader::RuntimeError if ImageMagick is missing or the URL
  # is not a Google Art Project URL.
  def initialize(url)
    # Assign first so that error messages raised below can include the URL.
    @url = url
    ensure_image_magick!
    verify_url!
  end

  # Runs the whole pipeline: scrape ids, probe zoom, fetch tiles, stitch,
  # trim, tag metadata and report the result.
  def download
    get_image_id
    determine_zoom
    get_tiles
    stitch_tiles
    trim
    set_metadata
    done
  end

  private

  # Fails early when `montage` is not on the PATH (check skipped on Windows,
  # where `which` is not available).
  def ensure_image_magick!
    if !windows? && `which montage`.empty?
      error "You must have ImageMagick installed. Could not find 'montage' in your PATH."
    end
  end

  def verify_url!
    unless @url.to_s.match(%r{\Ahttp://www\.googleartproject\.com/})
      error "Please specify a Google Art Project URL."
    end
  end

  # Scrapes the page for the thumbnail id (used in tile URLs) and the
  # permanent id (used in local file names).
  def get_image_id
    @html = fetch(@url)

    # Reportedly the data-thumbnail can change in the middle of a long download session, but
    # the encodedInfospotId will not. So if we key local files by the InfospotId, we can
    # check for them if download fails and we start over. Also makes for more human names.
    # If I run into it myself, I may adapt the code to auto-resolve a changed data-thumbnail.
    @thumb_id = @html[/data-thumbnail="(.+?)"/, 1]
    @perma_id = @html[/data-encodedInfospotId="(.+?)"/, 1]

    unless @thumb_id && @perma_id
      error "Couldn't find an image at this URL, sorry!"
    end

    # The id comes from remote HTML and ends up in file names, so neutralise
    # path separators, quotes and other unexpected characters.
    @file_id = @perma_id.gsub(/[^\w.=-]/, "_")
  end

  # Finds the highest zoom level (up to MAX_ZOOM_ALLOWED) for which the
  # top-left tile exists and stores it in @max_zoom.
  #
  # Raises GAPDownloader::RuntimeError when not even zoom level 0 exists,
  # instead of continuing with a nil zoom.
  def determine_zoom
    @max_zoom = nil
    0.upto(MAX_ZOOM_ALLOWED) do |zoom|
      break unless fetch_tile(tile_url(0, 0, zoom))
      @max_zoom = zoom
    end
    error "Couldn't find any image tiles at this URL, sorry!" unless @max_zoom
  end

  # Downloads every tile at @max_zoom into TEMP_DIRECTORY, skipping tiles that
  # are already on disk, and records the grid size in @max_x / @max_y.
  def get_tiles
    @max_x = MAX_TILE_INDEX
    @max_y = MAX_TILE_INDEX

    0.upto(@max_y) do |y|
      0.upto(@max_x) do |x|
        url = tile_url(x, y, @max_zoom)
        path = tile_path(x, y)

        if File.exist?(path)
          puts "Skipping #{url} (already downloaded)..."
          next
        end

        data = fetch_tile(url) # nil at 404.
        if data
          puts "Getting #{url}..."
          File.open(path, "wb") { |f| f.write(data) }
        elsif y.zero?
          # Found max x. Start on next row.
          @max_x = x - 1
          break
        else
          # Found max y. We have all tiles, so bail.
          @max_y = y - 1
          return
        end
      end
    end
  end

  def stitch_tiles
    # `montage` is ImageMagick.
    # We first stitch together the tiles of each row, then stitch all rows.
    # Stitching the full image all at once can get extremely inefficient for large images.
    tiles_wide = @max_x + 1
    tiles_high = @max_y + 1

    puts "Stitching #{tiles_wide} x #{tiles_high} = #{tiles_wide*tiles_high} tiles..."

    0.upto(@max_y) do |y|
      tiles = (0..@max_x).map { |x| tile_path(x, y) }
      run!("montage", *tiles, "-geometry", "+0+0", "-tile", "#{tiles_wide}x1", row_path(y))
    end

    rows = (0..@max_y).map { |y| row_path(y) }
    run!("montage", *rows, "-geometry", "+0+0", "-tile", "1x#{tiles_high}", full_path)
  end

  def trim
    # Trim the black blocks that may appear on right and bottom.
    # We first add a black border to ensure no other color is trimmed, as described on
    # http://www.imagemagick.org/Usage/crop/#trim
    run!("convert", full_path, "-bordercolor", "black", "-border", "1x1", "-trim", full_path)
  end

  def set_metadata
    # 300 DPI instead of 72 DPI; more sane for printing.
    run!("convert", full_path, "-density", "300", full_path)

    if !windows? && !`which xattr`.empty?
      # Set "Downloaded from" Finder metadata, like Safari does.
      system('xattr', '-w', 'com.apple.metadata:kMDItemWhereFroms', @url, full_path)
    end
  end

  def done
    puts "Done: #{full_path}"

    # Reveal in Finder if on OS X.
    return if windows? || `which osascript`.empty?
    system("osascript",
           "-e", 'tell app "Finder"',
           "-e", %(reveal POSIX file "#{full_path}"),
           "-e", "activate",
           "-e", "end",
           :out => File::NULL)
  end

  # Raises GAPDownloader::RuntimeError with the page URL appended to message.
  def error(message)
    raise GAPDownloader::RuntimeError, "#{message} (#{@url})"
  end

  # Reads the body at url and returns it as a String, closing the underlying
  # handle. Uses URI.open where open-uri provides it and falls back to the
  # open-uri-patched Kernel.open on older Rubies.
  #
  # Raises OpenURI::HTTPError on HTTP error responses.
  def fetch(url)
    reader = lambda { |io| io.read }
    if URI.respond_to?(:open)
      URI.open(url, &reader)
    else
      Kernel.open(url, &reader)
    end
  end

  # Like #fetch, but returns nil when the server answers 404, which is how
  # the end of a tile row/column (or zoom range) is detected. Any other HTTP
  # error is re-raised.
  def fetch_tile(url)
    fetch(url)
  rescue OpenURI::HTTPError => e
    raise unless e.io.status.first == "404"
    nil
  end

  # Runs an external command without going through a shell, so file names are
  # never interpreted as shell syntax.
  #
  # Raises GAPDownloader::RuntimeError if the command cannot be started or
  # exits unsuccessfully.
  def run!(*command)
    return if system(*command)
    error "Command failed: #{command.first}"
  end

  def tile_url(x, y, zoom)
    # The subdomain can seemingly be anything from lh3 to lh6.
    "http://lh5.ggpht.com/#{@thumb_id}=x#{x}-y#{y}-z#{zoom}"
  end

  # Cache path of one tile. Includes the zoom level (as row_path does) so
  # tiles left over from a run at a different zoom are never reused.
  def tile_path(x, y)
    File.join(TEMP_DIRECTORY, "gap-#{@file_id}-tile-#{@max_zoom}-#{x}-#{y}.jpg")
  end

  def row_path(y)
    File.join(TEMP_DIRECTORY, "gap-#{@file_id}-row-#{@max_zoom}-#{y}.#{OUTPUT_EXTENSION}")
  end

  def full_path
    File.join(OUTPUT_DIRECTORY, "#{@file_id}.#{OUTPUT_EXTENSION}")
  end
end
# Picks the page URLs to download.
#
# argv      - Array of command-line arguments; used as-is when non-empty.
# list_text - String contents of the list after "__END__" (one URL per
#             line), or nil when the file has no such section.
#
# Returns an Array of URL Strings. Lines from list_text are stripped and
# blank lines are dropped, so stray empty lines don't turn into bogus URLs.
def gap_urls(argv, list_text)
  return argv unless argv.empty?
  list_text.to_s.split("\n").map { |line| line.strip }.reject { |line| line.empty? }
end

if __FILE__ == $0
  urls = gap_urls(ARGV, defined?(DATA) ? DATA.read : nil)
  puts "Error: No URLs given!" if urls.empty?

  urls.each do |url|
    begin
      GAPDownloader.new(url).download
    rescue GAPDownloader::RuntimeError => e
      puts "Error: #{e.message}"
    rescue OpenURI::HTTPError, SocketError, SystemCallError => e
      # A network failure on one URL (e.g. "403 Forbidden") should be reported
      # and must not abort the remaining downloads.
      puts "Error: #{e.message} (#{url})"
    end
  end
end
Hi Parkjisun,
So what exactly does your code do? How can I use it? Does it work on Google Art Project?
Actually Google has now a hmac hash appended to the URL. This HMAC is generated like this:
data: "[perma_id]=x[x pos]-y[y pos]-z[zoom level]-t[timestamp from xml]"
hash: hash_hmac('sha1', secret_key, data)
hash: hash.base64Encode.replace(/[+\/]/g, '_')
hash: hash.substr(0, 27)
url: image_url + "-t" + hash
Well, that's the easy part. To create the secret key they use an embedded Google PNG logo and compute some stuff from some of its pixel colors. I'm personally too busy/tired to rewrite the AS3 code in PHP/Ruby, but here it is in case someone's interested:
This function generates two byte arrays, which are used afterwards:
/**
 * Fills two byte arrays from pixels in column 0 of the embedded logo.
 *
 * arg1 receives 16 bytes taken from rows 1-6: all three shifted values
 * (>> 16, >> 8, unshifted) for rows 1-5, and only the >> 16 value for row 6.
 * arg2 receives 16 bytes taken from rows 6-11: the >> 8 and unshifted values
 * for row 6, all three for rows 7-10, and the >> 16 and >> 8 values for row 11.
 */
public static function initLogo(arg1:flash.utils.ByteArray, arg2:flash.utils.ByteArray):void
{
    var logo:* = new GoogleLogo(); // returns bitmapdata of embedded google logo
    var pixel:* = 0;
    var row:*;

    for (row = 1; row <= 6; ++row)
    {
        pixel = logo.bitmapData.getPixel(0, row);
        arg1.writeByte(pixel >> 16);
        if (row != 6)
        {
            arg1.writeByte(pixel >> 8);
            arg1.writeByte(pixel);
        }
    }

    for (row = 6; row <= 11; ++row)
    {
        pixel = logo.bitmapData.getPixel(0, row);
        if (row != 6)
        {
            arg2.writeByte(pixel >> 16);
        }
        arg2.writeByte(pixel >> 8);
        if (row != 11)
        {
            arg2.writeByte(pixel);
        }
    }
}
This function takes the two byte arrays and makes the secret key:
/**
 * Writes eight values into arg1[0..7]. Entry i is computed from index i + 8
 * of the two channel arrays as (logChannelA[i + 8] * 47) XOR logChannelB[i + 8].
 *
 * NOTE(review): logChannelA / logChannelB are presumably the two byte arrays
 * filled by initLogo - confirm against the rest of the class.
 */
public function getLogChannel(arg1:flash.utils.ByteArray):void
{
    for (var dst:int = 0; dst < 8; ++dst)
    {
        var src:int = dst + 8;
        arg1[dst] = (this.logChannelA[src] * 47) ^ this.logChannelB[src];
    }
}
The Google logo is here: http://i.kd2.org/i/34/6j9OHv5lP.google-logo.png
Actually, I tried to rewrite this in PHP but failed. I don't really use AS3, so ByteArray is a bit new to me, and maybe I'm missing something.
hello,
i am new to osx and the terminal. anyway i installed ruby, imagemagick and xcode. i still got an error message using this script:
/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:277:in open_http': 403 Forbidden (OpenURI::HTTPError) from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:616:in
buffer_open'
from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:164:in open_loop' from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:162:in
catch'
from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:162:in open_loop' from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:132:in
open_uri'
from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:518:in open' from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:30:in
open'
from google_art_project.rb:94:in determine_zoom' from google_art_project.rb:93:in
upto'
from google_art_project.rb:93:in determine_zoom' from google_art_project.rb:57:in
download'
from google_art_project.rb:206
from google_art_project.rb:204:in `each'
from google_art_project.rb:204
Can anyone help me please?
Thanks in advance
@Flapomat I'm afraid this script doesn't work any more due to changes on Google's side.
What a pity. Do you know any other possibility to get the pictures in high res?
Can sadly confirm, also got HTTP 403:
user@MACHINE: /Applications/GoogleArtProject.RB/gist812918-0f80614973e377e34f3355130b6b8ae678b418b8 $ ruby google_art_project.rb http://www.googleartproject.com/museums/tate/portrait-of-william-style-of-langley-174
/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:277:in open_http': 403 Forbidden (OpenURI::HTTPError) from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:616:in
buffer_open'
from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:164:in open_loop' from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:162:in
catch'
from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:162:in open_loop' from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:132:in
open_uri'
from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:518:in open' from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:30:in
open'
from google_art_project.rb:94:in determine_zoom' from google_art_project.rb:93:in
upto'
from google_art_project.rb:93:in determine_zoom' from google_art_project.rb:57:in
download'
from google_art_project.rb:206
from google_art_project.rb:204:in `each'
from google_art_project.rb:204
Hello, can anyone who has downloaded pictures from the Google Art Project share some of them? I would be very thankful, and I really need them. Many thanks.
need help !!!
Hello, im running a Windows 7 64bit on Intel Core 2 Duo 4GB Ram Memory.
After installing Ruby and ImageMagick I run the command prompt and keep getting:
"Error: Could't find an image at this URL, sorry!"
Do you think you can help me over this?
Best Regards,
Paulo.
GAP website has changed again, and is now fully in JS.
NOWADAYS THIS SCRIPT DOES NOT WORK FOR ME. WHAT IS THE REASON?
I AM IN WINDOWS XP AND HERE IS THE ERROR MESSAGE: COULDN`T FIND AND IMAGE AT THIS URL SORRY
Again, this script no longer works. Emelyanenko Kirill mailed me about a script that is supposed to work, though: https://github.com/EmelyanenkoK/GAPDownloader Haven't tried it myself.
Hi henrik,
I try your code on Mac Lion and I get error urllib2.HTTPError: HTTP Error 403: Forbidden:
'''felixmatoMacBook-Pro:EmelyanenkoK-GAPDownloader-41f7003 felix$ python extractionGoogleArtProject_Unix.py
(http://www.googleartproject.com/collection/moma-the-museum-of-modern-art/artwork/the-starry-night-vincent-van-gogh/320268/)
BZ/7D6G3mWJ+JNvVDnZV/jcxS+E=
(http://lh6.ggpht.com/JFp6OJr9g8KABxRwCpABRdfc0Od2SQsguNwtn0qhvOkxkeFYZhQGrg=x0-y0-z4-tBZ_7D6G3mWJ_JNvVDnZV_jcxS_E)
Traceback (most recent call last):
File "extractionGoogleArtProject_Unix.py", line 290, in
telechargerOeuvre(i[0], i[1], i[2])
File "extractionGoogleArtProject_Unix.py", line 255, in telechargerOeuvre
telechargerTableau(urlImage, normaliserNomFichier(nomFichierImage), zoom)
File "extractionGoogleArtProject_Unix.py", line 215, in telechargerTableau
telechargerTousFragments(urlImage, xMax, yMax, zoom)
File "extractionGoogleArtProject_Unix.py", line 138, in telechargerTousFragments
telechargerFragment(urlImage, cheminFragment, x, y, zoom)
File "extractionGoogleArtProject_Unix.py", line 122, in telechargerFragment
contenuFragment = getContenuUrl(urlFragment, refererGoogleArtProject)
File "extractionGoogleArtProject_Unix.py", line 32, in getContenuUrl
return urllib2.urlopen(requete).read()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 400, in open
response = meth(req, response)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 438, in error
return self._call_chain(_args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 372, in _call_chain
result = func(_args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 521, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
Hi henrik,
I try your code on Mac Lion and get error (urllib2.HTTPError: HTTP Error 403: Forbidden).
Detail:
felixmatoMacBook-Pro:EmelyanenkoK-GAPDownloader-41f7003 felix$ python extractionGoogleArtProject_Unix.py
(http://www.googleartproject.com/collection/moma-the-museum-of-modern-art/artwork/the-starry-night-vincent-van-gogh/320268/)
BZ/7D6G3mWJ+JNvVDnZV/jcxS+E=
(http://lh6.ggpht.com/JFp6OJr9g8KABxRwCpABRdfc0Od2SQsguNwtn0qhvOkxkeFYZhQGrg=x0-y0-z4-tBZ_7D6G3mWJ_JNvVDnZV_jcxS_E)
Traceback (most recent call last):
File "extractionGoogleArtProject_Unix.py", line 290, in
telechargerOeuvre(i[0], i[1], i[2])
File "extractionGoogleArtProject_Unix.py", line 255, in telechargerOeuvre
telechargerTableau(urlImage, normaliserNomFichier(nomFichierImage), zoom)
File "extractionGoogleArtProject_Unix.py", line 215, in telechargerTableau
telechargerTousFragments(urlImage, xMax, yMax, zoom)
File "extractionGoogleArtProject_Unix.py", line 138, in telechargerTousFragments
telechargerFragment(urlImage, cheminFragment, x, y, zoom)
File "extractionGoogleArtProject_Unix.py", line 122, in telechargerFragment
contenuFragment = getContenuUrl(urlFragment, refererGoogleArtProject)
File "extractionGoogleArtProject_Unix.py", line 32, in getContenuUrl
return urllib2.urlopen(requete).read()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 400, in open
response = meth(req, response)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 513, in http_response
'http', request, response, code, msg, hdrs)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 438, in error
return self._call_chain(_args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 372, in _call_chain
result = func(_args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 521, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
Hi henrik, I try your new code on Mac Lion and get urllib2.HTTPError: HTTP Error 403: Forbidden.
Someone suggested this solution: http://www.student.tugraz.at/kollmitzer/gap_howto.html
or try gigafineart.heroku.com
gigafineart.heroku.com works, but does anybody know how they do it?
I kind of made it work; see a demo: https://www.dropbox.com/s/gih93ye4v1y6lsu/the_starry_night.jpg?dl=0
I am currently getting black canvases when downloading, and have seen comments of others getting the same
It is not future proof, but it should be a bit faster than the torrent and has images at high resolution:
http://commons.wikimedia.org/wiki/Category:Google_Art_Project