Scrape full-sized images from vangoghmuseum.nl
""" | |
Scrape full sized images from vangoghmuseum.nl | |
---- requrires.txt ------------------------------------------------------------ | |
beautifulsoup4==4.6.0 | |
bs4==0.0.1 | |
certifi==2018.4.16 | |
chardet==3.0.4 | |
idna==2.7 | |
Pillow==5.2.0 | |
python-slugify==1.2.5 | |
requests==2.19.1 | |
Unidecode==1.0.22 | |
urllib3==1.23 | |
------------------------------------------------------------------------------- | |
""" | |
import re
import requests
from io import BytesIO
from PIL import Image
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from slugify import slugify

def download_img(id_val, file_name):
    response = requests.get('https://vangoghmuseum-assetserver.appspot.com/tiles?id=%s' % id_val)
    data = response.json()

    # levels[0] is the highest-resolution level; stitch its tiles into one image.
    img_tiles = data['levels'][0]['tiles']
    width = data['levels'][0]['width']
    height = data['levels'][0]['height']

    new_img = Image.new('RGB', (width, height))
    x_offset = 0
    y_offset = 0
    last_y = 0
    last_y_height = 0
    for tile in img_tiles:
        img_rsp = requests.get(tile['url'])
        img_tile = Image.open(BytesIO(img_rsp.content))
        if tile['y'] != last_y:
            # New row of tiles: move down by the height of the previous row
            # and start again at the left edge.
            last_y = tile['y']
            y_offset += last_y_height
            x_offset = 0
        new_img.paste(img_tile, (x_offset, y_offset))
        x_offset += img_tile.size[0]
        last_y_height = img_tile.size[1]
    new_img.save(file_name)

def scrape_urls(num):
    '''
    Change the URL below to limit the scrape to particular artists or other
    filters on the search page, e.g.
    https://www.vangoghmuseum.nl/en/search/collection?q=&artist=Vincent%20van%20Gogh&pagesize=
    to get just van Gogh's own work.
    '''
    resp = requests.get("https://www.vangoghmuseum.nl/en/search/collection?q=&pagesize=" + str(num))

    # Work out the page encoding from the HTTP headers or the HTML itself.
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=encoding)

    # Collect the relative link of every collection page in the search results.
    urls = []
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('/en/collection/'):
            urls.append(link['href'])
    # Visit each work's page, then save its description text and full-sized image.
    for url in urls:
        resp = requests.get('https://www.vangoghmuseum.nl' + url)
        http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding=encoding)

        title = slugify(soup.find('a', attrs={'name': 'info'}).contents[0])
        data_id = soup.find(attrs={'data-id': re.compile(r"\d+")})['data-id']
        info = soup.find('div', attrs={'data-role': 'info'}).getText()

        # Clean up the info text.
        info = re.sub(r'\n{3,}', '\n', info)
        info = re.sub(r'Search in the collection:.*(?=\nObject data)', '', info, flags=re.DOTALL)
        with open(title + '.txt', 'wb') as f:
            f.write(info.encode('utf8'))

        print('Downloading: %s' % title)
        download_img(data_id, title + '.jpg')

if __name__ == '__main__':
    # As of 2018-07-10 there are 1744 works, so a page size of 2000 should get them all.
    scrape_urls(2000)
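
The docstring in scrape_urls shows how the search URL can be narrowed to a single artist. A minimal sketch of building that filtered URL, assuming only the q, artist and pagesize query parameters shown in the docstring (build_search_url itself is an illustration, not part of the original script):

from urllib.parse import urlencode, quote

def build_search_url(pagesize, artist=None):
    # Hypothetical helper: reproduces the search URL used by scrape_urls and
    # optionally adds the artist filter mentioned in the docstring above.
    params = {'q': '', 'pagesize': pagesize}
    if artist:
        params['artist'] = artist
    # quote_via=quote encodes spaces as %20, matching the docstring example.
    return 'https://www.vangoghmuseum.nl/en/search/collection?' + urlencode(params, quote_via=quote)

# Example:
#   build_search_url(2000, artist='Vincent van Gogh')
#   -> https://www.vangoghmuseum.nl/en/search/collection?q=&pagesize=2000&artist=Vincent%20van%20Gogh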
You don't get all of the images, because the titles used as file names are not unique; for example, there are 14 paintings named "woman", so each download overwrites the previous one.
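
One way around the collisions (a sketch, assuming the data-id shown on each work's page is unique) is to append that id, which scrape_urls already extracts, to the slugified title before saving:

from slugify import slugify

def unique_name(title, data_id):
    # Hypothetical helper: combine the slugified title with the museum's
    # data-id so that, e.g., fourteen works titled "woman" get distinct names.
    return '%s-%s' % (slugify(title), data_id)

# Inside scrape_urls the two writes would then use, for example:
#   open(unique_name(title, data_id) + '.txt', 'wb')
#   download_img(data_id, unique_name(title, data_id) + '.jpg')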
It no longer works. I couldn't run the script successfully.
Saves both images and painting info.