Skip to content

Instantly share code, notes, and snippets.

@isaacgr
Created June 2, 2019 21:23
Show Gist options
  • Save isaacgr/433e8f6ee6fc1833fdc5dab97fb0c8bf to your computer and use it in GitHub Desktop.
Save isaacgr/433e8f6ee6fc1833fdc5dab97fb0c8bf to your computer and use it in GitHub Desktop.
Script to download images from https://www.artic.edu/collection
import urlparse
import urllib2
import os
import sys
from bs4 import BeautifulSoup
import requests
import shutil
# Download .jpg images referenced by <img data-srcset="..."> tags on the
# Art Institute of Chicago public-domain listing pages.
#
# Flow: ask for a target directory and a page count, fetch each listing page,
# pull the image URL out of every tagged <img>, and save the images as
# 0.jpg, 1.jpg, ... in the target directory.

# Listing endpoint; "&page=N" is appended for each request below.
url = "https://www.artic.edu/collection/more?is_public_domain=1"
download_path = raw_input("[+] Enter the full download path: ")
pages = raw_input("[+] Enter the number of pages to fetch: ")

# The answer is treated as a directory: create it up front so that every
# download does not fail later just because the folder is missing.
if download_path and not os.path.isdir(download_path):
    os.makedirs(download_path)

i = 0  # number of images saved so far; doubles as the output file name

# NOTE(review): paging starts at 2, so entering N fetches pages 2..N.
# Presumably page 1 is served by a different endpoint -- confirm.
for page in range(2, int(pages) + 1):
    try:
        print('[*] Requesting images')
        r = requests.get(url + '&page=%s' % page)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'lxml')

        for tag in soup.find_all('img'):
            # Images without the attribute are simply not candidates; they
            # are skipped quietly rather than reported as failed downloads.
            srcset = tag.attrs.get('data-srcset')
            if not srcset:
                continue

            # Strip backslashes and double quotes from the attribute value,
            # then swap "200" for "1000".
            # NOTE(review): presumably this requests a 1000px rendition
            # instead of the 200px one, but it rewrites *every* "200" in
            # the URL -- TODO confirm this cannot corrupt an image id.
            image_url = srcset.replace('\\', '').replace('"', '').replace('200', '1000')
            image_name = os.path.basename(image_url)
            if os.path.splitext(image_name)[1] != '.jpg':
                continue

            try:
                current = requests.get(image_url, stream=True)
                try:
                    # Do not write an HTTP error body to disk as a .jpg.
                    current.raise_for_status()
                    print("\n[*] Downloading: %s" % image_name)
                    # Let urllib3 undo gzip/deflate while streaming.
                    current.raw.decode_content = True
                    target = os.path.join(download_path, str(i) + '.jpg')
                    with open(target, 'wb') as f:
                        shutil.copyfileobj(current.raw, f)
                finally:
                    # Streamed responses hold their connection until closed.
                    current.close()
                i += 1
            except Exception:
                # Best effort per image. KeyboardInterrupt is not an
                # Exception subclass, so Ctrl-C still reaches the handler
                # on the page loop below.
                print('[*] Could not download image')

        print("\n[*] Downloaded %d files" % (i))
    except KeyboardInterrupt:
        print("[*] Exiting")
        break  # stop fetching further pages
    except SyntaxError:
        print("[*] Fix your code stupid")
        sys.exit(1)
    except Exception as e:
        # Bind the exception so the message can actually be formatted.
        print("[*] Could not get information from server %s" % e)

raw_input('[+] Press any key to exit')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment