Created
March 10, 2018 19:57
-
-
Save andylshort/2fa21a204bd8070d84c5f33e80882cca to your computer and use it in GitHub Desktop.
Small scripts to scrape the beautiful artwork of Simon Stålenhag from his website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
# Page that lists the artwork, and the pattern that captures the base name
# (without extension) of every full-size image linked from it.
url = "http://www.simonstalenhag.se/"
image_regex = "href=\"bilderbig/([^\\.]*_\\d\\d\\d\\d)\\.jpg\""


def find_image_names(html):
    """Return the sorted, de-duplicated image base names linked from *html*.

    *html* is the page source as text. A name is whatever ``image_regex``
    captures between ``bilderbig/`` and ``.jpg``.
    """
    return sorted(set(re.findall(image_regex, html)))


def image_url_for(name):
    """Return the absolute URL of the full-size image called *name*."""
    return urllib.parse.urljoin(url, "bilderbig/" + name + ".jpg")


def destination_path(dest_folder, name):
    """Return the local file path for image *name* inside *dest_folder*.

    Uses ``os.path.join`` so the file lands inside the folder whether or not
    the folder argument was given with a trailing separator.
    """
    return os.path.join(dest_folder, name + ".jpg")


def fetch_page_text(page_url):
    """Download *page_url* and return its body decoded to text.

    The charset announced in the response headers is used when present,
    otherwise UTF-8; undecodable bytes are replaced rather than raising.
    """
    with urllib.request.urlopen(page_url) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset, errors="replace")


def download_image(image_url, destination):
    """Download *image_url* and write its bytes to the file *destination*.

    The body is read completely before the file is opened, so a failed
    transfer does not leave an empty file behind.
    """
    with urllib.request.urlopen(image_url) as response:
        data = response.read()
    with open(destination, "wb") as out_file:
        out_file.write(data)


def main(argv):
    """Scrape the artwork page and save every large image to ``argv[1]``.

    Prints a message and exits with status -1 when the destination argument
    is missing or is not an existing directory. A failure to download one
    image is reported and the remaining images are still attempted.
    """
    if len(argv) != 2:
        print("Please specify destination directory as an argument")
        sys.exit(-1)

    dest_folder = argv[1]
    if not os.path.isdir(dest_folder):
        print("Destination is not a folder")
        sys.exit(-1)

    # Download the webpage and scrape it for image names
    names = find_image_names(fetch_page_text(url))

    # Download all large versions of images
    for name in names:
        image_url = image_url_for(name)
        print(image_url)
        try:
            download_image(image_url, destination_path(dest_folder, name))
        except urllib.error.URLError:
            # URLError also covers HTTPError, so connection-level failures
            # skip this image instead of aborting the whole run.
            print("Could not download " + image_url)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment