A script to search and scrape DeviantArt.com
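Example usage (a minimal sketch; the filename deviantart_scraper.py is only a placeholder for wherever you save the script, and the flags come from the argument parser below):

    python deviantart_scraper.py --search landscape painting --sort popular-1-week --pages 3 --output ./images

This would fetch up to three pages of results (roughly 20 images per page, matching the offset used in the URL template) for the terms "landscape painting", sorted by the last week's popularity, into ./images.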
"""A tool for scraping images from DeviantArt.com""" | |
import argparse | |
import logging | |
import multiprocessing | |
import os | |
import shutil | |
import re | |
import bs4 | |
import requests | |
import tqdm | |
# setup the logger for the console | |
logger = logging.getLogger(__name__) | |
logger.setLevel(logging.INFO) | |
ch = logging.StreamHandler() | |
ch.setLevel(logging.INFO) | |
formatter = logging.Formatter('%(asctime)-15s: %(message)s') | |
ch.setFormatter(formatter) | |
logger.addHandler(ch) | |
# the available sort methods from DeviantArt.com
SORT_METHODS = [
    'newest',
    'whats-hot',
    'undiscovered',
    'popular-24-hours',
    'popular-3-days',
    'popular-1-week',
    'popular-1-month',
    'popular-all-time',
]
# the template string for the search link.
# Args:
#     1: the sort method to use
#     2: the search terms separated by a "+"
#     3: the offset to search on (i.e., the ith image)
URL = 'https://www.deviantart.com/{}/?q={}&offset={}'
# the URL for using categories
URL_CAT = 'https://www.deviantart.com/{}/{}/?q={}&offset={}'
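# For example (illustrative values, not taken from the original script):
#     URL.format('newest', 'landscape+painting', 20) produces
#     'https://www.deviantart.com/newest/?q=landscape+painting&offset=20'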
# a regular expression for parsing image names from strings
IMAGE_RE = re.compile(r'([a-zA-Z0-9\-_]*\.(png|jpg))')
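# For example (illustrative URL, not a real DeviantArt link):
#     IMAGE_RE.findall('https://images.example.com/some_image-01.png')
#     returns [('some_image-01.png', 'png')]; taking [-1][0] yields the filename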
def get_args():
    """Parse command line arguments and return them."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--search',
        type=str,
        nargs='+',
        required=True,
        help='The search terms to use.'
    )
    parser.add_argument('--sort', '-s',
        type=str,
        default='newest',
        choices=SORT_METHODS,
        help='How to sort the search results on DeviantArt.com.'
    )
    parser.add_argument('--category', '-c',
        type=str,
        default=None,
        help='The category to search within.'
    )
    parser.add_argument('--pages', '-p',
        type=int,
        default=1,
        help='The number of pages of search results to scrape.'
    )
    parser.add_argument('--output', '-o',
        type=str,
        default='.',
        help='The output directory to store images in.'
    )
    return parser.parse_args()
def search_page(search: list, sort: str, category=None, page=0) -> list:
    """
    Get the images on a search page.

    Args:
        search: the search terms to use
        sort: the sort method to use
        category: the category to search in
        page: the page number to search

    Returns:
        a list of image URLs found on the given page

    """
    # create the link from the sort method and search terms
    if category is None:
        url = URL.format(sort, '+'.join(search), 20 * page)
    else:
        url = URL_CAT.format(category, sort, '+'.join(search), 20 * page)
    logger.info('searching for images on: "%s"' % url)
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    # each result thumbnail stores the full image URL in a data attribute
    thumbs = soup.find_all('span', {'class': 'thumb'})
    imgs = []
    for thumb in thumbs:
        if thumb.has_attr('data-super-full-img'):
            img = thumb['data-super-full-img']
        elif thumb.has_attr('data-super-img'):
            img = thumb['data-super-img']
        else:
            continue
        imgs.append(img)
    return imgs
def download_image(url: str, directory: str) -> str:
    """
    Download an image from the given URL and write it to the output directory.

    Args:
        url: the URL to download the image from
        directory: the directory to write the image to

    Returns:
        the output path of the written image file (parsed from the URL),
        or None if no filename could be parsed from the URL

    """
    # parse the filename and create an output path
    try:
        filename = IMAGE_RE.findall(url)[-1][0]
    except IndexError:
        return None
    output_path = os.path.join(directory, filename)
    # read the image from the URL (stream must be enabled)
    logger.info('downloading "%s" to "%s"' % (url, output_path))
    response = requests.get(url, stream=True)
    # write the bytes to disk
    with open(output_path, 'wb') as output_file:
        response.raw.decode_content = True
        shutil.copyfileobj(response.raw, output_file)
    return output_path
if __name__ == '__main__':
    args = get_args()
    # make the output directory if it doesn't exist
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # create a multiprocessing pool for downloading images in parallel
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # iterate over the number of pages to scrape
    progress = tqdm.tqdm(range(args.pages), unit='page')
    for page in progress:
        # collect the image links from the search page
        images = search_page(args.search, args.sort, args.category, page)
        if len(images) == 0:
            print('terminating early, search results empty!')
            break
        # download the images to disk in parallel
        work = zip(images, len(images) * [args.output])
        output_paths = pool.starmap(download_image, work)
    progress.close()
    pool.close()
    pool.join()