A script to search and scrape DeviantArt.com
"""A tool for scraping images from DeviantArt.com"""
import argparse
import logging
import multiprocessing
import os
import shutil
import re
import bs4
import requests
import tqdm
# setup the logger for the console
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)-15s: %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
# the available sort methods from DeviantArt.com
SORT_METHODS = [
    'newest',
    'whats-hot',
    'undiscovered',
    'popular-24-hours',
    'popular-3-days',
    'popular-1-week',
    'popular-1-month',
    'popular-all-time',
]
# the template string for the search link.
# Args:
# 1: the sort method to use
# 2: the search terms separated by a "+"
# 3: the offset to search on (i.e., the ith image)
URL = 'https://www.deviantart.com/{}/?q={}&offset={}'
# the URL for using categories
URL_CAT = 'https://www.deviantart.com/{}/{}/?q={}&offset={}'
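# For example (hypothetical query, not from the original gist), page 1 of a
# "newest" search for "landscape forest" would be built as:
#   URL.format('newest', 'landscape+forest', 20)
#   -> 'https://www.deviantart.com/newest/?q=landscape+forest&offset=20'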
# a regular expression for parsing image names from strings
IMAGE_RE = re.compile(r'([a-zA-Z0-9\-_]*\.(png|jpg))')
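# For example (hypothetical URL, not from the original gist), the last match of
# IMAGE_RE against 'https://images.example.com/some_image-01.png' is the tuple
# ('some_image-01.png', 'png'), so findall(...)[-1][0] yields the filename.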
def get_args():
    """Parse command line arguments and return them."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--search',
        type=str,
        nargs='+',
        required=True,
        help='The search terms to use.'
    )
    parser.add_argument('--sort', '-s',
        type=str,
        default='newest',
        choices=SORT_METHODS,
        help='How to sort the search results on DeviantArt.com.'
    )
    parser.add_argument('--category', '-c',
        type=str,
        default=None,
        help='The category to search within.'
    )
    parser.add_argument('--pages', '-p',
        type=int,
        default=1,
        help='The number of pages of search results to scrape.'
    )
    parser.add_argument('--output', '-o',
        type=str,
        default='.',
        help='The output directory to store images in.'
    )
    return parser.parse_args()
def search_page(search: list, sort: str, category=None, page=0) -> list:
    """
    Get the images on a search page.

    Args:
        search: the list of search terms to use
        sort: the sort method to use
        category: the category to search in
        page: the page number to search

    Returns:
        a list of full-size image URLs on the given page

    """
    # create the link from the sort method and search terms
    if category is None:
        url = URL.format(sort, '+'.join(search), 20 * page)
    else:
        url = URL_CAT.format(category, sort, '+'.join(search), 20 * page)
    logger.info('searching for images on: "%s"' % url)
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    # each search result thumbnail stores the full-size image URL in a data attribute
    thumbs = soup.find_all('span', {'class': 'thumb'})
    imgs = []
    for thumb in thumbs:
        if thumb.has_attr('data-super-full-img'):
            img = thumb['data-super-full-img']
        elif thumb.has_attr('data-super-img'):
            img = thumb['data-super-img']
        else:
            continue
        imgs.append(img)
    return imgs
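# For example (hypothetical call, not from the original gist),
# search_page(['landscape'], 'newest', page=1) requests
# https://www.deviantart.com/newest/?q=landscape&offset=20 and returns the
# full-size image URLs found in that page's thumbnails.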
def download_image(url: str, directory: str) -> str:
    """
    Download an image from the given URL and write it to the output directory.

    Args:
        url: the URL to download the image from
        directory: the directory to write the image to

    Returns:
        the output path of the written image file (parsed from the URL),
        or None if no filename could be parsed from the URL

    """
    # parse the filename from the URL and create an output path
    try:
        filename = IMAGE_RE.findall(url)[-1][0]
    except IndexError:
        return None
    output_path = os.path.join(directory, filename)
    # read the image from the URL (stream must be enabled)
    logger.info('downloading "%s" to "%s"' % (url, output_path))
    response = requests.get(url, stream=True)
    # write the bytes to disk
    with open(output_path, 'wb') as output_file:
        response.raw.decode_content = True
        shutil.copyfileobj(response.raw, output_file)
    return output_path
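# For example (hypothetical URL, not from the original gist),
# download_image('https://images.example.com/some_image-01.png', './images')
# writes the response bytes to './images/some_image-01.png' and returns that path.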
if __name__ == '__main__':
    args = get_args()
    # make the output directory if it doesn't exist
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # create a multiprocessing pool for downloading images in parallel
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # iterate over the number of pages to scrape
    progress = tqdm.tqdm(range(args.pages), unit='page')
    for page in progress:
        # collect the image links from the search page
        images = search_page(args.search, args.sort, args.category, page)
        if len(images) == 0:
            print('terminating early, search results empty!')
            break
        # download the images to disk in parallel
        work = zip(images, len(images) * [args.output])
        output_paths = pool.starmap(download_image, work)
    progress.close()
    pool.close()
    pool.join()
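# Example invocation (hypothetical; the gist does not name the script file,
# so 'deviantart_scrape.py' is assumed here):
#   python deviantart_scrape.py --search landscape forest --sort popular-1-week --pages 3 --output ./images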