A script to search and scrape DeviantArt.com
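Example usage (a minimal sketch; the filename deviantart_scraper.py is only a placeholder for wherever you save the script, and the flags come from the argument parser below):

    python deviantart_scraper.py --search landscape painting --sort popular-1-week --pages 3 --output ./images

This would fetch up to three pages of results (roughly 20 images per page, matching the offset used in the URL template) for the terms "landscape painting", sorted by the last week's popularity, into ./images.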
"""A tool for scraping images from DeviantArt.com""" | |
import argparse | |
import logging | |
import multiprocessing | |
import os | |
import shutil | |
import re | |
import bs4 | |
import requests | |
import tqdm | |
# setup the logger for the console | |
logger = logging.getLogger(__name__) | |
logger.setLevel(logging.INFO) | |
ch = logging.StreamHandler() | |
ch.setLevel(logging.INFO) | |
formatter = logging.Formatter('%(asctime)-15s: %(message)s') | |
ch.setFormatter(formatter) | |
logger.addHandler(ch) | |
# the available sort methods from DeviantArt.com
SORT_METHODS = [
    'newest',
    'whats-hot',
    'undiscovered',
    'popular-24-hours',
    'popular-3-days',
    'popular-1-week',
    'popular-1-month',
    'popular-all-time',
]
# the template string for the search link.
# Args:
#     1: the sort method to use
#     2: the search terms separated by a "+"
#     3: the offset to search on (i.e., the ith image)
URL = 'https://www.deviantart.com/{}/?q={}&offset={}'
# the URL for using categories
URL_CAT = 'https://www.deviantart.com/{}/{}/?q={}&offset={}'
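# For example (illustrative values, not taken from the original script):
#     URL.format('newest', 'landscape+painting', 20) produces
#     'https://www.deviantart.com/newest/?q=landscape+painting&offset=20'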
# a regular expression for parsing image names from strings
IMAGE_RE = re.compile(r'([a-zA-Z0-9\-_]*\.(png|jpg))')
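# For example (illustrative URL, not a real DeviantArt link):
#     IMAGE_RE.findall('https://images.example.com/some_image-01.png')
#     returns [('some_image-01.png', 'png')]; taking [-1][0] yields the filename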
def get_args():
    """Parse command line arguments and return them."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--search',
        type=str,
        nargs='+',
        required=True,
        help='The search terms to use.'
    )
    parser.add_argument('--sort', '-s',
        type=str,
        default='newest',
        choices=SORT_METHODS,
        help='How to sort the search results on DeviantArt.com.'
    )
    parser.add_argument('--category', '-c',
        type=str,
        default=None,
        help='The category to search within.'
    )
    parser.add_argument('--pages', '-p',
        type=int,
        default=1,
        help='The number of pages of search results to scrape.'
    )
    parser.add_argument('--output', '-o',
        type=str,
        default='.',
        help='The output directory to store images in.'
    )
    return parser.parse_args()
def search_page(search: list, sort: str, category=None, page=0) -> list:
    """
    Get the images on a search page.

    Args:
        search: the search terms to use
        sort: the sort method to use
        category: the category to search in
        page: the page number to search

    Returns:
        a list of image URLs found on the given page

    """
    # create the link from the sort method and search terms
    if category is None:
        url = URL.format(sort, '+'.join(search), 20 * page)
    else:
        url = URL_CAT.format(category, sort, '+'.join(search), 20 * page)
    logger.info('searching for images on: "%s"' % url)
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    # each result thumbnail stores the full image URL in a data attribute
    thumbs = soup.find_all('span', {'class': 'thumb'})
    imgs = []
    for thumb in thumbs:
        if thumb.has_attr('data-super-full-img'):
            img = thumb['data-super-full-img']
        elif thumb.has_attr('data-super-img'):
            img = thumb['data-super-img']
        else:
            continue
        imgs.append(img)
    return imgs
def download_image(url: str, directory: str) -> str:
    """
    Download an image from the given URL and write it to the output directory.

    Args:
        url: the URL to download the image from
        directory: the directory to write the image to

    Returns:
        the output path of the written image file (parsed from the URL),
        or None if no filename could be parsed from the URL

    """
    # parse the filename and create an output path
    try:
        filename = IMAGE_RE.findall(url)[-1][0]
    except IndexError:
        return None
    output_path = os.path.join(directory, filename)
    # read the image from the URL (stream must be enabled)
    logger.info('downloading "%s" to "%s"' % (url, output_path))
    response = requests.get(url, stream=True)
    # write the bytes to disk
    with open(output_path, 'wb') as output_file:
        response.raw.decode_content = True
        shutil.copyfileobj(response.raw, output_file)
    return output_path
if __name__ == '__main__':
    args = get_args()
    # make the output directory if it doesn't exist
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # create a multiprocessing pool for downloading images in parallel
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # iterate over the number of pages to scrape
    progress = tqdm.tqdm(range(args.pages), unit='page')
    for page in progress:
        # collect the image links from the search page
        images = search_page(args.search, args.sort, args.category, page)
        if len(images) == 0:
            print('terminating early, search results empty!')
            break
        # download the images to disk in parallel
        work = zip(images, len(images) * [args.output])
        output_paths = pool.starmap(download_image, work)
    progress.close()
    pool.close()
    pool.join()