Skip to content

Instantly share code, notes, and snippets.

@yu-iskw
Last active September 19, 2017 23:03
Show Gist options
  • Save yu-iskw/b501e94f805ae9e9dfe90ff72393b34c to your computer and use it in GitHub Desktop.
Save yu-iskw/b501e94f805ae9e9dfe90ff72393b34c to your computer and use it in GitHub Desktop.
Python で大量の JPEG 画像を高速にダウンロードするスクリプト ref: http://qiita.com/yu-iskw/items/303c8ae3828f93bbd3fc
usage: image-downloader.py [-h] --url-list URL_LIST_PATH --outputs OUTPUT_PATH
Options for image-downloader.py
optional arguments:
-h, --help show this help message and exit
--url-list URL_LIST_PATH
path to a list of image URLs
--outputs OUTPUT_PATH
path to a directory to store downloaded images
import os
import sys
import argparse
import imghdr
import concurrent.futures
from urllib.parse import urlparse
from urllib.request import urlretrieve
def get_parser():
    """Build the command line parser for the image downloader.

    Returns:
        argparse.ArgumentParser: a parser with two required string options,
        ``--url-list`` (stored as ``url_list_path``) and ``--outputs``
        (stored as ``output_path``).
    """
    cli = argparse.ArgumentParser(
        description='Options for image-downloader.py')

    # (flag, destination attribute, help text) for each required option.
    required_options = (
        ('--url-list', 'url_list_path', 'path to a list of image URLs'),
        ('--outputs', 'output_path',
         'path to a directory to store downloaded images'),
    )
    for flag, dest, help_text in required_options:
        cli.add_argument(
            flag, type=str, dest=dest, required=True, help=help_text)

    return cli
def is_jpeg(path):
    """Return True if the file at ``path`` starts like a JPEG stream.

    The header bytes are inspected directly instead of calling
    ``imghdr.what``: ``imghdr`` is deprecated since Python 3.11 and removed
    in 3.13, and its JPEG test keys on a JFIF/Exif tag, so it rejects valid
    JPEGs whose first segment is something else (for example an ICC
    profile).

    Args:
        path: Path of the file to inspect.

    Returns:
        bool: True when the file begins with the JPEG start-of-image marker
        (or carries the JFIF/Exif tag the previous check looked for),
        False otherwise, including for an empty file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'rb') as image_file:
        header = image_file.read(32)

    # Every JPEG stream opens with SOI (FF D8) followed by the FF byte that
    # introduces the next marker.
    if header.startswith(b'\xff\xd8\xff'):
        return True

    # Legacy imghdr rule, kept so that anything the previous check accepted
    # is still accepted.
    return header[6:10] in (b'JFIF', b'Exif')
def get_image(url, basepath):
    """Download ``url`` into ``basepath`` and report it if it is a JPEG.

    The local file name is the last segment of the URL path. The file is
    fetched with ``urlretrieve`` and then checked with ``is_jpeg``; a file
    that fails the check is left where it was saved.

    Args:
        url: URL of the image to fetch.
        basepath: Directory in which the downloaded file is stored.

    Returns:
        The path of the downloaded file when it is a JPEG, otherwise
        ``None`` (the URL has no file name, the download failed, or the
        file is not a JPEG).
    """
    file_name = urlparse(url).path.split('/')[-1]
    if not file_name:
        # A blank URL or one ending in "/" gives nothing to save under;
        # joining '' onto basepath would target the directory itself.
        print("%r has no file name in its path; skipped" % (url,))
        return None

    destination = os.path.join(basepath, file_name)
    try:
        downloaded_file_path, _headers = urlretrieve(url, destination)
        if is_jpeg(downloaded_file_path):
            print("%s is downloading" % (url))
            return downloaded_file_path
    except Exception as e:
        # Best-effort download: report the failure together with its URL so
        # it can be attributed, then fall through and return None.
        print("%s failed: %s" % (url, e))
    return None
if __name__ == '__main__':
    # Script entry point: read the URL list and fan the downloads out over a
    # thread pool, storing every file under the requested output directory.
    parser = get_parser()
    args = parser.parse_args()

    # exist_ok replaces the separate exists() check, which could race with
    # another process creating the same directory.
    os.makedirs(args.output_path, exist_ok=True)

    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        # The context manager closes the URL list once every line has been
        # submitted; leaving the executor block waits for the downloads.
        with open(args.url_list_path, 'r') as url_file:
            for line in url_file:
                url = line.strip()
                if not url:
                    # Blank lines carry no URL to fetch.
                    continue
                executor.submit(get_image, url, args.output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment