Last active
September 19, 2017 23:03
-
-
Save yu-iskw/b501e94f805ae9e9dfe90ff72393b34c to your computer and use it in GitHub Desktop.
Python で大量の JPEG 画像を高速にダウンロードするスクリプト ref: http://qiita.com/yu-iskw/items/303c8ae3828f93bbd3fc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
usage: image-downloader.py [-h] --url-list URL_LIST_PATH --outputs OUTPUT_PATH

Options for image-downloader.py

optional arguments:
  -h, --help            show this help message and exit
  --url-list URL_LIST_PATH
                        path to a list of image URLs
  --outputs OUTPUT_PATH
                        path to a directory to store downloaded images
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import argparse | |
import imghdr | |
import concurrent.futures | |
from urllib.parse import urlparse | |
from urllib.request import urlretrieve | |
def get_parser():
    """Build the command line parser for image-downloader.py.

    Both options are mandatory string arguments:

    * ``--url-list`` is stored as ``url_list_path``
    * ``--outputs`` is stored as ``output_path``

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is
        responsible for invoking ``parse_args()`` on it.
    """
    cli = argparse.ArgumentParser(description='Options for image-downloader.py')

    # (flag, destination attribute, help text) for every required option.
    required_options = (
        ('--url-list', 'url_list_path', 'path to a list of image URLs'),
        ('--outputs', 'output_path',
         'path to a directory to store downloaded images'),
    )
    for flag, dest, help_text in required_options:
        cli.add_argument(flag, type=str, dest=dest, required=True, help=help_text)

    return cli
def is_jpeg(path):
    """Return True if the file at *path* begins with the JPEG signature.

    The check reads the first three bytes and compares them with the JPEG
    start-of-image marker (``FF D8``) followed by the ``FF`` that opens the
    next marker segment. It is done by hand instead of through ``imghdr``
    because that module is deprecated (PEP 594) and removed in Python 3.13,
    and because its JPEG test keys on the "JFIF"/"Exif" tag at offset 6.

    Args:
        path: Path of the file to inspect.

    Returns:
        True when the file starts with ``FF D8 FF``, False otherwise
        (including for files shorter than three bytes).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'rb') as image_file:
        return image_file.read(3) == b'\xff\xd8\xff'
def get_image(url, basepath):
    """Download *url* into *basepath* and report whether it is a JPEG.

    The file is saved under the last segment of the URL's path. After the
    download the saved file is checked with ``is_jpeg``.

    Args:
        url: URL of the image to fetch.
        basepath: Directory the downloaded file is written into.

    Returns:
        The path of the downloaded file if it is a JPEG, otherwise None.
        None is also returned when the URL has no file name or when the
        download fails; failures are printed rather than raised.
    """
    file_name = urlparse(url).path.rsplit('/', 1)[-1]
    if not file_name:
        # An empty URL or one ending in '/' has nothing to name the file
        # after; joining '' onto basepath would target the directory itself.
        print("%s has no file name, skipped" % (url))
        return None

    jpeg_file_path = None
    try:
        target_path = os.path.join(basepath, file_name)
        downloaded_file_path, _headers = urlretrieve(url, target_path)
        if is_jpeg(downloaded_file_path):
            print("%s is downloading" % (url))
            jpeg_file_path = downloaded_file_path
    except Exception as e:
        # Best effort: one failed download must not abort the others, so
        # report it (with the URL, to attribute the error) and return None.
        print("%s: %s" % (url, e))
    return jpeg_file_path
if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()

    # Create the output directory; exist_ok avoids the check-then-create race.
    os.makedirs(args.output_path, exist_ok=True)

    # Read one URL per line and fan the downloads out over a thread pool.
    # On exit the executor is shut down first (waiting for submitted
    # downloads to finish), then the URL list file is closed.
    with open(args.url_list_path, 'r') as url_file, \
            concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        for line in url_file:
            url = line.strip()
            if not url:
                # Skip blank lines instead of submitting an empty URL.
                continue
            executor.submit(get_image, url, args.output_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment