Last active
September 27, 2020 08:13
-
-
Save jianjieluo/7b5f07992fe408a6201a1a90d25dc8ce to your computer and use it in GitHub Desktop.
Download images or gifs given urls
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import csv | |
import argparse | |
import numpy as np | |
import urllib.request | |
#from urllib.request import urlopen | |
import socket | |
socket.setdefaulttimeout(10) | |
from urllib.parse import quote | |
import urllib.request, urllib.error | |
import requests | |
from tqdm import tqdm | |
# Root directory for downloads; main() joins it with the split name derived
# from the URL-list file name to form the per-split output directory.
OUTPUT_ROOT = 'gifs'
def parse_args():
    """Parse the command line and return the resulting namespace.

    Recognised options (all optional):
        --url    str, default 'data/splits/val.txt'
        --begin  int, default 0
        --end    int, default 80000

    Returns:
        argparse.Namespace with attributes ``url``, ``begin`` and ``end``.
    """
    # (flag, default, type) for every supported option.
    option_specs = (
        ('--url', 'data/splits/val.txt', str),
        ('--begin', 0, int),
        ('--end', 80000, int),
    )

    cli = argparse.ArgumentParser(description='Arg parser')
    for flag, fallback, caster in option_specs:
        cli.add_argument(flag, default=fallback, type=caster)
    return cli.parse_args()
def _discard_partial(path):
    """Delete *path* if it exists; a missing file is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def main():
    """Download the URLs listed in the ``--url`` file, one file per line.

    Each line of the list file is treated as one URL. Lines whose zero-based
    index lies in ``[--begin, --end]`` (both inclusive) are fetched and stored
    as ``OUTPUT_ROOT/<split>/<split>_<index>.<ext>``, where ``<split>`` is the
    list file's base name up to its first dot and ``<ext>`` is ``gif`` or
    ``gifv``. Files that already exist are skipped, so a run can be resumed.

    Failures are reported on stdout. HTTP errors are additionally recorded in
    ``bad_gid_HTTP_<begin>.txt`` (id and status code, tab separated) and URL
    errors in ``bad_gid_URL_<begin>.txt`` (id only); both files are rewritten
    on every run.
    """
    args = parse_args()
    print(args)

    # Install a browser-like User-Agent for every later urllib request
    # (presumably because some hosts refuse urllib's default one).
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
    urllib.request.install_opener(opener)

    with open(args.url) as fid:
        lines = [line.strip() for line in fid]

    split = os.path.basename(args.url).split('.')[0]
    output_dir = os.path.join(OUTPUT_ROOT, split)
    os.makedirs(output_dir, exist_ok=True)

    # Clamp the inclusive [begin, end] window to the indices that really
    # exist so the progress bar total matches the number of iterations.
    first = max(args.begin, 0)
    last = min(args.end, len(lines) - 1)
    total_len = max(last - first + 1, 0)

    http_log_path = 'bad_gid_HTTP_' + str(args.begin) + '.txt'
    url_log_path = 'bad_gid_URL_' + str(args.begin) + '.txt'

    # The context managers guarantee the failure logs are flushed and closed
    # even when the run is interrupted.
    with open(http_log_path, 'w') as fid1, \
            open(url_log_path, 'w') as fid2, \
            tqdm(total=total_len, ascii=True) as pbar:
        for i in range(first, last + 1):
            pbar.update(1)
            gid = '%s_%d' % (split, i)
            gif_url = lines[i]

            url_ext = gif_url.split('.')[-1]
            ext = url_ext if url_ext in ('gif', 'gifv') else 'gif'
            out_path = os.path.join(output_dir, gid + '.' + ext)
            if os.path.exists(out_path):
                continue

            # Download to a side file and rename on success: a truncated
            # download must never be mistaken for a finished one by the
            # exists() check above on a later run.
            tmp_path = out_path + '.part'
            try:
                urllib.request.urlretrieve(quote(gif_url, safe=':/=&?'), tmp_path)
            except urllib.error.HTTPError as e:
                print('HTTPError: {}'.format(e.code))
                fid1.write(gid + '\t' + str(e.code) + '\n')
            except urllib.error.URLError as e:
                print('URLError: {}'.format(e.reason))
                fid2.write(gid + '\n')
            except Exception:
                # Best effort: any other failure (e.g. a socket timeout while
                # reading) is reported and skipped. KeyboardInterrupt and
                # SystemExit are deliberately not caught here.
                print('Can not download the ' + gid + ' gif')
            else:
                os.replace(tmp_path, out_path)
            finally:
                _discard_partial(tmp_path)

    print('finish')
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment