Last active
May 11, 2020 21:00
-
-
Save rishipr/498a3757e9f93efc9bf9f3bf731896b9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from PIL import Image | |
import requests | |
import shutil | |
import urllib.request | |
from scraper import BREEDS | |
# Number of images saved so far for the breed currently being processed.
# main() resets it for every breed and download_image() increments it after
# each image that is saved and passes validation; it also forms the numeric
# suffix of the output file name.
num_downloaded = 0
def main(max_per_breed=300):
    """Download up to ``max_per_breed`` valid JPEG images for every breed.

    For each name in ``BREEDS`` the URL list is read from
    ``data/<breed with spaces replaced by underscores>.txt`` (one URL per
    line). Every line ending in ``.jpg`` is handed to ``download_image``
    until the module-level ``num_downloaded`` counter reaches the limit.

    Args:
        max_per_breed: Maximum number of images to save per breed.
            Defaults to 300.

    Raises:
        OSError: If a breed's URL list file cannot be opened.
    """
    global num_downloaded

    for breed in BREEDS:
        # The counter is per breed; download_image() increments it.
        num_downloaded = 0
        print(f"Downloading images for {breed} breed...")

        list_path = f"data/{breed.replace(' ', '_')}.txt"
        # Output file names use the breed name with spaces removed.
        file_prefix = breed.replace(' ', '')

        with open(list_path, 'r') as url_file:
            for line in url_file:
                if num_downloaded >= max_per_breed:
                    break

                url = line.strip()

                # Skip if not an image file. Only the comparison is
                # case-insensitive: the URL itself is left untouched because
                # URL paths are case-sensitive.
                if not url.lower().endswith('.jpg'):
                    continue

                try:
                    download_image(file_prefix, url)
                except Exception:
                    # Best effort: a dead or invalid URL must not stop the
                    # run. KeyboardInterrupt/SystemExit still propagate.
                    continue

        print("\n")
def download_image(breed, url):
    """Fetch ``url`` and save it as ``images/<breed>-<num_downloaded>.jpg``.

    The saved file is checked with ``check_validity``. A file that fails the
    check is deleted and the function returns without counting it. On
    success the module-level ``num_downloaded`` counter is incremented.

    Args:
        breed: Breed label used as the output file name prefix.
        url: Address of the image to download.

    Raises:
        Exception: With message ``'Bad url...'`` when the request fails or
            the response status is not 200.
    """
    global num_downloaded

    # Check url
    try:
        response = requests.get(url, timeout=1)
        response.raise_for_status()
    except requests.RequestException as err:
        # Connection errors, timeouts and 4xx/5xx responses all surface as
        # the same "bad url" failure.
        raise Exception('Bad url...') from err

    if response.status_code != 200:
        raise Exception('Bad url...')

    # Save image. The body fetched above is written directly, so the URL is
    # not requested a second time (and cannot hang without a timeout).
    output_name = f"images/{breed}-{num_downloaded}.jpg"
    with open(output_name, 'wb') as out_file:
        out_file.write(response.content)

    # Check to see if image is valid
    if not check_validity(output_name):
        # If corrupted image, delete image and exit function
        if os.path.exists(output_name):
            os.remove(output_name)
        return

    print(f"Saved {output_name}")
    num_downloaded += 1
def check_validity(img_name):
    """Return True if the file at ``img_name`` passes Pillow's verification.

    Args:
        img_name: Path of the image file to check.

    Returns:
        True when the file opens and ``verify()`` raises nothing; False when
        opening or verifying raises ``OSError`` or ``SyntaxError``.
    """
    try:
        # The context manager closes the file handle even if verify()
        # raises, so the caller can delete a rejected file afterwards.
        with Image.open(img_name) as img:
            img.verify()
    except (OSError, SyntaxError):
        return False
    return True
if __name__ == '__main__':
    # Start every run from an empty ./images directory. rmtree ignores
    # errors (including a missing directory), and makedirs(exist_ok=True)
    # tolerates a directory that could not be fully removed.
    shutil.rmtree('./images', ignore_errors=True)
    os.makedirs('./images', exist_ok=True)
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment