Last active
November 24, 2019 21:05
-
-
Save ma7555/276aae9de64a717d9ec25b74bf72c3d9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import time
import urllib.error
import urllib.request
from ast import literal_eval

import pandas as pd
import petpy
from tqdm import tqdm_notebook
def downloader(filename, image_url):
    """Fetch ``image_url`` and store it on disk as ``filename`` + ``'.jpg'``.

    Args:
        filename: Destination path without extension; ``'.jpg'`` is appended.
        image_url: URL passed straight to ``urllib.request.urlretrieve``.

    Raises:
        Any exception raised by ``urlretrieve`` is propagated unchanged, so
        the caller decides how to handle failed downloads.
    """
    full_file_name = filename + '.jpg'
    urllib.request.urlretrieve(image_url, full_file_name)
# Replace xx with API key and secret
pf = petpy.Petfinder(key='xx', secret='xx')
cats = pf.animals(results_per_page=100, pages=8000, return_df=True, animal_type='cat')

# Keep only cats with a single, known, non-mixed primary breed ...
is_pure_breed = (
    (~cats['breeds.mixed'])
    & (~cats['breeds.unknown'])
    & cats['breeds.secondary'].isna()
    & cats['breeds.primary'].notna()
)
# ... and at least one photo. `photos` is iterated as a list of dicts further
# down, so an explicit length check replaces the implicit truthiness cast of
# the raw column (missing values compare as False).
# NOTE(review): assumes each `photos` entry is a list - confirm against petpy.
has_photos = cats['photos'].str.len() > 0

wanted_columns = ['id', 'url', 'type', 'age', 'gender',
                  'size', 'coat', 'breeds.primary', 'photos']

# .copy() gives an independent frame, so the rename and column assignments
# below do not operate on a slice of `cats` (avoids SettingWithCopyWarning).
pure_cats_w_photos = cats.loc[is_pure_breed & has_photos, wanted_columns].copy()
pure_cats_w_photos = pure_cats_w_photos.rename(columns={'breeds.primary': 'breed'})

# Keep just the 'medium' URL of every photo.
pure_cats_w_photos['med_photos'] = pure_cats_w_photos.photos.apply(
    lambda photos: [photo['medium'] for photo in photos]
)
# Breed names are used as directory names later, so '/' must not appear in them.
pure_cats_w_photos['breed'] = pure_cats_w_photos.breed.str.replace('/', '-', regex=False)
pure_cats_w_photos.to_csv('cats.csv', index=False)

# To reuse a previously saved cats.csv instead of querying the API again:
# pure_cats_w_photos = pd.read_csv('cats.csv')
# pure_cats_w_photos['med_photos'] = pure_cats_w_photos.med_photos.apply(literal_eval)
# Create one output directory per breed. exist_ok=True makes re-runs safe and
# avoids the check-then-create race of testing os.path.exists() first.
for my_folder in pure_cats_w_photos.breed.unique():
    os.makedirs(my_folder, exist_ok=True)
# Base names of every .jpg file found anywhere below the current directory;
# used afterwards to skip cats whose images were fetched by an earlier run.
downloaded_cats = [
    filename
    for _, _, filenames in os.walk('.')
    for filename in filenames
    if filename.endswith('.jpg')
]
# IDs of cats that already have an image on disk. Image files are named
# "<id>_...", so the id is the text before the first underscore. Comparing
# whole ids (instead of searching the stringified file list for a substring)
# prevents e.g. id 12 from matching "3125_7.jpg".
downloaded_ids = {name.split('_', 1)[0] for name in downloaded_cats}

for ix, row in tqdm_notebook(pure_cats_w_photos.iterrows(), total=pure_cats_w_photos.shape[0]):
    if str(row.id) in downloaded_ids:
        continue
    for photo_no, photo_url in enumerate(row.med_photos):
        # The photo number keeps several photos of one cat from overwriting
        # each other; os.path.join keeps the path portable across platforms.
        target = os.path.join(
            row.breed,
            '{id}_{ix}_{n}'.format(id=row.id, ix=ix, n=photo_no),
        )
        try:
            downloader(target, photo_url)
        except urllib.error.HTTPError:
            # Best effort: skip this photo and pause briefly before the next one.
            time.sleep(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment