Created
May 2, 2019 09:58
-
-
Save gaiar/15e465d2a479accabd8194493bd3eae9 to your computer and use it in GitHub Desktop.
Download images for dataset using CSV as an input #birds-of-berlin
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
from google_images_download import google_images_download | |
import random | |
import csv | |
import os | |
import sys | |
import logging | |
logging.basicConfig( | |
filename='downloader.log', | |
level=logging.DEBUG, | |
format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s', | |
datefmt='%Y-%m-%d %H:%M:%S', | |
) | |
DOWNLOAD_FOLDER = "data" | |
STOP_WORDS = "" | |
STOP_WORDS_SHUFFLE = False | |
DOWNLOAD_VALIDATION = True | |
USE_FINETUNING = True | |
USE_FINETUNING_EXTRA = False | |
NUM_IMAGES = 100 | |
def prepare_proxies(): | |
try: | |
with open('proxies.lst', newline='') as proxies: | |
pl = proxies.read().splitlines() | |
if len(pl) > 1: | |
return pl | |
except Exception as err: | |
logging.error("Problem opening proxies file. {0}".format(err)) | |
return None | |
def prepare_stop_words(): | |
try: | |
with open('stop-words', newline='') as stopwords: | |
sw = stopwords.read().splitlines() | |
if STOP_WORDS_SHUFFLE: | |
random.shuffle(sw) | |
sw = sw[:29] | |
words = ' -'.join([str(x) for x in sw]) | |
words = "-" + words | |
return words | |
except Exception as err: | |
logging.error("Problem opening stop-words file. {0}".format(err)) | |
return None | |
STOP_WORDS = prepare_stop_words() | |
PROXIES = prepare_proxies() | |
PROXIES = None | |
if STOP_WORDS is None: | |
print("[ERROR] :: Please check stop-words file") | |
sys.exit() | |
logging.debug("STOP WORDS to be used {0}".format(STOP_WORDS)) | |
logging.debug("PROXIES to be used {0}".format(PROXIES)) | |
PROXIES = None | |
def download_from_google(): | |
# Prepare config for google_images_download | |
arguments = dict() | |
arguments["limit"] = NUM_IMAGES | |
arguments["print_urls"] = False | |
arguments["format"] = "jpg" | |
arguments["size"] = ">400*300" | |
arguments["color_type"] = "full-color" | |
arguments["output_directory"] = DOWNLOAD_FOLDER | |
arguments["chromedriver"] = "chromedriver" | |
# Extra fine-tuning with special words | |
if USE_FINETUNING: | |
arguments["suffix_keywords"] = "winter,sommer,wald" | |
if USE_FINETUNING_EXTRA: | |
arguments["language"] = "German" | |
arguments["usage_rights"] = "labeled-for-reuse" | |
with open('berlin-birds-extended.csv', newline='') as csvfile: | |
birdreader = csv.DictReader(csvfile, delimiter=',') | |
for row in birdreader: | |
response = google_images_download.googleimagesdownload() | |
if len(row["alt_name"]) > 1: | |
keywords = "{0} OR {1} OR {2} {3}".format( | |
row["name"], row["alt_name"], row["latin_name"], STOP_WORDS) | |
else: | |
keywords = "{0} OR {1} {2}".format( | |
row["name"], row["latin_name"], STOP_WORDS) | |
arguments["keywords"] = keywords | |
prefix = "{0}{1}".format( | |
row["name"].lower().replace(" ", "_"), "_") | |
arguments["prefix"] = prefix | |
arguments["image_directory"] = row["name"] | |
if PROXIES is not None: | |
arguments["proxy"] = random.choice(PROXIES).strip() | |
logging.info("Using proxy {0}".format(arguments["proxy"])) | |
print("Downloading {}".format(arguments)) | |
logging.info("Downloading {}".format(arguments)) | |
try: | |
response.download(arguments) | |
except Exception as error: | |
logging.error( | |
"Problem downloading {0}. {1}".format(arguments, error)) | |
response.download(arguments) | |
DATA_DIR = "{}/test".format(DOWNLOAD_FOLDER) | |
def get_directory(bird_name): | |
directory = os.path.join(DATA_DIR, bird_name) | |
try: | |
os.mkdir(directory) | |
except OSError: | |
print("[WARN] :: Directory {0} already exist".format(directory)) | |
logging.warning("Directory {0} already exist".format(directory)) | |
else: | |
print("[INFO] :: Directory {0} created".format(directory)) | |
logging.info("Directory {0} created".format(directory)) | |
finally: | |
return directory | |
def dowload_original_files(): | |
with open('berlin-birds-extended.csv', newline='') as csvfile: | |
birdreader = csv.DictReader(csvfile, delimiter=',') | |
for row in birdreader: | |
print("[INFO] :: Downloading {0}".format(row["image_url"])) | |
logging.info("Downloading {0}".format(row["image_url"])) | |
try: | |
r = requests.get(row["image_url"], allow_redirects=True) | |
if row["image_url"].find('/'): | |
filename = row["image_url"].rsplit('/', 1)[1] | |
filename = os.path.join(get_directory(row["name"]), filename) | |
print("[INFO] :: Writing {0}".format(filename)) | |
logging.info("Writing {0}".format(filename)) | |
with open(filename, 'wb') as image_file: | |
image_file.write(r.content) | |
except Exception as e: | |
print("[ERROR] :: Problem downloading {0}. {1}".format( | |
row["image_url"], e)) | |
logging.error("Problem downloading {0}. {1}".format( | |
row["image_url"], e)) | |
download_from_google() | |
if DOWNLOAD_VALIDATION: | |
dowload_original_files() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment