Python scripts for downloading and organizing images from The Painting Dataset: https://www.robots.ox.ac.uk/~vgg/data/paintings/
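The gist does not list its dependencies. Judging from the imports and from Scrapy's image-pipeline requirements, something like the following should cover both scripts (Pillow is needed by ImagesPipeline, and an Excel engine such as openpyxl by pandas.read_excel; the exact package set is an assumption, not stated in the gist):

$ pip install scrapy pandas openpyxl Pillow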
paintings_crawl.py
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
#
# Download images from The Painting Dataset: https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx
# The image URLs in the Excel sheet are outdated but the painting page URLs are not,
# so this script re-crawls those pages and downloads the images locally.
# It works as of July 2020.
#
# Run this first with:
# $ scrapy runspider paintings_crawl.py -o paintings.json
# Images are stored in 'out/raw'.
#
# Then optionally organize the images into the appropriate folders with:
# $ python paintings_extract.py
import logging
import os
import urllib.request

import pandas as pd
import scrapy

THE_PAINTINGS_DATASET_URL = (
    "https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx"
)

logging.basicConfig(level=logging.INFO)

outdir = "out/raw"
os.makedirs(outdir, exist_ok=True)

# Fetch the Excel sheet once and cache it locally.
filename = "paintings.xlsx"
if not os.path.exists(filename):
    logging.info("Downloading paintings dataset Excel sheet.")
    urllib.request.urlretrieve(THE_PAINTINGS_DATASET_URL, filename)

df = pd.read_excel(filename)
image_urls = df["Web page URL"]

logging.info(f"Number of urls to crawl: {len(image_urls)}")


class PaintingsSpider(scrapy.Spider):
    name = "paintingsspider"
    custom_settings = {
        # Let Scrapy's built-in image pipeline download and store the images.
        "ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
        "IMAGES_STORE": outdir,
        "LOG_LEVEL": "INFO",
    }
    start_urls = list(image_urls)

    def parse(self, response):
        # Each painting page wraps its image in a div with class "single_img".
        for div in response.css("div.single_img"):
            image_src = div.css("img::attr(src)").extract_first()
            if image_src:
                logging.info(f"Found image {image_src}")
                # urljoin handles both absolute and relative src attributes.
                yield {
                    "image_urls": [response.urljoin(image_src)],
                    "url": response.request.url,
                }
paintings_extract.py
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
#
# Run the crawler first:
# $ scrapy runspider paintings_crawl.py -o paintings.json
#
# Then optionally run this file:
# $ python paintings_extract.py
# Images are stored in the 'out/organized' folder.
import json
import logging
import os
import shutil

import pandas as pd

logging.basicConfig(level=logging.INFO)

rawdir = "out/raw"
outdir = "out/organized"
os.makedirs(outdir, exist_ok=True)

df = pd.read_excel("paintings.xlsx")
# The sheet can contain multiple rows per page; keep the first so the
# page URL becomes a unique index for the lookups below.
df = df.groupby("Web page URL").first()

with open("paintings.json") as f:
    crawl_infos = json.load(f)

for crawl_info in crawl_infos:
    row = df.loc[crawl_info["url"]]
    # Labels appear as space-separated, quoted strings in the sheet;
    # strip the quotes and drop empty entries.
    labels = str(row["Labels"])
    labels = [l.strip().replace("'", "") for l in labels.split(" ")]
    labels = [l for l in labels if l]
    # Copy every downloaded image into one folder per label.
    for image in crawl_info["images"]:
        for label in labels:
            labeldir = os.path.join(outdir, label)
            os.makedirs(labeldir, exist_ok=True)
            shutil.copyfile(
                os.path.join(rawdir, image["path"]),
                os.path.join(labeldir, os.path.basename(image["path"])),
            )
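Not part of the gist, but a quick sanity check after both scripts have run: count how many images landed in each label folder under out/organized.

import os

outdir = "out/organized"
# One subfolder per label, each holding copies of the matching paintings.
for label in sorted(os.listdir(outdir)):
    n = len(os.listdir(os.path.join(outdir, label)))
    print(f"{label}: {n} images")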