@dlebech
Created July 26, 2020 16:00
Python Script for downloading and organizing images from The Painting Dataset: https://www.robots.ox.ac.uk/~vgg/data/paintings/
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
#
# Download images from The Painting Dataset: https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx
# The image URLs in the Excel sheet are outdated, but the painting page URLs are not,
# so this script re-crawls those pages and downloads the images locally.
# It works as of July 2020.
#
# Run this first with:
# $ scrapy runspider paintings_crawl.py -o paintings.json
# Images are stored in 'out/raw'
#
# Then optionally create the appropriate folders with:
# $ python paintings_extract.py
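#
# Dependencies are not pinned in this gist; a typical setup (assumed, not verified here) is:
# $ pip install scrapy pillow pandas openpyxl
# (Pillow is required by scrapy's ImagesPipeline; openpyxl lets pandas read the .xlsx file.)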
import logging
import os
import urllib.request

import pandas as pd
import scrapy

THE_PAINTINGS_DATASET_URL = (
    "https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx"
)

logging.basicConfig(level=logging.INFO)

outdir = "out/raw"
os.makedirs(outdir, exist_ok=True)

# Download the Excel sheet with the painting metadata if it is not already present.
filename = "paintings.xlsx"
if not os.path.exists(filename):
    logging.info("Downloading paintings dataset Excel sheet.")
    urllib.request.urlretrieve(THE_PAINTINGS_DATASET_URL, filename)
df = pd.read_excel(filename)
image_urls = df["Web page URL"]
logging.info(f"Number of urls to crawl: {len(image_urls)}")
class PaintingsSpider(scrapy.Spider):
    """Crawl each painting page and hand any found image URL to the ImagesPipeline."""

    name = "paintingsspider"
    custom_settings = {
        "ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
        "IMAGES_STORE": outdir,
        "LOG_LEVEL": "INFO",
    }
    start_urls = list(image_urls)

    def parse(self, response):
        # Each painting page contains the image inside a div with class "single_img".
        for div in response.css("div.single_img"):
            image_src = div.css("img::attr(src)").extract_first()
            if image_src:
                logging.info(f"Found image {image_src}")
                yield {"image_urls": [image_src], "url": response.request.url}
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
#
# Run the crawler first:
# $ scrapy runspider paintings_crawl.py -o paintings.json
#
# Then optionally run this file:
# $ python paintings_extract.py
# Images are stored in the 'out/organized' folder.
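#
# Each record in paintings.json looks roughly like this (illustrative values;
# the "images" entries are added by scrapy's ImagesPipeline during the crawl):
# {
#   "url": "https://example.org/some-painting-page",
#   "image_urls": ["https://example.org/some-painting.jpg"],
#   "images": [{"url": "...", "path": "full/<sha1>.jpg", "checksum": "..."}]
# }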
import json
import logging
import os
import shutil
import pandas as pd
logging.basicConfig(level=logging.INFO)
rawdir = "out/raw"
outdir = "out/organized"
os.makedirs(outdir, exist_ok=True)
df = pd.read_excel("paintings.xlsx")
df = df.groupby("Web page URL").first()
with open("paintings.json") as f:
crawl_infos = json.load(f)
for crawl_info in crawl_infos:
row = df.loc[crawl_info["url"]]
labels = str(row["Labels"])
labels = [l.strip().replace("'", "") for l in labels.split(" ")]
labels = [l for l in labels if l]
for image in crawl_info["images"]:
for label in labels:
labeldir = os.path.join(outdir, label)
os.makedirs(labeldir, exist_ok=True)
shutil.copyfile(
os.path.join(rawdir, image["path"]),
os.path.join(labeldir, os.path.basename(image["path"])),
)
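
# Optional sanity check (not part of the original gist; a minimal sketch assuming the
# folders above were created): log how many images ended up under each label.
for label in sorted(os.listdir(outdir)):
    labeldir = os.path.join(outdir, label)
    if os.path.isdir(labeldir):
        logging.info(f"{label}: {len(os.listdir(labeldir))} images")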