Python scripts for downloading and organizing images from The Painting Dataset: https://www.robots.ox.ac.uk/~vgg/data/paintings/
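The gist does not list its dependencies. Judging from the imports and from Scrapy's image-pipeline requirements, something like the following should cover both scripts (Pillow is needed by ImagesPipeline, and an Excel engine such as openpyxl by pandas.read_excel; the exact package set is an assumption, not stated in the gist):

$ pip install scrapy pandas openpyxl Pillow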
paintings_crawl.py
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
#
# Download images from The Painting Dataset: https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx
# The image URLs in the Excel sheet are outdated but the painting page URLs are not,
# so this script re-crawls those pages and downloads the images locally.
# It works as of July 2020.
#
# Run this first with:
# $ scrapy runspider paintings_crawl.py -o paintings.json
# Images are stored in 'out/raw'.
#
# Then optionally organize the images into the appropriate folders with:
# $ python paintings_extract.py
import logging
import os
import urllib.request

import pandas as pd
import scrapy

THE_PAINTINGS_DATASET_URL = (
    "https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx"
)

logging.basicConfig(level=logging.INFO)

outdir = "out/raw"
os.makedirs(outdir, exist_ok=True)

# Fetch the Excel sheet once and cache it locally.
filename = "paintings.xlsx"
if not os.path.exists(filename):
    logging.info("Downloading paintings dataset Excel sheet.")
    urllib.request.urlretrieve(THE_PAINTINGS_DATASET_URL, filename)

df = pd.read_excel(filename)
image_urls = df["Web page URL"]

logging.info(f"Number of urls to crawl: {len(image_urls)}")


class PaintingsSpider(scrapy.Spider):
    name = "paintingsspider"
    custom_settings = {
        # Let Scrapy's built-in image pipeline download and store the images.
        "ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
        "IMAGES_STORE": outdir,
        "LOG_LEVEL": "INFO",
    }
    start_urls = list(image_urls)

    def parse(self, response):
        # Each painting page wraps its image in a div with class "single_img".
        for div in response.css("div.single_img"):
            image_src = div.css("img::attr(src)").extract_first()
            if image_src:
                logging.info(f"Found image {image_src}")
                # urljoin handles both absolute and relative src attributes.
                yield {
                    "image_urls": [response.urljoin(image_src)],
                    "url": response.request.url,
                }
paintings_extract.py
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
#
# Run the crawler first:
# $ scrapy runspider paintings_crawl.py -o paintings.json
#
# Then optionally run this file:
# $ python paintings_extract.py
# Images are stored in the 'out/organized' folder.
import json
import logging
import os
import shutil

import pandas as pd

logging.basicConfig(level=logging.INFO)

rawdir = "out/raw"
outdir = "out/organized"
os.makedirs(outdir, exist_ok=True)

df = pd.read_excel("paintings.xlsx")
# The sheet can contain multiple rows per page; keep the first so the
# page URL becomes a unique index for the lookups below.
df = df.groupby("Web page URL").first()

with open("paintings.json") as f:
    crawl_infos = json.load(f)

for crawl_info in crawl_infos:
    row = df.loc[crawl_info["url"]]
    # Labels appear as space-separated, quoted strings in the sheet;
    # strip the quotes and drop empty entries.
    labels = str(row["Labels"])
    labels = [l.strip().replace("'", "") for l in labels.split(" ")]
    labels = [l for l in labels if l]
    # Copy every downloaded image into one folder per label.
    for image in crawl_info["images"]:
        for label in labels:
            labeldir = os.path.join(outdir, label)
            os.makedirs(labeldir, exist_ok=True)
            shutil.copyfile(
                os.path.join(rawdir, image["path"]),
                os.path.join(labeldir, os.path.basename(image["path"])),
            )
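Not part of the gist, but a quick sanity check after both scripts have run: count how many images landed in each label folder under out/organized.

import os

outdir = "out/organized"
# One subfolder per label, each holding copies of the matching paintings.
for label in sorted(os.listdir(outdir)):
    n = len(os.listdir(os.path.join(outdir, label)))
    print(f"{label}: {n} images")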