Skip to content

Instantly share code, notes, and snippets.

@dartt0n
Last active March 19, 2025 16:53
Show Gist options
  • Save dartt0n/ee8971e2e9a0696bf5ce108bf2f9a138 to your computer and use it in GitHub Desktop.
Save dartt0n/ee8971e2e9a0696bf5ce108bf2f9a138 to your computer and use it in GitHub Desktop.
Disk caching (compatible with any `load_image_from_url` implementation) for the Information Retrieval'24 course @ Innopolis University

Precomputed Hashes

You can download precomputed image caches (8.35 GB compressed, 12.45 GB uncompressed) from Yandex Cloud: https://storage.yandexcloud.net/ir24-precomputed-hashes/image_cache.zip (please do not DDoS this link; I pay for it with my own money)

For faster experiments with Kaggle/Colab, add the following code to download caches:

!wget https://storage.yandexcloud.net/ir24-precomputed-hashes/image_cache.zip
!unzip image_cache.zip

Once downloaded, unzip the archive and specify the correct path in your notebook, e.g.

cache = FileCache(Path("image_cache"))

Important

Each file is a numpy array of shape (H, W, 3) and dtype=uint8. Both width and height are preserved from the original images.

import hashlib
from functools import wraps
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable

import numpy as np
class FileCache:
def __init__(
self, cache_dir: Path | None = None, key_fn: Callable[[str], str] | None = None
):
# use specifed cache directory or create a temporary directory
self._cache_dir = cache_dir or Path(TemporaryDirectory().name)
self._cache_dir.mkdir(parents=True, exist_ok=True)
self._cache_keys = set()
self._key_fn = key_fn or self.__default_key_fn
# load existing caches from disk
for file in self._cache_dir.glob("*.npy"):
self._cache_keys.add(file.stem)
@staticmethod
def __default_key_fn(url: str) -> str:
# sha256 hash of the URL
return hashlib.sha256(url.encode()).hexdigest()
def __call__(
self, download_func: Callable[[str], np.ndarray]
) -> Callable[[str], np.ndarray]:
def wrapper(url: str) -> np.ndarray:
key = self._key_fn(url)
# if key exists
if key in self._cache_keys:
# load numpy array from file
return np.load(self._cache_dir / (key + ".npy"))
# otherwise download and save
np_data = download_func(url)
self._cache_keys.add(key)
np.save(self._cache_dir / (key + ".npy"), np_data)
return np_data
return wrapper
def clean_cache(self):
# delete each known cache entry
for key in self._cache_keys:
file_path = self._cache_dir / (key + ".npy")
if file_path.exists():
file_path.unlink()
self._cache_keys.clear()
cache = FileCache(Path("image_cache"))


@cache
def load_image_from_url(url: str) -> np.ndarray:
    """Download the image at ``url`` and return it as a numpy array.

    Placeholder: replace the body with your own implementation. The
    precomputed caches store arrays of shape (H, W, 3) with dtype uint8, so
    an implementation should return the same layout to stay compatible.

    Raises:
        NotImplementedError: Until an implementation is provided.
    """
    # todo: your implementation of load_image_from_url
    # Fail loudly: a bare `...` body would return None, and the caching
    # decorator would then try to store that None as if it were an image.
    raise NotImplementedError("load_image_from_url is not implemented yet")
@dartt0n
Copy link
Author

dartt0n commented Dec 8, 2024

Cache for sentences:

class SentencesFileCache():
    def __init__(self, cache_dir: Path | None = None, key_fn: Callable[[list[str]], str] | None = None):
        self._cache_dir = cache_dir or Path(TemporaryDirectory().name)
        self._cache_dir.mkdir(parents=True, exist_ok=True)

        self._cache_keys = set()
        self._key_fn = key_fn or self.__default_key_fn

        for file in self._cache_dir.glob("*.npy"):
            self._cache_keys.add(file.stem)

    def __call__(
        self, compute_func: Callable[[list[str]], np.ndarray]
    ) -> Callable[[str], np.ndarray]:
        def wrapper(texts: list[str]) -> np.ndarray:
            key = self._key_fn(texts)
            # if key exists
            if key in self._cache_keys:
                # load numpy array from file
                return np.load(self._cache_dir / (key + ".npy"))

            # otherwise compute and save
            np_data = compute_func(texts)
            self._cache_keys.add(key)
            np.save(self._cache_dir / (key + ".npy"), np_data)
            return np_data

        return wrapper

    def clean_cache(self):
        # delete each known cache entry
        for key in self._cache_keys:
            file_path = self._cache_dir / (key + ".npy")
            if file_path.exists():
                file_path.unlink()
        self._cache_keys.clear()


    @staticmethod
    def __default_key_fn(sentences: list[str]) -> str:
        # sha256 hash of the URL
        return hashlib.sha256('\n'.join(sentences).encode()).hexdigest()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment