Skip to content

Instantly share code, notes, and snippets.

@dartt0n
Last active March 19, 2025 16:53
Show Gist options
  • Save dartt0n/ee8971e2e9a0696bf5ce108bf2f9a138 to your computer and use it in GitHub Desktop.
Save dartt0n/ee8971e2e9a0696bf5ce108bf2f9a138 to your computer and use it in GitHub Desktop.
Disk caching (compatible with any `load_image_from_url` implementation) for the Information Retrieval'24 course @ Innopolis University

Precomputed Hashes

You can download the precomputed image caches (8.35 GB compressed, 12.45 GB uncompressed) from Yandex Cloud: https://storage.yandexcloud.net/ir24-precomputed-hashes/image_cache.zip (please do not DDoS this link; I pay for the traffic out of my own pocket).

For faster experiments with Kaggle/Colab, add the following code to download caches:

!wget https://storage.yandexcloud.net/ir24-precomputed-hashes/image_cache.zip
!unzip image_cache.zip

Once downloaded, unzip the archive and specify the correct path in your notebook, e.g.

cache = FileCache(Path("image_cache"))

Important

Each file is a numpy array of shape (H, W, 3) and dtype=uint8. Both width and height are preserved from original images.

import hashlib
import numpy as np
from typing import Callable
from tempfile import TemporaryDirectory
from pathlib import Path
class FileCache:
def __init__(
self, cache_dir: Path | None = None, key_fn: Callable[[str], str] | None = None
):
# use specifed cache directory or create a temporary directory
self._cache_dir = cache_dir or Path(TemporaryDirectory().name)
self._cache_dir.mkdir(parents=True, exist_ok=True)
self._cache_keys = set()
self._key_fn = key_fn or self.__default_key_fn
# load existing caches from disk
for file in self._cache_dir.glob("*.npy"):
self._cache_keys.add(file.stem)
@staticmethod
def __default_key_fn(url: str) -> str:
# sha256 hash of the URL
return hashlib.sha256(url.encode()).hexdigest()
def __call__(
self, download_func: Callable[[str], np.ndarray]
) -> Callable[[str], np.ndarray]:
def wrapper(url: str) -> np.ndarray:
key = self._key_fn(url)
# if key exists
if key in self._cache_keys:
# load numpy array from file
return np.load(self._cache_dir / (key + ".npy"))
# otherwise download and save
np_data = download_func(url)
self._cache_keys.add(key)
np.save(self._cache_dir / (key + ".npy"), np_data)
return np_data
return wrapper
def clean_cache(self):
# delete each known cache entry
for key in self._cache_keys:
file_path = self._cache_dir / (key + ".npy")
if file_path.exists():
file_path.unlink()
self._cache_keys.clear()
cache = FileCache(Path("image_cache"))


@cache
def load_image_from_url(url: str) -> np.ndarray:
    """Placeholder: return the image at ``url`` as a NumPy array.

    Results are stored on disk by ``cache``, so repeated calls with the same
    URL are served from the ``image_cache`` directory.
    """
    ...  # todo: your implementation of load_image_from_url
@dartt0n
Copy link
Author

dartt0n commented Nov 23, 2024

The same cache can be applied to the `encode_image` function:

# NOTE: use a separate FileCache instance (with its own directory) so that
# embedding entries do not collide with the image cache, which is keyed by the same URLs.
emb_cache = FileCache(Path(".embedding_cache"))

@emb_cache
def encode_image(image: str) -> np.ndarray:
    """Placeholder for the embedding function; results are cached on disk by ``emb_cache``.

    The body is intentionally left as ``...`` for the reader to implement.
    """
    ...

Important

Create a new instance of `FileCache` for each function so that their caches do not conflict (both functions use the same URLs as keys).

@dartt0n
Copy link
Author

dartt0n commented Nov 23, 2024

The example below performs these steps:

  1. Reads the file from the local file system, or downloads it from the specified URL
  2. Opens the image as a Pillow `Image` object and converts it to RGB format
  3. Converts the image to a NumPy array and caches that array on disk
  4. Generates an embedding for the image and caches it on disk as well (as a NumPy array)

Important

Do not forget to replace the `load_image_from_url` implementation with your own after the first run. By then all image arrays and embeddings are cached on disk, so any further call loads them from disk instead of repeating the heavy computation.

# load your model here
import hashlib
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable

import numpy as np
import requests
from PIL import Image

class FileCache:
    def __init__(
        self, cache_dir: Path | None = None, key_fn: Callable[[str], str] | None = None
    ):
        # use specifed cache directory or create a temporary directory
        self._cache_dir = cache_dir or Path(TemporaryDirectory().name)
        self._cache_dir.mkdir(parents=True, exist_ok=True)

        self._cache_keys = set()
        self._key_fn = key_fn or self.__default_key_fn

        # load existing caches from disk
        for file in self._cache_dir.glob("*.npy"):
            self._cache_keys.add(file.stem)

    @staticmethod
    def __default_key_fn(url: str) -> str:
        # sha256 hash of the URL
        return hashlib.sha256(url.encode()).hexdigest()

    def __call__(
        self, download_func: Callable[[str], np.ndarray]
    ) -> Callable[[str], np.ndarray]:
        def wrapper(url: str) -> np.ndarray:
            key = self._key_fn(url)
            # if key exists
            if key in self._cache_keys:
                # load numpy array from file
                return np.load(self._cache_dir / (key + ".npy"))

            # otherwise download and save
            np_data = download_func(url)
            self._cache_keys.add(key)
            np.save(self._cache_dir / (key + ".npy"), np_data)
            return np_data

        return wrapper

    def clean_cache(self):
        # delete each known cache entry
        for key in self._cache_keys:
            file_path = self._cache_dir / (key + ".npy")
            if file_path.exists():
                file_path.unlink()
        self._cache_keys.clear()


img_cache = FileCache(Path(".image_cache"))

@img_cache
def load_image_from_url(url: str) -> np.ndarray:
    """Loads an image from a given URL and converts it to an RGB NumPy array.

    The image is looked up in the local ``images`` directory under the last
    path segment of the URL; if it is not there, it is downloaded and saved
    under that name first.

    Args:
        url: The URL of the image to load.

    Returns:
        A NumPy array representing the image in RGB format.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    # use images.zip prepared by Ruslan Izmailov, Bulat Akhmatov and Alexandra Vabnits
    images_dir = Path('images')
    # NOTE: only the last URL segment is used, so two URLs ending in the same
    # file name share one local file (kept to match the layout of images.zip).
    image_path = images_dir / url.split('/')[-1]

    if not image_path.exists():
        images_dir.mkdir(parents=True, exist_ok=True)
        # A timeout keeps one dead host from hanging the whole run.
        response = requests.get(url, timeout=30)
        # Fail loudly instead of saving an HTML error page as an "image".
        response.raise_for_status()
        image_path.write_bytes(response.content)

    # The context manager closes the underlying file once the pixels are copied.
    with Image.open(image_path) as image:
        return np.array(image.convert('RGB'))


emb_cache = FileCache(Path(".emb_cache"))

@emb_cache
def encode_image(image: str) -> np.ndarray:
    """Turn the image behind a URL into an embedding vector (template).

    Args:
        image: URL of the image to embed.

    Returns:
        The embedding as a NumPy array, once the model call below is filled in.
    """
    # Pixel data comes from the (disk-cached) image loader.
    pixels = load_image_from_url(image)
    ...  # todo: your implementation


# Compute the embedding for every dataset item; encode_image is wrapped by
# emb_cache, so items already cached on disk are loaded instead of recomputed.
# NOTE(review): `ds` and `tqdm` are not defined in this snippet; they must be
# provided by the surrounding notebook (e.g. `from tqdm import tqdm`).
# NOTE(review): assumes ds[i] returns the stored, mutable record (e.g. a list of
# dicts); if ds[i] returns a copy, the assignment below is lost. Confirm.
for i in tqdm(range(len(ds))):
    ds[i]["image_embedding"] = encode_image(ds[i]["url"])

@dartt0n
Copy link
Author

dartt0n commented Dec 8, 2024

Cache for sentences:

class SentencesFileCache():
    def __init__(self, cache_dir: Path | None = None, key_fn: Callable[[list[str]], str] | None = None):
        self._cache_dir = cache_dir or Path(TemporaryDirectory().name)
        self._cache_dir.mkdir(parents=True, exist_ok=True)

        self._cache_keys = set()
        self._key_fn = key_fn or self.__default_key_fn

        for file in self._cache_dir.glob("*.npy"):
            self._cache_keys.add(file.stem)

    def __call__(
        self, compute_func: Callable[[list[str]], np.ndarray]
    ) -> Callable[[str], np.ndarray]:
        def wrapper(texts: list[str]) -> np.ndarray:
            key = self._key_fn(texts)
            # if key exists
            if key in self._cache_keys:
                # load numpy array from file
                return np.load(self._cache_dir / (key + ".npy"))

            # otherwise compute and save
            np_data = compute_func(texts)
            self._cache_keys.add(key)
            np.save(self._cache_dir / (key + ".npy"), np_data)
            return np_data

        return wrapper

    def clean_cache(self):
        # delete each known cache entry
        for key in self._cache_keys:
            file_path = self._cache_dir / (key + ".npy")
            if file_path.exists():
                file_path.unlink()
        self._cache_keys.clear()


    @staticmethod
    def __default_key_fn(sentences: list[str]) -> str:
        # sha256 hash of the URL
        return hashlib.sha256('\n'.join(sentences).encode()).hexdigest()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment