Skip to content

Instantly share code, notes, and snippets.

@unbracketed
Created February 3, 2025 23:31
Show Gist options
  • Save unbracketed/1b001f5b655a9e65c7a5201a33c0ff91 to your computer and use it in GitHub Desktop.
Save unbracketed/1b001f5b655a9e65c7a5201a33c0ff91 to your computer and use it in GitHub Desktop.
A Python script that captions every JPG/PNG image in the directory it is run from, using several image-to-text models so that their outputs can be compared side by side.
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "torch",
# "transformers",
# "Pillow",
# ]
# ///
from pathlib import Path
import torch
from transformers import pipeline
# Model names to compare. Every image is captioned once by each entry,
# in the order listed here.
MODELS = [
    "ydshieh/vit-gpt2-coco-en",
    "Salesforce/blip-image-captioning-large",
    "microsoft/git-base-coco",
]
# Pipelines already built during this run, keyed by model name. Building a
# pipeline is the expensive step, so each one is created once and then reused
# for every image instead of being re-created per image. The trade-off is that
# all loaded models stay in memory until the process exits.
_CAPTIONER_CACHE: dict[str, object] = {}


def _get_captioner(model_name: str):
    """Return the pipeline for *model_name*, building it on first use."""
    if model_name not in _CAPTIONER_CACHE:
        # Device 0 is the first CUDA GPU; -1 selects the CPU.
        device = 0 if torch.cuda.is_available() else -1
        _CAPTIONER_CACHE[model_name] = pipeline(model=model_name, device=device)
    return _CAPTIONER_CACHE[model_name]


def process_image(image_path: Path, models) -> None:
    """Print one caption per model for a single image.

    The image's file name is printed first (preceded by a blank line),
    followed by one ``"<model name>: <caption>"`` line per model.

    Args:
        image_path: Path of the image to caption.
        models: Iterable of model names accepted by ``pipeline(model=...)``.

    Any exception raised while loading a model or captioning the image is
    caught and reported on that model's line as ``"<model name>: Error: ..."``,
    so one failing model does not stop the remaining ones.
    """
    print(f"\n{image_path.name}")
    for model_name in models:
        try:
            captioner = _get_captioner(model_name)
            caption = captioner(str(image_path))
            # A list result carries the text in its first entry under
            # 'generated_text'; any other result is printed as returned.
            if isinstance(caption, list):
                caption_text = caption[0]['generated_text']
            else:
                caption_text = caption
            print(f"{model_name}: {caption_text}")
        except Exception as e:
            # Deliberately broad: this is a best-effort comparison run, and a
            # model that failed to load is not cached, so it is retried on
            # the next image.
            print(f"{model_name}: Error: {e}")
def main():
    """Caption every image in the current directory with each model in MODELS.

    Regular files whose extension is ``.jpg``, ``.jpeg`` or ``.png`` (in any
    letter case) are processed in sorted order. If there are none, a notice
    is printed and the function returns without doing anything else.
    """
    # Compare the lower-cased suffix so that files such as "IMG_001.JPG" or
    # "scan.jpeg" are found as well; a case-sensitive "*.jpg" glob misses them.
    image_suffixes = {".jpg", ".jpeg", ".png"}
    image_files = sorted(
        entry
        for entry in Path().iterdir()
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    )
    if not image_files:
        print("No image files (jpg/png) found in current directory!")
        return
    for img_path in image_files:
        process_image(img_path, MODELS)
# Run the captioning pass only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment