legacy script extracted from lepton; useful for comparing CLIP models / designing CLIP zero-shot classification.
script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
CLIP Image-Text Matching Script
This script provides functionality to use different CLIP (Contrastive Language-Image Pretraining)
models for image-text matching tasks. It supports both EVA-CLIP and OpenCLIP models.
Environment setup:
------------------
See: https://github.com/baaivision/EVA/tree/master/EVA-CLIP#setup
```bash
conda create --name evaclip python=3.8 -y
conda activate evaclip
git clone https://github.com/baaivision/EVA
cd EVA/EVA-CLIP
pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
pip install -r requirements.txt
```

For OpenCLIP models, you can use various pretrained checkpoints, including the
large ViT variants.
"""
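# Example invocations (illustrative; the defaults above point at OpenCLIP
# ViT-bigG-14, and the file name clip_script.py follows the setup notes at the
# bottom of this document; image path and EVA-CLIP identifiers are assumptions):
#
#   python clip_script.py --list-models
#   python clip_script.py --image cat.jpg --descriptions "a dog,a cat,a bird"
#   python clip_script.py --model-type evaclip --model-name EVA02-CLIP-L-14 \
#       --pretrained /path/to/eva02_clip_l14.pt --image cat.jpg
#   python clip_script.py --web-ui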
import argparse
import os
import sys

import torch
from PIL import Image
import gradio as gr
def parse_args():
    parser = argparse.ArgumentParser(description="CLIP Image-Text Matching")
    parser.add_argument("--model-type", type=str, default="openclip",
                        choices=["evaclip", "openclip"],
                        help="Type of CLIP model to use (evaclip or openclip)")
    parser.add_argument("--model-name", type=str, default="ViT-bigG-14",
                        help="Model name or architecture")
    parser.add_argument("--pretrained", type=str, default="laion2b_s39b_b160k",
                        help="Pretrained model weights or identifier")
    parser.add_argument("--image", type=str, default=None,
                        help="Path to the image for inference")
    parser.add_argument("--descriptions", type=str, default="a dog,a cat",
                        help="Comma-separated text descriptions to match with the image")
    parser.add_argument("--web-ui", action="store_true",
                        help="Launch web UI using Gradio instead of CLI")
    return parser.parse_args()
def load_model(model_type, model_name, pretrained):
    """
    Load the specified CLIP model.

    Args:
        model_type: 'evaclip' or 'openclip'
        model_name: Model architecture name
        pretrained: Pretrained weights identifier or path

    Returns:
        tuple: (model, preprocess_function, tokenizer, device)
    """
if model_type == "evaclip":
try:
sys.path.append(os.path.join(os.path.dirname(__file__), "EVA", "EVA-CLIP"))
from eva_clip import create_model_and_transforms, get_tokenizer
print(f"Loading EVA-CLIP model: {model_name}")
force_custom = True if "EVA" in model_name else False
model, _, preprocess = create_model_and_transforms(
model_name, pretrained, force_custom_clip=force_custom
)
tokenizer = get_tokenizer(model_name)
except ImportError:
print("Error: EVA-CLIP is not installed properly. Please follow the setup instructions.")
sys.exit(1)
else: # openclip
try:
import open_clip
print(f"Loading OpenCLIP model: {model_name}")
model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
tokenizer = open_clip.get_tokenizer(model_name)
except ImportError:
print("Error: open_clip is not installed. Please install with 'pip install open-clip-torch'.")
sys.exit(1)
# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
return model, preprocess, tokenizer, device
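# Example (hedged sketch) of calling load_model() directly. For OpenCLIP the
# pretrained argument is a tag from open_clip.list_pretrained(); for EVA-CLIP it
# is typically a local checkpoint path (the model name and path below are
# illustrative assumptions, not verified identifiers):
#
#   model, preprocess, tokenizer, device = load_model("openclip", "ViT-B-32", "laion2b_s34b_b79k")
#   model, preprocess, tokenizer, device = load_model("evaclip", "EVA02-CLIP-L-14", "/path/to/eva02_clip_l14.pt")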
def classify_image(model, preprocess, tokenizer, image_input, descriptions, device="cuda"):
    """
    Process an image and match it against text descriptions.

    Args:
        model: CLIP model
        preprocess: Image preprocessing function
        tokenizer: Text tokenization function
        image_input: PIL Image or path to image
        descriptions: Comma-separated text descriptions (string) or a list of strings
        device: Computation device

    Returns:
        dict: Text descriptions with their matching probabilities
    """
    # Handle different input types
    if isinstance(image_input, str):
        try:
            image = Image.open(image_input)
        except Exception as e:
            raise ValueError(f"Error opening image file: {e}")
    else:
        image = image_input

    # Process the image
    img = preprocess(image).unsqueeze(0).to(device)

    # Process the descriptions
    if isinstance(descriptions, str):
        desc_list = [d.strip() for d in descriptions.split(',')]
    else:
        desc_list = descriptions
    text = tokenizer(desc_list).to(device)

    # Perform inference (autocast only makes sense on CUDA)
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=(device == "cuda")):
        image_features = model.encode_image(img)
        text_features = model.encode_text(text)

        # Normalize features
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Calculate similarity; the 100.0 factor approximates the learned logit
        # scale (temperature) used by CLIP-style models
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # Convert to dictionary of results
    probabilities = text_probs.squeeze().tolist()

    # Handle the case when there's only one description
    if not isinstance(probabilities, list):
        probabilities = [probabilities]

    results = {desc: prob for desc, prob in zip(desc_list, probabilities)}
    return results
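# Example (illustrative): calling classify_image() with an already loaded model.
# The returned dict maps each description to its softmax probability, e.g. roughly
# {"a dog": 0.02, "a cat": 0.98} for a cat photo (numbers made up):
#
#   results = classify_image(model, preprocess, tokenizer, "cat.jpg", "a dog,a cat", device)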
def setup_web_ui(model, preprocess, tokenizer, device):
    """
    Set up and launch a Gradio web interface for the CLIP model.

    Args:
        model: CLIP model
        preprocess: Image preprocessing function
        tokenizer: Text tokenization function
        device: Computation device
    """
    def gradio_wrapper(image, descriptions):
        return classify_image(model, preprocess, tokenizer, image, descriptions, device)

    interface = gr.Interface(
        fn=gradio_wrapper,
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Textbox(label="Enter descriptions separated by commas",
                       placeholder="a dog, a cat, a bird..."),
        ],
        outputs=gr.Label(num_top_classes=10),
        title="CLIP Image-Text Matching",
        description="Upload an image and input descriptions to see how well they match according to the CLIP model.",
    )
    interface.launch(share=True)
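# Note on the web UI: launch(share=True) serves the interface locally and also
# requests a temporary public Gradio share link; drop share=True to keep it
# local-only.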
def main():
    args = parse_args()

    # Load the model
    model, preprocess, tokenizer, device = load_model(args.model_type, args.model_name, args.pretrained)

    # Either launch web UI or run CLI inference
    if args.web_ui:
        setup_web_ui(model, preprocess, tokenizer, device)
    elif args.image:
        # Run inference on a single image
        results = classify_image(model, preprocess, tokenizer, args.image, args.descriptions, device)

        # Print results
        print("Image-text matching results:")
        for desc, prob in results.items():
            print(f"- {desc}: {prob:.4f}")
    else:
        print("Error: Either provide an image path with --image or use --web-ui")
def list_available_models():
    """List all available pretrained CLIP models from OpenCLIP."""
    try:
        import open_clip

        models = open_clip.list_pretrained()
        print("Available OpenCLIP models:")
        for model, pretrained in models:
            print(f"- {model}, {pretrained}")
    except ImportError:
        print("open_clip is not installed. Please install with 'pip install open-clip-torch'.")
if name == "main": # If no arguments are provided, show the help message if len(sys.argv) == 1: print(doc) print("\nTo see a list of available OpenCLIP models, run with --list-models") print("To see all available options, run with --help") elif len(sys.argv) == 2 and sys.argv[1] == "--list-models": list_available_models() else: main()
setup:
#!/bin/bash
show_help() {
    echo "CLIP Model Setup Script"
    echo "========================"
    echo "This script sets up the environment for using CLIP models."
    echo
    echo "Usage:"
    echo "  ./setup.sh [options]"
    echo
    echo "Options:"
    echo "  --eva-clip    Set up environment for EVA-CLIP"
    echo "  --open-clip   Set up environment for OpenCLIP"
    echo "  --all         Set up environment for both models (default)"
    echo "  --help        Display this help message"
}
INSTALL_EVA_CLIP=true
INSTALL_OPEN_CLIP=true
for arg in "$@"; do case $arg in --eva-clip) INSTALL_EVA_CLIP=true INSTALL_OPEN_CLIP=false ;; --open-clip) INSTALL_EVA_CLIP=false INSTALL_OPEN_CLIP=true ;; --all) INSTALL_EVA_CLIP=true INSTALL_OPEN_CLIP=true ;; --help) show_help exit 0 ;; *) echo "Unknown option: $arg" show_help exit 1 ;; esac done
echo "Creating conda environment..." conda create --name clip_env python=3.8 -y eval "$(conda shell.bash hook)" conda activate clip_env
echo "Installing PyTorch..." pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
pip install pillow gradio
if [ "$INSTALL_EVA_CLIP" = true ]; then echo "Setting up EVA-CLIP..."
# Clone EVA repository
if [ ! -d "EVA" ]; then
git clone https://github.com/baaivision/EVA
fi
# Install EVA-CLIP requirements
cd EVA/EVA-CLIP
pip install -r requirements.txt
cd ../..
fi
if [ "$INSTALL_OPEN_CLIP" = true ]; then echo "Setting up OpenCLIP..." pip install open-clip-torch fi
echo "Setup complete! Activate the environment with: conda activate clip_env" echo "Run the CLIP script with: python clip_script.py --help"