@trojblue
Last active March 31, 2025 22:01

EvalClip-Eval

Legacy script extracted from lepton; useful for comparing CLIP models and for designing CLIP zero-shot classifiers.

script:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
CLIP Image-Text Matching Script

This script provides functionality to use different CLIP (Contrastive Language-Image Pretraining) 
models for image-text matching tasks. It supports both EVA-CLIP and OpenCLIP models.

Environment setup:
------------------
See: https://github.com/baaivision/EVA/tree/master/EVA-CLIP#setup

```bash
conda create --name evaclip python=3.8 -y
conda activate evaclip

git clone https://github.com/baaivision/EVA 
cd EVA/EVA-CLIP
pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
pip install -r requirements.txt

```

For OpenCLIP models, you can use various pretrained models, including the large ViT variants.
"""

import argparse
import os
import sys

import torch
from PIL import Image
import gradio as gr

def parse_args():
    parser = argparse.ArgumentParser(description="CLIP Image-Text Matching")
    parser.add_argument("--model-type", type=str, default="openclip", choices=["evaclip", "openclip"],
                        help="Type of CLIP model to use (evaclip or openclip)")
    parser.add_argument("--model-name", type=str, default="ViT-bigG-14",
                        help="Model name or architecture")
    parser.add_argument("--pretrained", type=str, default="laion2b_s39b_b160k",
                        help="Pretrained model weights or identifier")
    parser.add_argument("--image", type=str, default=None,
                        help="Path to the image for inference")
    parser.add_argument("--descriptions", type=str, default="a dog,a cat",
                        help="Comma-separated text descriptions to match with the image")
    parser.add_argument("--web-ui", action="store_true",
                        help="Launch web UI using Gradio instead of CLI")

    return parser.parse_args()

def load_model(model_type, model_name, pretrained):
    """
    Load the specified CLIP model.

    Args:
        model_type: 'evaclip' or 'openclip'
        model_name: Model architecture name
        pretrained: Pretrained weights identifier or path

    Returns:
        tuple: (model, preprocess_function, tokenizer, device)
    """
    if model_type == "evaclip":
        try:
            sys.path.append(os.path.join(os.path.dirname(__file__), "EVA", "EVA-CLIP"))
            from eva_clip import create_model_and_transforms, get_tokenizer
            print(f"Loading EVA-CLIP model: {model_name}")
            force_custom = "EVA" in model_name
            model, _, preprocess = create_model_and_transforms(
                model_name, pretrained, force_custom_clip=force_custom
            )
            tokenizer = get_tokenizer(model_name)
        except ImportError:
            print("Error: EVA-CLIP is not installed properly. Please follow the setup instructions.")
            sys.exit(1)
    else:  # openclip
        try:
            import open_clip
            print(f"Loading OpenCLIP model: {model_name}")
            model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
            tokenizer = open_clip.get_tokenizer(model_name)
        except ImportError:
            print("Error: open_clip is not installed. Please install with 'pip install open-clip-torch'.")
            sys.exit(1)

    # Move the model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    return model, preprocess, tokenizer, device

def classify_image(model, preprocess, tokenizer, image_input, descriptions, device="cuda"):
    """
    Process an image and match it against text descriptions.

    Args:
        model: CLIP model
        preprocess: Image preprocessing function
        tokenizer: Text tokenization function
        image_input: PIL Image or path to image
        descriptions: Comma-separated text descriptions
        device: Computation device

    Returns:
        dict: Text descriptions with their matching probabilities
    """
    # Handle different input types
    if isinstance(image_input, str):
        try:
            image = Image.open(image_input)
        except Exception as e:
            raise ValueError(f"Error opening image file: {e}")
    else:
        image = image_input

    # Process the image
    img = preprocess(image).unsqueeze(0).to(device)

    # Process the descriptions
    if isinstance(descriptions, str):
        desc_list = [d.strip() for d in descriptions.split(',')]
    else:
        desc_list = descriptions

    text = tokenizer(desc_list).to(device)

    # Perform inference
    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model.encode_image(img)
        text_features = model.encode_text(text)

        # Normalize features
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Calculate similarity
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # Convert to a dictionary of results
    probabilities = text_probs.squeeze().tolist()

    # Handle the case when there is only one description
    if not isinstance(probabilities, list):
        probabilities = [probabilities]

    results = {desc: prob for desc, prob in zip(desc_list, probabilities)}
    return results

def setup_web_ui(model, preprocess, tokenizer, device):
    """
    Set up and launch a Gradio web interface for the CLIP model.

    Args:
        model: CLIP model
        preprocess: Image preprocessing function
        tokenizer: Text tokenization function
        device: Computation device
    """
    def gradio_wrapper(image, descriptions):
        return classify_image(model, preprocess, tokenizer, image, descriptions, device)

    interface = gr.Interface(
        fn=gradio_wrapper,
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Textbox(label="Enter descriptions separated by commas", placeholder="a dog, a cat, a bird...")
        ],
        outputs=gr.Label(num_top_classes=10),
        title="CLIP Image-Text Matching",
        description="Upload an image and input descriptions to see how well they match according to the CLIP model."
    )

    interface.launch(share=True)

def main():
    args = parse_args()

    # Load the model
    model, preprocess, tokenizer, device = load_model(args.model_type, args.model_name, args.pretrained)

    # Either launch the web UI or run CLI inference
    if args.web_ui:
        setup_web_ui(model, preprocess, tokenizer, device)
    elif args.image:
        # Run inference on a single image
        results = classify_image(model, preprocess, tokenizer, args.image, args.descriptions, device)

        # Print results
        print("Image-text matching results:")
        for desc, prob in results.items():
            print(f"- {desc}: {prob:.4f}")
    else:
        print("Error: Either provide an image path with --image or use --web-ui")

def list_available_models():
    """List all available pretrained CLIP models from OpenCLIP."""
    try:
        import open_clip
        models = open_clip.list_pretrained()
        print("Available OpenCLIP models:")
        for model, pretrained in models:
            print(f"- {model}, {pretrained}")
    except ImportError:
        print("open_clip is not installed. Please install with 'pip install open-clip-torch'.")

if name == "main": # If no arguments are provided, show the help message if len(sys.argv) == 1: print(doc) print("\nTo see a list of available OpenCLIP models, run with --list-models") print("To see all available options, run with --help") elif len(sys.argv) == 2 and sys.argv[1] == "--list-models": list_available_models() else: main()



setup:

#!/bin/bash

# Set up environment for CLIP models

# Function to display help message

show_help() {
    echo "CLIP Model Setup Script"
    echo "========================"
    echo "This script sets up the environment for using CLIP models."
    echo
    echo "Usage:"
    echo "  ./setup.sh [options]"
    echo
    echo "Options:"
    echo "  --eva-clip    Set up environment for EVA-CLIP"
    echo "  --open-clip   Set up environment for OpenCLIP"
    echo "  --all         Set up environment for both models (default)"
    echo "  --help        Display this help message"
}

# Default is to install all

INSTALL_EVA_CLIP=true
INSTALL_OPEN_CLIP=true

# Parse command-line arguments

for arg in "$@"; do case $arg in --eva-clip) INSTALL_EVA_CLIP=true INSTALL_OPEN_CLIP=false ;; --open-clip) INSTALL_EVA_CLIP=false INSTALL_OPEN_CLIP=true ;; --all) INSTALL_EVA_CLIP=true INSTALL_OPEN_CLIP=true ;; --help) show_help exit 0 ;; *) echo "Unknown option: $arg" show_help exit 1 ;; esac done

# Create and activate the conda environment

echo "Creating conda environment..." conda create --name clip_env python=3.8 -y eval "$(conda shell.bash hook)" conda activate clip_env

# Install PyTorch

echo "Installing PyTorch..." pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116

# Install common dependencies

pip install pillow gradio

# Install EVA-CLIP if selected

if [ "$INSTALL_EVA_CLIP" = true ]; then echo "Setting up EVA-CLIP..."

# Clone EVA repository
if [ ! -d "EVA" ]; then
    git clone https://github.com/baaivision/EVA
fi

# Install EVA-CLIP requirements
cd EVA/EVA-CLIP
pip install -r requirements.txt
cd ../..

fi

# Install OpenCLIP if selected

if [ "$INSTALL_OPEN_CLIP" = true ]; then echo "Setting up OpenCLIP..." pip install open-clip-torch fi

echo "Setup complete! Activate the environment with: conda activate clip_env" echo "Run the CLIP script with: python clip_script.py --help"
