Informal (vibes-based) evaluation of the following vision-language-model captioners:
- Florence-2-base-ft
- CogVLM2
- BLIP-2
- MoonDream2
- Share-Captioner
- Florence-2-SD3-Captioner
# Contains MIT-licensed code from wandb
# https://github.com/wandb/wandb/blob/main/LICENSE
# This gist is MIT-licensed (Copyright Alex Birch)
from torch import Tensor, FloatTensor
from torch.nn import Module
from torch.utils.hooks import RemovableHandle
import torch
from typing import List, Callable, Dict, Sequence, Optional, Tuple, Any
from wandb.wandb_torch import log_track_init, log_track_update
import torch | |
class TwoDimRotary(torch.nn.Module): | |
def __init__(self, dim, base=100, h = 128, w = 128): | |
super().__init__() | |
self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / (dim))) | |
self.h = h | |
self.w = w | |