Informal (vibes-based) evaluation of the following vision-language-model captioners:
- Florence-2-base-ft
- CogVLM2
- BLIP-2
- MoonDream2
- Share-Captioner
- Florence-2-SD3-Captioner
# Contains MIT-licensed code from wandb
# https://github.com/wandb/wandb/blob/main/LICENSE
# This gist is MIT-licensed (Copyright Alex Birch)
from torch import Tensor, FloatTensor
from torch.nn import Module
from torch.utils.hooks import RemovableHandle
import torch
from typing import List, Callable, Dict, Sequence, Optional, Tuple, Any
from wandb.wandb_torch import log_track_init, log_track_update
import torch | |
class TwoDimRotary(torch.nn.Module): | |
def __init__(self, dim, base=100, h = 128, w = 128): | |
super().__init__() | |
self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / (dim))) | |
self.h = h | |
self.w = w | |