CLIP loss
# b - batch size
# d - feature dimension
# t - learned temperature parameter
# image_encoder - ResNet or Vision Transformer
# text_encoder - CBOW or Text Transformer
# I[b, h, w, c] - minibatch of aligned images
# T[b, l] - minibatch of aligned texts

# extract feature representations of each modality
F_i = image_encoder(I)  # [b, d]
F_t = text_encoder(T)   # [b, d]

# scaled pairwise cosine similarities [b, b]
sim = cosine_similarity(F_i, F_t) * np.exp(t)

# symmetric loss function: the i-th image is paired with the i-th text,
# so the target class for row i (and column i) is index i
labels = np.arange(b)
loss_i = cross_entropy_loss(sim, labels)   # image -> text
loss_t = cross_entropy_loss(sim.T, labels) # text -> image
loss = (loss_i + loss_t) / 2
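
Below is a minimal, runnable PyTorch sketch of the same symmetric loss, useful for checking the pseudocode against real tensors. The function name clip_loss, the logit_scale parameter (standing in for the learned temperature t), and the random feature tensors are illustrative assumptions, not part of the original gist; the encoders are assumed to already return [b, d] features.

import torch
import torch.nn.functional as F

def clip_loss(image_features: torch.Tensor,
              text_features: torch.Tensor,
              logit_scale: torch.Tensor) -> torch.Tensor:
    # L2-normalise so the dot products below are cosine similarities
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)

    # scaled pairwise cosine similarities, shape [b, b]
    logits = logit_scale.exp() * image_features @ text_features.T

    # the i-th image matches the i-th text, so the targets are 0..b-1
    labels = torch.arange(logits.size(0), device=logits.device)

    loss_i = F.cross_entropy(logits, labels)    # image -> text
    loss_t = F.cross_entropy(logits.T, labels)  # text -> image
    return (loss_i + loss_t) / 2

# Example with random features as stand-ins for encoder outputs
b, d = 8, 512
img = torch.randn(b, d)
txt = torch.randn(b, d)
logit_scale = torch.nn.Parameter(torch.tensor(2.6593))  # log(1/0.07), the CLIP init
print(clip_loss(img, txt, logit_scale))

Keeping the scale as a learnable log-parameter (exponentiated before use) mirrors how CLIP handles the temperature: it stays positive and can be clamped during training to avoid instability.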