Skip to content

Instantly share code, notes, and snippets.

@ShawonAshraf
Created September 27, 2022 15:01
Show Gist options
  • Save ShawonAshraf/650069d8545f1fc032fe52aa8fd51baa to your computer and use it in GitHub Desktop.
Save ShawonAshraf/650069d8545f1fc032fe52aa8fd51baa to your computer and use it in GitHub Desktop.
import torch.nn as nn
from einops.layers.torch import Rearrange
from transformers import AutoModel
class DummyModel(nn.Module):
    """Score an answer against text-guided transformer output.

    Pipeline, as implemented in ``forward``:

    1. Encode the step text with a pretrained Hugging Face model and take
       its ``pooler_output``.
    2. Project the pooled features to 512 dimensions and add a length-1
       sequence axis.
    3. Run ``nn.Transformer`` with the projected steps as the source
       sequence and ``questions`` as the target sequence.
    4. Take the cosine similarity between the transformer output and
       ``answer`` over the last axis, then apply a softmax.

    Args:
        textmodel_weight_name: Name or path handed to
            ``AutoModel.from_pretrained``.
        task_name: Stored on the instance; not used by the code in this
            class.
        batch_size: Stored on the instance; not used by the code in this
            class.
        freeze_text_model: When true, gradients are disabled for every
            parameter of the text model.
    """

    def __init__(self, textmodel_weight_name, task_name, batch_size, freeze_text_model=False):
        super().__init__()

        self.task_name = task_name
        self.text_model_name = textmodel_weight_name
        self.batch_size = batch_size

        self.text_model = AutoModel.from_pretrained(textmodel_weight_name)
        if freeze_text_model:
            # Same effect as setting requires_grad = False on each parameter.
            self.text_model.requires_grad_(False)

        # Use the encoder's own hidden width instead of assuming 768, so
        # checkpoints of other sizes work too. Falls back to 768 (the
        # previously hard-coded value) when the config does not expose it.
        text_config = getattr(self.text_model, "config", None)
        text_hidden_size = getattr(text_config, "hidden_size", 768)

        # project text features to image feature dimension
        self.steps_projector_mlp = nn.Sequential(
            nn.Linear(text_hidden_size, 512),
            nn.Dropout(0.1),
            nn.Linear(512, 512),
            nn.GELU(),
            # (batch, dim) -> (batch, 1, dim): one-token source sequence.
            Rearrange("batch dim -> batch 1 dim"),
        )

        # transformer for attention; d_model is left at the nn.Transformer
        # default, which matches the 512-wide projector output.
        self.transformer = nn.Transformer(nhead=16, batch_first=True)

        self.cosine_sim = nn.CosineSimilarity(dim=-1)
        # NOTE(review): dim=0 normalises across the first axis, which is the
        # batch axis given batch_first=True above. Confirm this is intended
        # rather than dim=-1; left unchanged to preserve existing outputs.
        self.softmax = nn.Softmax(dim=0)

    def forward(self, steps_input_id, steps_attn_mask, questions, answer):
        """Return softmax-normalised cosine similarities to ``answer``.

        Args:
            steps_input_id: Token ids for the step text, passed to the text
                model as its first argument.
            steps_attn_mask: Attention mask matching ``steps_input_id``.
            questions: Target sequence for the transformer. Presumably
                (batch, seq, 512) image features -- TODO confirm with the
                caller.
            answer: Tensor compared with the transformer output along the
                last axis; must be broadcastable against that output.

        Returns:
            The cosine similarities after a softmax over dim 0.
        """
        # Pass the mask by keyword: the second positional parameter is not
        # the attention mask for every architecture AutoModel can load.
        encoded = self.text_model(
            steps_input_id, attention_mask=steps_attn_mask)
        # Assumes the loaded model exposes a pooled output -- models
        # without a pooling head will not work here.
        step_features = encoded.pooler_output

        steps_projected = self.steps_projector_mlp(step_features)

        # attention embedding for images guided by text in steps
        transformer_out = self.transformer(steps_projected, questions)

        # cosine similarity between transformer_out and answer
        similarity = self.cosine_sim(transformer_out, answer)
        return self.softmax(similarity)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment