%pip install transformers torch Pillow
Reference: https://huggingface.co/facebook/dino-vits16
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
import requests
import torch
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
feature_extractor = ViTFeatureExtractor.from_pretrained('facebook/dino-vits16')
model = ViTModel.from_pretrained('facebook/dino-vits16', add_pooling_layer=False)
# No need to remove the pooler: `add_pooling_layer=False` already drops it
# (otherwise it could be neutralized with `model.pooler = torch.nn.Identity()`)
inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
last_hidden_states.size()
torch.Size([1, 197, 384])
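The 197 tokens are the [CLS] token followed by the 14x14 = 196 patch tokens of the 224x224 input, each with the 384-dim embedding of ViT-S/16. A minimal sketch splitting them:
cls_token = last_hidden_states[:, 0, :]      # [CLS] token, shape [1, 384]
patch_tokens = last_hidden_states[:, 1:, :]  # patch tokens, shape [1, 196, 384]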
Reference: https://github.com/facebookresearch/dino#pretrained-models-on-pytorch-hub
import torch
vits16 = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')
# Apply it to the same preprocessed image tensor produced by the HF feature extractor
out = vits16(inputs['pixel_values'])
out.size()
torch.Size([1, 384])
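So the hub model's forward pass returns a single 384-dim vector per image rather than a full token sequence. If the whole sequence is needed, the DINO repo's VisionTransformer exposes get_intermediate_layers; a sketch, assuming the hub object is that class:
tokens = vits16.get_intermediate_layers(inputs['pixel_values'], n=1)[0]
tokens.size()  # expected: torch.Size([1, 197, 384]), matching the HF last_hidden_state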
Compare the official output with each token of the HF last_hidden_state:
import numpy as np
r = []
for i in range(outputs.last_hidden_state.size(1)):
    # Max absolute difference between the official output and the i-th HF token
    d = (out - outputs.last_hidden_state[:, i, :]).abs().max()
    r.append(d.item())  # .item() so the list can be sorted with NumPy
# Print the three closest tokens
for i in np.array(r).argsort()[:3]:
    print(f'{i} -> {r[i]}')
Output: the [CLS] token (index 0) clearly matches the output of the official model; every other token is far off.
0 -> 0.02411365509033203
122 -> 18.719844818115234
130 -> 18.75490951538086
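The same finding as an explicit check (a sketch; atol chosen from the ~0.024 max difference above, which presumably reflects minor preprocessing/numerical differences, an assumption):
print(torch.allclose(out, outputs.last_hidden_state[:, 0, :], atol=5e-2))  # expected: True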
# Compare the official output with (a) the [CLS] token and (b) the average of the patch tokens
cls = outputs.last_hidden_state[:, 0, :]            # [CLS] token embedding
avg = outputs.last_hidden_state[:, 1:, :].mean(1)   # mean of the 196 patch tokens
d_cls = (out - cls).abs().max()
d_avg = (out - avg).abs().max()
print(d_cls)
print(d_avg)
tensor(0.0241, grad_fn=<MaxBackward1>)
tensor(22.0325, grad_fn=<MaxBackward1>)
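Takeaway: to reproduce the official DINO feature with the Hugging Face model, take the [CLS] token of the last hidden state rather than the average of the patch tokens:
dino_features = model(**inputs).last_hidden_state[:, 0, :]  # shape [1, 384]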