@SuperSonicHub1
Created April 25, 2023 03:43
What is the least related thing to MIT?
=== MIT ===
'Ġthe' : 0.27957552671432495
'Ġand' : 0.324958860874176
'Ġin' : 0.3969202935695648
'Ġa' : 0.6269370913505554
'Ġ(' : 0.6961137056350708
=== mit ===
'Ġscoop' : 1.1742193698883057
'Ġtext' : 1.2469427585601807
'Ġon' : 1.2686882019042969
'Ġthe' : 1.2753552198410034
'Ġand' : 1.286698818206787
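
A note on the output: the Ġ prefix comes from GPT-2's byte-level BPE vocabulary, where it marks a token that begins with a space, so 'Ġthe' is the token for " the". A quick sanity check with the same tokenizer the script uses:

from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
print(tokenizer.tokenize(" the"))  # ['Ġthe']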
"""
https://github.com/huggingface/transformers/issues/1458
"""
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch import dot, abs  # note: shadows the builtin abs

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')  # or any other checkpoint

word_embeddings = model.transformer.wte.weight  # word token embeddings, shape (vocab_size, n_embd)
vocab: dict = tokenizer.get_vocab()  # token string -> token ID
reverse_vocab = {v: k for k, v in vocab.items()}  # token ID -> token string

def most_orthogonal(token: str):
    """Rank every other token by |dot product| with `token`'s embedding, ascending."""
    token_of_interest = vocab[token]
    embedding_of_interest = word_embeddings[token_of_interest]
    dot_products = (
        (reverse_vocab[token_id], abs(dot(embedding_of_interest, word_embeddings[token_id])))
        for token_id in vocab.values()
        if token_id != token_of_interest
    )
    # Smallest absolute dot product first, i.e. closest to orthogonal.
    return sorted(dot_products, key=lambda pair: pair[1])

for token in ["MIT", "mit"]:
    print("===", token, "===")
    for other, abs_dot in most_orthogonal(token)[:5]:
        print(repr(other), ":", abs_dot.item())
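
As a design note: the Python loop above makes roughly 50,000 individual torch.dot calls, and raw dot products are sensitive to vector magnitude, so tokens with short embedding vectors look "orthogonal" regardless of direction. Below is a minimal sketch of an alternative that vectorizes the scan and ranks by absolute cosine similarity instead; most_orthogonal_cosine is a hypothetical helper, reusing vocab, reverse_vocab, and word_embeddings from the script above:

import torch

def most_orthogonal_cosine(token: str, k: int = 5):
    # Hypothetical variant: one matrix-vector product over the whole vocab
    # on unit-normalized embeddings, so magnitude no longer skews the ranking.
    token_id = vocab[token]
    with torch.no_grad():
        normed = torch.nn.functional.normalize(word_embeddings, dim=-1)
        cosines = normed @ normed[token_id]  # shape: (vocab_size,)
        cosines[token_id] = float("inf")     # exclude the token itself
        scores, ids = torch.abs(cosines).topk(k, largest=False)
    return [(reverse_vocab[i.item()], s.item()) for i, s in zip(ids, scores)]

Whether the resulting top five differs from the dot-product ranking is an empirical question, but normalizing first is the usual way to ask "which directions are most unrelated" without the norm bias.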