Created April 25, 2023 03:43
What is the least related thing to MIT?
=== MIT ===
'Ġthe' : 0.27957552671432495
'Ġand' : 0.324958860874176
'Ġin' : 0.3969202935695648
'Ġa' : 0.6269370913505554
'Ġ(' : 0.6961137056350708
=== mit ===
'Ġscoop' : 1.1742193698883057
'Ġtext' : 1.2469427585601807
'Ġon' : 1.2686882019042969
'Ġthe' : 1.2753552198410034
'Ġand' : 1.286698818206787
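The Ġ prefix in these token strings is GPT-2's byte-level BPE marker for a leading space, not part of the word itself. A minimal sanity check of that claim, a sketch using the same gpt2 tokenizer the script below loads:

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Ġ decodes back to an ordinary leading space.
print(repr(tokenizer.convert_tokens_to_string(['Ġthe'])))    # ' the'
print(repr(tokenizer.convert_tokens_to_string(['Ġscoop'])))  # ' scoop'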
""" | |
https://github.com/huggingface/transformers/issues/1458 | |
""" | |
from transformers import GPT2LMHeadModel, GPT2Tokenizer | |
from torch import dot, abs | |
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') | |
model = GPT2LMHeadModel.from_pretrained('gpt2') # or any other checkpoint | |
word_embeddings = model.transformer.wte.weight # Word Token Embeddings | |
vocab: dict = tokenizer.get_vocab() | |
reverse_vocab = { v: k for k, v in vocab.items() } | |
def most_orthogonal(token: str): | |
token_of_interest = vocab[token] | |
embedding_of_interest = word_embeddings[token_of_interest] | |
dot_products = ( | |
(reverse_vocab[token], abs(dot(embedding_of_interest, word_embeddings[token]))) | |
for token | |
in vocab.values() | |
if token != token_of_interest | |
) | |
sorted_dproducts = sorted(dot_products, key=lambda x: x[1]) | |
return sorted_dproducts | |
for token in ["MIT", "mit",]: | |
print("===", token, "===",) | |
for token, abs_dot in most_orthogonal(token)[:5]: | |
print(repr(token), ":", abs_dot.item()) |
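The script ranks by raw absolute dot product, so tokens whose embeddings simply have small norms tend to float to the top, and it loops over the vocabulary one token at a time. An alternative sketch, not part of the original gist: rank by absolute cosine similarity instead, computed for the whole vocabulary in a single call (the function name least_similar and the default k=5 are my own choices):

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
word_embeddings = model.transformer.wte.weight  # (vocab_size, hidden_size)
reverse_vocab = {v: k for k, v in tokenizer.get_vocab().items()}

def least_similar(token: str, k: int = 5):
    token_id = tokenizer.get_vocab()[token]
    with torch.no_grad():
        target = word_embeddings[token_id].unsqueeze(0)  # (1, hidden_size)
        # |cosine similarity| of the target against every embedding at once
        sims = torch.nn.functional.cosine_similarity(word_embeddings, target, dim=-1).abs()
        sims[token_id] = float('inf')  # exclude the token itself
        values, indices = torch.topk(sims, k, largest=False)
    return [(reverse_vocab[i.item()], v.item()) for i, v in zip(indices, values)]

print(least_similar("MIT"))

Normalizing by the embedding norms changes which tokens count as "least related", so the two rankings are not expected to agree.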