Skip to content

Instantly share code, notes, and snippets.

@yuchenlin
Last active June 14, 2024 17:57
Show Gist options
  • Save yuchenlin/eb63e2d0513f70cfc9bb85fa5a78953b to your computer and use it in GitHub Desktop.
Compute sentence probability using GPT-2 with huggingface transformers
import torch
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from scipy.special import softmax
def model_init(model_string, cuda):
    """Load a causal language model and its matching tokenizer.

    Args:
        model_string: Hugging Face model identifier. Names starting with
            "gpt2" load the GPT-2 family; anything else falls back to the
            original OpenAI GPT classes.
        cuda: when True, move the model onto the GPU.

    Returns:
        A ``(model, tokenizer)`` pair with the model in eval mode.
    """
    use_gpt2 = model_string.startswith("gpt2")
    tokenizer_cls = GPT2Tokenizer if use_gpt2 else OpenAIGPTTokenizer
    model_cls = GPT2LMHeadModel if use_gpt2 else OpenAIGPTLMHeadModel
    tokenizer = tokenizer_cls.from_pretrained(model_string)
    model = model_cls.from_pretrained(model_string)
    model.eval()  # scoring only — disable dropout
    if cuda:
        model.to('cuda')
    print("Model init")
    return model, tokenizer
def sent_scoring(model_tokenizer, text, cuda):
    """Score a sentence with a causal language model.

    NOTE: despite the gist's title, the returned value is the model's
    average per-token cross-entropy (negative log-likelihood) over the
    sentence, not a probability — LOWER means the sentence is MORE likely
    under the model.

    Args:
        model_tokenizer: ``(model, tokenizer)`` pair as returned by
            ``model_init``.
        text: the sentence to score.
        cuda: when True, move the input ids to the GPU (the model is
            assumed to already be there).

    Returns:
        float: mean negative log-likelihood of the sentence's tokens.
    """
    model, tokenizer = model_tokenizer
    assert model is not None
    assert tokenizer is not None
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    if cuda:
        input_ids = input_ids.to('cuda')
    with torch.no_grad():  # inference only — no gradient bookkeeping
        outputs = model(input_ids, labels=input_ids)
    # With labels supplied, the model returns the LM loss first; the logits
    # (outputs[1]) are not needed here.
    loss = outputs[0]
    return loss.item()
if __name__ == '__main__':
    # Swap in 'openai-gpt' to score with the original GPT instead.
    # model, tokenizer = model_init('openai-gpt', False)
    model, tokenizer = model_init('gpt2', False)
    for sentence in ("I love my cute dog.", "I love your stupid dog."):
        print(sent_scoring((model, tokenizer), sentence, False))
@ajesujoba
Copy link

Hi, do you think it is possible to have each word's probability instead of just the whole sentence's probability?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment