The code below is from the DeepLearning.AI course "How Transformer LLMs Work". First, set up a conda environment:
conda create -n llm python=3.12
conda activate llm
conda install conda-forge::transformers
conda install conda-forge::jupyterlab
conda install pytorch::pytorch
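A quick sanity check of the environment (my own addition, not part of the course):

import torch
import transformers
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)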
from transformers import AutoTokenizer

sentence = "Hello world!"
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
ids_info = tokenizer(sentence)
ids = ids_info.input_ids
# Decode each token id back to its text piece
for token_id in ids:
    print(token_id, tokenizer.decode(token_id))
The output is:
101 [CLS]
8667 Hello
1362 world
106 !
102 [SEP]
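[CLS] and [SEP] are special tokens that BERT's tokenizer adds automatically to mark the start and end of a sequence. To see the raw word pieces rather than decoded text, convert_ids_to_tokens works too; a small check I added (not course code), using a longer word that actually gets split:

ids = tokenizer("Tokenization!").input_ids
# '##' marks a piece that continues the previous one
print(tokenizer.convert_ids_to_tokens(ids))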
We could use a different model by passing its name to from_pretrained, e.g.
"gpt2", "google/flan-t5-small", "bigcode/starcoder2-15b", "Xenova/gpt-4", "microsoft/Phi-3-mini-4k-instruct", "Qwen/Qwen2-VL-7B-Instruct"
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch  # accelerate must also be installed for device_map to work

CPU_cores = 10
torch.set_num_threads(CPU_cores)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cpu", torch_dtype="auto", trust_remote_code=True)
This downloads ~6 GB of model weights.
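Printing the model shows the structure that the forward-pass experiment below relies on: the token embeddings and decoder stack under model.model, with model.lm_head on top.

print(model)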
# Create a pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=50,
    do_sample=False,
    use_cache=False,
)
I've added use_cache=False; otherwise generation fails with "'DynamicCache' object has no attribute 'get_max_length'" (the model's remote code appears to predate a cache API change in newer transformers releases).
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened. "
output = generator(prompt)
print(output[0]['generated_text'])
This can take ~5 minutes using 30 CPU cores. The output is:
Email to Sarah:
Subject: Sincere Apologies for the Gardening Mishap
Dear Sarah,
I hope this message finds you well. I am writing to express my deepest ap
The output is cut off because max_new_tokens=50 caps generation at 50 new tokens.
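To get the complete email, the cap can simply be raised at the cost of a proportionally longer run; a minimal sketch reusing the same settings:

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=300,  # large enough for a full email
    do_sample=False,
    use_cache=False,
)
print(generator(prompt)[0]['generated_text'])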
prompt = "The capital of France is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
model_output = model.model(input_ids)
print(model_output[0].shape)
lm_head_output = model.lm_head(model_output[0])
print(lm_head_output.shape)
token_id = lm_head_output[0,-1].argmax(-1)
print("token_id is ", token_id)
tokenizer.decode(token_id)
The output is:
torch.Size([1, 5, 3072])
torch.Size([1, 5, 32064])
token_id is tensor(3681)
'Paris'
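That argmax is exactly one step of greedy decoding. To see which other tokens the model considered, softmax the logits at the last position and take the top few (my own addition, not from the course):

probs = torch.softmax(lm_head_output[0, -1], dim=-1)
top = torch.topk(probs, k=5)
for p, i in zip(top.values, top.indices):
    print(f"{tokenizer.decode(i)!r}: {p.item():.3f}")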