Using the Llama 2 13B chat model from Hugging Face with LangChain
from torch import cuda, bfloat16
from langchain.llms import HuggingFacePipeline
import transformers
model_id = 'meta-llama/Llama-2-13b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
# begin initializing HF items, need auth token for these
hf_auth = 'your_hugging_face_token'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
# the tokenizer is required by the text-generation pipeline below
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()
print(f"Model loaded on {device}")
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this, output begins repeating
)
llm = HuggingFacePipeline(pipeline=generate_text)
llm(prompt="What is the capital of India?")
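Because HuggingFacePipeline behaves like any other LangChain LLM, it can also be composed into a chain. A minimal sketch, assuming the `llm` object above; the prompt template and the `question` input variable are illustrative, not part of the original gist:

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# hypothetical prompt template for demonstration
prompt = PromptTemplate(
    input_variables=["question"],
    template="Answer the following question concisely:\n{question}"
)

chain = LLMChain(llm=llm, prompt=prompt)
print(chain.run(question="What is the capital of India?"))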