Last active
December 14, 2023 01:13
-
-
Save Frank-Buss/173fab1fc935b6d184b4b8454c254b06 to your computer and use it in GitHub Desktop.
Sample script showing how to run the uncensored WizardLM LLM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# Test script for the uncensored WizardLM model:
# https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGUF
#
# Tested on Mac Mini M1, with 16 GB RAM. Needs some libraries:
#
# pip install torch transformers accelerate bitsandbytes
# CT_METAL=1 pip install ctransformers --no-binary ctransformers
#
# On CUDA systems, use this to install ctransformers:
# pip install ctransformers[cuda]
#
# It generates about 10 tokens per second on a Mac.
# On Windows with a RTX3080, about 20 tokens per second.
from ctransformers import AutoModelForCausalLM

# Load the quantized WizardLM model. Up to 50 layers are offloaded to the
# GPU (Metal or CUDA, depending on how ctransformers was installed), and
# the context window is widened so longer prompts fit.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/WizardLM-7B-uncensored-GGUF",
    model_file="WizardLM-7B-uncensored.Q4_K_M.gguf",
    model_type="llama",
    gpu_layers=50,
    max_new_tokens=1000,
    context_length=6000,
)
def generate_and_print(llm, tokens):
    """Stream the model's completion for *tokens* to stdout, one token at
    a time, so output appears incrementally.

    Pressing Ctrl-C (KeyboardInterrupt) aborts the stream without exiting
    the program, letting the caller ask for a new prompt.
    """
    detok = llm.detokenize
    try:
        for tok in llm.generate(tokens):
            print(detok(tok), end='', flush=True)
    except KeyboardInterrupt:
        print("\nOutput interrupted by user. Enter a new prompt.")
# Interactive loop: read a prompt, stream the model's reply, repeat.
# Typing "exit" ends the program; Ctrl-C at the prompt does the same.
while True:
    try:
        user_input = input("\nEnter your prompt (ctrl-c to exit) : ")
        if user_input.lower() == 'exit':
            break
        generate_and_print(llm, llm.tokenize(user_input))
    except KeyboardInterrupt:
        print("\nProgram interrupted by user. Exiting...")
        break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment