Last active
December 14, 2023 01:13
-
-
Save Frank-Buss/173fab1fc935b6d184b4b8454c254b06 to your computer and use it in GitHub Desktop.
Sample script showing how to run the uncensored WizardLM LLM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# Test script for the uncensored WizardLM model:
# https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGUF
#
# Tested on Mac Mini M1, with 16 GB RAM. Needs some libraries:
#
# pip install torch transformers accelerate bitsandbytes
# CT_METAL=1 pip install ctransformers --no-binary ctransformers
#
# On CUDA systems, use this to install ctransformers:
# pip install ctransformers[cuda]
#
# It generates about 10 tokens per second on a Mac.
# On Windows with a RTX3080, about 20 tokens per second.
from ctransformers import AutoModelForCausalLM

# Load the quantized WizardLM model. Up to 50 layers are offloaded to the
# GPU (Metal or CUDA, depending on how ctransformers was installed), and
# the context window is widened so longer prompts fit.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/WizardLM-7B-uncensored-GGUF",
    model_file="WizardLM-7B-uncensored.Q4_K_M.gguf",
    model_type="llama",
    gpu_layers=50,
    max_new_tokens=1000,
    context_length=6000,
)
def generate_and_print(llm, tokens):
    """Stream the model's completion for *tokens* to stdout, one token at
    a time, so output appears incrementally.

    Pressing Ctrl-C (KeyboardInterrupt) aborts the stream without exiting
    the program, letting the caller ask for a new prompt.
    """
    detok = llm.detokenize
    try:
        for tok in llm.generate(tokens):
            print(detok(tok), end='', flush=True)
    except KeyboardInterrupt:
        print("\nOutput interrupted by user. Enter a new prompt.")
# Interactive loop: read a prompt, stream the model's reply, repeat.
# Typing "exit" ends the program; Ctrl-C at the prompt does the same.
while True:
    try:
        user_input = input("\nEnter your prompt (ctrl-c to exit) : ")
        if user_input.lower() == 'exit':
            break
        generate_and_print(llm, llm.tokenize(user_input))
    except KeyboardInterrupt:
        print("\nProgram interrupted by user. Exiting...")
        break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment