Create LLM slices at runtime with exllamav2
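The script below loads an exl2-quantized model, rebuilds its module list in a rearranged (and partly duplicated) layer order, re-creates the cache, and streams a chat completion from the resulting slice.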
# to use this, first install python and exllamav2 (https://github.com/turboderp/exllamav2)
# load a model, rearrange the layers as you like, set generation parameters, and run it
# duplicate layers share tensors, but still need extra memory for the cache
# thanks to @dnhkng for showing that the cache needs to be re-created
# licensed under WTFPL (http://www.wtfpl.net/about/) - Silphendio
from exllamav2 import *
from exllamav2.generator import *
import sys, torch
from copy import copy

config = ExLlamaV2Config()
config.model_dir = "./TinyLlama-1.1B-Chat-v1.0-5.0bpw-h6-exl2"
config.prepare()

model = ExLlamaV2(config)
cache = ExLlamaV2Cache_8bit(model, lazy = True)
print("Loading model...")
model.load_autosplit(cache)

tokenizer = ExLlamaV2Tokenizer(config)
gen_settings = ExLlamaV2Sampler.Settings()
## mix layers here
layer_arrangement = list(range(0,14)) + list(range(8,22))
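# layers 0-13 followed by layers 8-21, so layers 8-13 run twice (a simple passthrough-style self-merge)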
# module arrangement: [embedding, [...layers], rms-norm, head]
# where each layer is [attention, mlp]
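# i.e. the attention block of original layer idx sits at modules[idx*2 + 1] and its mlp at modules[idx*2 + 2]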
old_modules = model.modules
model.modules = old_modules[:1] # keep only the embedding module for now
for i, idx in enumerate(layer_arrangement):
    model.modules += [copy(old_modules[idx*2 + 1])]
    model.modules[-1].layer_idx = i # for duplicate layers to use a different cache
    model.modules += [old_modules[idx*2 + 2]]
model.modules += old_modules[-2:] # re-attach rms-norm and head
model.head_layer_idx = len(model.modules) - 1
model.config.num_hidden_layers = len(layer_arrangement)
model.last_kv_layer_idx = len(model.modules) - 4
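# head_layer_idx points at the lm head (the last module); last_kv_layer_idx at the final attention block (modules[-4])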
print('Re-creating cache')
del cache
model.cache_map = {}
model.set_cache_map()
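# the cache map must be rebuilt to match the new module list before allocating a fresh cache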
cache = ExLlamaV2Cache_8bit(model)
# this needs to be re-created after rearranging layers
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
generator.set_stop_conditions([tokenizer.eos_token_id])
## mix layers end

# adjust generation settings
gen_settings.temperature = 0.0 # for deterministic results
#gen_settings.top_k = 50
#gen_settings.top_p = 0.8
#gen_settings.min_p = 0

max_response_length = 512

print("starting generation")
text = """<|system|> | |
You are a chatbot who can help code!</s> | |
<|user|> | |
Write me a python script to blink an LED on a raspberry PI.</s> | |
<|assistant|>""" | |
print("\n" + text, end="") | |
instruction_ids = tokenizer.encode(text, add_bos = True) | |
context_ids = instruction_ids if generator.sequence_ids is None \ | |
else torch.cat([generator.sequence_ids, instruction_ids], dim = -1) | |
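# sequence_ids is None on the first turn; on later calls the new prompt is appended to the running context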
generator.begin_stream(context_ids, gen_settings)

for _ in range(max_response_length):
    chunk, eos, _ = generator.stream()
    if eos: break
    text += chunk
    if text.endswith("<|user|>"):
        break
    print(chunk, end = "")
    sys.stdout.flush()
text += "\n"
# cleanup
model.modules = old_modules # restore the original layer order before unloading
model.unload()
del cache
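To try a different slice, only layer_arrangement needs to change: repeating a span of indices duplicates those blocks at inference time (with the extra cache memory noted above), while e.g. list(range(0, 22)) should reproduce TinyLlama's original 22-layer stack.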