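# Run the precompiled IREE vmfb of Vicuna-7B (fp32, CPU) on saved input
# tensors using the IREE Python runtime.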
from iree import runtime as ireert
from iree.compiler import compile_str
import numpy as np
import os
# Load the precompiled IREE flatbuffer (vmfb) for Vicuna fp32 on CPU.
with open(os.path.join("vicuna_fp32_cpu.vmfb"), "rb") as vmfb_file:
    flatbuffer_blob = vmfb_file.read()

# Compilation settings (only needed when compiling from MLIR with
# compile_str instead of loading the prebuilt vmfb).
backend = "llvm-cpu"
args = ["--iree-llvmcpu-target-cpu-features=host"]
backend_config = "local-task"
# flatbuffer_blob = compile_str(bytecode, target_backends=[backend], extra_args=args)

# Set up the IREE runtime context and register the compiled module.
config = ireert.Config("local-sync")
vm_module = ireert.VmModule.from_flatbuffer(config.vm_instance, flatbuffer_blob)
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
complex_compiled = ctx.modules.module

# Run the exported forward function on previously saved input tensors
# (input_ids and attention_mask dumped as .npy files by the script below).
input1 = np.load("inp1.npy")
input2 = np.load("inp2.npy")
x = complex_compiled.forward(input1, input2)
print(x.to_host())
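
# The following is a separate script: it tokenizes a prompt, (optionally)
# exports the Vicuna model to MLIR, and runs inference through SHARK.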
import torch
import torch_mlir
from shark.shark_importer import import_with_fx
import os
import sys
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)
def compile_via_shark(model, inputs):
    # MLIR export path (disabled): trace the model with FX and dump the
    # bytecode to disk.
    # input_mask = [False, False]
    # bytecode = import_with_fx(model, inputs)
    # with open(os.path.join("vicuna_fp32.mlir"), "wb") as mlir_file:
    #     mlir_file.write(bytecode[0])
    from shark.shark_inference import SharkInference

    # No MLIR is passed here; the module is loaded from a precompiled vmfb below.
    shark_module = SharkInference(
        mlir_module="", device="cpu", mlir_dialect="tm_tensor",
    )
    # extra_args = ['--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))', '--iree-spirv-index-bits=64']
    # shark_module.save_module(module_name="vicuna_fp32_cuda")
    shark_module.load_module(path="vicuna_fp32_cpu.vmfb")
    # shark_module.compile(extra_args=[])
    return shark_module
tokenizer = AutoTokenizer.from_pretrained(
    "TheBloke/vicuna-7B-1.1-HF",
    use_fast=False,
)
class StopOnTokens(StoppingCriteria):
    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # Stop generation once the most recently generated token is a stop id.
        stop_ids = [50278, 50279, 50277, 1, 0]
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False
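
# StoppingCriteriaList is imported but not used above. With a regular
# HuggingFace model it would plug into generation roughly like this
# (sketch only; `hf_model` is a hypothetical AutoModelForCausalLM instance):
# stopping_criteria = StoppingCriteriaList([StopOnTokens()])
# hf_model.generate(**inputs, stopping_criteria=stopping_criteria, max_new_tokens=64)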
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
prompt = f"{system_prompt}<|USER|>What's your mood today?<|ASSISTANT|>"
inputs = tokenizer(prompt, return_tensors="pt")
inputs_model = (inputs["input_ids"], inputs["attention_mask"])
# Debug helpers (disabled): print the input shapes and dump the tokenized
# inputs as inp1.npy / inp2.npy so the IREE runtime script above can replay them.
# print(inputs_model[0].shape)
# print(inputs_model[1].shape)
# import numpy as np
# np.save("inp1.npy", inputs_model[0].numpy())
# np.save("inp2.npy", inputs_model[1].numpy())
# import sys
# sys.exit()
# Reference PyTorch path (disabled): wrap the HF model so forward returns
# only the logits, then run it for comparison with the SHARK output.
# class SLM(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.model = AutoModelForCausalLM.from_pretrained(
#             "TheBloke/vicuna-7B-1.1-HF"
#         )
#
#     def forward(self, input_ids, attention_mask):
#         return self.model(input_ids, attention_mask)[0]
#
# slm_model = SLM()
# res_pytorch = slm_model(inputs_model[0], inputs_model[1])
# Load the precompiled SHARK module and run the forward pass on the
# tokenized inputs.
shark_vicuna = compile_via_shark("", inputs_model)
# output_torch = slm_model(inputs_model[0], inputs_model[1])
# print(output_torch)
output_shark = shark_vicuna("forward", (inputs_model[0].numpy(), inputs_model[1].numpy()))
print(output_shark)
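
# Sketch (assumption: output_shark holds the model's [batch, seq_len, vocab]
# next-token logits as a numpy array): greedily pick the most likely next
# token and decode it with the tokenizer.
# import numpy as np
# logits = torch.from_numpy(np.asarray(output_shark))
# next_token_id = int(torch.argmax(logits[0, -1]))
# print(tokenizer.decode([next_token_id]))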