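# Run the precompiled IREE vmfb of Vicuna-7B (fp32, CPU) on saved input
# tensors using the IREE Python runtime.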
from iree import runtime as ireert
from iree.compiler import compile_str
import numpy as np
import os
# Load the precompiled IREE flatbuffer (vmfb) for Vicuna fp32 on CPU.
with open(os.path.join("vicuna_fp32_cpu.vmfb"), "rb") as vmfb_file:
    flatbuffer_blob = vmfb_file.read()

# Compilation settings (only needed when compiling from MLIR with
# compile_str instead of loading the prebuilt vmfb).
backend = "llvm-cpu"
args = ["--iree-llvmcpu-target-cpu-features=host"]
backend_config = "local-task"
# flatbuffer_blob = compile_str(bytecode, target_backends=[backend], extra_args=args)

# Set up the IREE runtime context and register the compiled module.
config = ireert.Config("local-sync")
vm_module = ireert.VmModule.from_flatbuffer(config.vm_instance, flatbuffer_blob)
ctx = ireert.SystemContext(config=config)
ctx.add_vm_module(vm_module)
complex_compiled = ctx.modules.module

# Run the exported forward function on previously saved input tensors
# (input_ids and attention_mask dumped as .npy files by the script below).
input1 = np.load("inp1.npy")
input2 = np.load("inp2.npy")
x = complex_compiled.forward(input1, input2)
print(x.to_host())
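
# The following is a separate script: it tokenizes a prompt, (optionally)
# exports the Vicuna model to MLIR, and runs inference through SHARK.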
import torch
import torch_mlir
from shark.shark_importer import import_with_fx
import os
import sys
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)
def compile_via_shark(model, inputs):
    # MLIR export path (disabled): trace the model with FX and dump the
    # bytecode to disk.
    # input_mask = [False, False]
    # bytecode = import_with_fx(model, inputs)
    # with open(os.path.join("vicuna_fp32.mlir"), "wb") as mlir_file:
    #     mlir_file.write(bytecode[0])
    from shark.shark_inference import SharkInference

    # No MLIR is passed here; the module is loaded from a precompiled vmfb below.
    shark_module = SharkInference(
        mlir_module="", device="cpu", mlir_dialect="tm_tensor",
    )
    # extra_args = ['--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))', '--iree-spirv-index-bits=64']
    # shark_module.save_module(module_name="vicuna_fp32_cuda")
    shark_module.load_module(path="vicuna_fp32_cpu.vmfb")
    # shark_module.compile(extra_args=[])
    return shark_module
tokenizer = AutoTokenizer.from_pretrained(
    "TheBloke/vicuna-7B-1.1-HF",
    use_fast=False,
)
class StopOnTokens(StoppingCriteria):
    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # Stop generation once the most recently generated token is a stop id.
        stop_ids = [50278, 50279, 50277, 1, 0]
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False
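
# StoppingCriteriaList is imported but not used above. With a regular
# HuggingFace model it would plug into generation roughly like this
# (sketch only; `hf_model` is a hypothetical AutoModelForCausalLM instance):
# stopping_criteria = StoppingCriteriaList([StopOnTokens()])
# hf_model.generate(**inputs, stopping_criteria=stopping_criteria, max_new_tokens=64)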
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
prompt = f"{system_prompt}<|USER|>What's your mood today?<|ASSISTANT|>"
inputs = tokenizer(prompt, return_tensors="pt")
inputs_model = (inputs["input_ids"], inputs["attention_mask"])
# Debug helpers (disabled): print the input shapes and dump the tokenized
# inputs as inp1.npy / inp2.npy so the IREE runtime script above can replay them.
# print(inputs_model[0].shape)
# print(inputs_model[1].shape)
# import numpy as np
# np.save("inp1.npy", inputs_model[0].numpy())
# np.save("inp2.npy", inputs_model[1].numpy())
# import sys
# sys.exit()
# Reference PyTorch path (disabled): wrap the HF model so forward returns
# only the logits, then run it for comparison with the SHARK output.
# class SLM(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.model = AutoModelForCausalLM.from_pretrained(
#             "TheBloke/vicuna-7B-1.1-HF"
#         )
#
#     def forward(self, input_ids, attention_mask):
#         return self.model(input_ids, attention_mask)[0]
#
# slm_model = SLM()
# res_pytorch = slm_model(inputs_model[0], inputs_model[1])
# Load the precompiled SHARK module and run the forward pass on the
# tokenized inputs.
shark_vicuna = compile_via_shark("", inputs_model)
# output_torch = slm_model(inputs_model[0], inputs_model[1])
# print(output_torch)
output_shark = shark_vicuna("forward", (inputs_model[0].numpy(), inputs_model[1].numpy()))
print(output_shark)
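
# Sketch (assumption: output_shark holds the model's [batch, seq_len, vocab]
# next-token logits as a numpy array): greedily pick the most likely next
# token and decode it with the tokenizer.
# import numpy as np
# logits = torch.from_numpy(np.asarray(output_shark))
# next_token_id = int(torch.argmax(logits[0, -1]))
# print(tokenizer.decode([next_token_id]))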