bloom_fp16.py
import torch
from transformers import AutoModelForSequenceClassification


class HuggingFaceLanguage(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "bigscience/bloom-560m",  # The pretrained model.
            num_labels=2,  # The number of output labels: 2 for binary classification.
            output_attentions=False,  # Whether the model returns attention weights.
            output_hidden_states=False,  # Whether the model returns all hidden states.
            torchscript=True,
        )

    def forward(self, tokens):
        return self.model.forward(tokens)[0]


torch.manual_seed(0)
model = HuggingFaceLanguage()
test_input = torch.randint(2, (1, 128))
actual_out = model(test_input)
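# Added (a small sketch, not in the original gist): print the fp32 baseline here;
# this is the "golden_out" value that the result comments further down refer to.
print("golden_out: ", actual_out)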
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions

fx_g = make_fx(
    model,
    decomposition_table=get_decompositions(
        [
            torch.ops.aten.split.Tensor,
            torch.ops.aten.split_with_sizes,
        ]
    ),
)(test_input)
# print(fx_g.graph)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()


def strip_overloads(gm):
    """
    Modifies the target of graph nodes in :attr:`gm` to strip overloads.

    Args:
        gm (fx.GraphModule): The input FX graph module to be modified.
    """
    for node in gm.graph.nodes:
        if isinstance(node.target, torch._ops.OpOverload):
            node.target = node.target.overloadpacket
    gm.recompile()


strip_overloads(fx_g)
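# Added sanity check (a sketch, not in the original gist): after decomposition and
# overload stripping, the FX module should still reproduce the eager fp32 output.
print("fx graph matches eager: ", torch.allclose(fx_g(test_input), actual_out, atol=1e-4))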
# Check whether the module is symbolically traceable:
# https://pytorch.org/docs/stable/fx.html#module-torch.fx
from torch.fx import symbolic_trace

# Symbolic tracing frontend - captures the semantics of the module
symbolic_traced: torch.fx.GraphModule = symbolic_trace(fx_g)
# High-level intermediate representation (IR) - Graph representation
print(symbolic_traced.graph)  # the graph prints successfully, so it is symbolically traceable
# QUANTIZATION SETUP
model = fx_g

# # test1: static quantization SUCCESS
# # golden_out VS shark_out
# # tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[ 7.2043, -17.0265]])
# model.eval()
# model.qconfig = torch.quantization.qconfig.float16_static_qconfig
#
# # Prepare the model for static quantization. This inserts observers in
# # the model that will observe activation tensors during calibration.
# model_fp32_prepared = torch.quantization.prepare(model)
#
# # Calibrate the prepared model to determine quantization parameters for activations.
# # In a real-world setting, the calibration would be done with a representative dataset.
# model_fp32_prepared(test_input)
#
# # Convert the observed model to a quantized model. This does several things:
# # quantizes the weights, computes and stores the scale and bias value to be
# # used with each activation tensor, and replaces key operators with quantized
# # implementations.
# model_fp16 = torch.quantization.convert(model_fp32_prepared)
#
# # Run the model; relevant calculations will happen in fp16.
# res = model_fp16(test_input)
# print("res: ", res)
# fx_g = model_fp16
# print("model_fp16: ", model_fp16.graph)
# # test2: dynamic quantization SUCCESS
# # golden_out VS shark_out
# # tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[ 7.2043, -17.0265]])
# model.eval()
# model_fp16 = torch.quantization.quantize_dynamic(
#     model,                # the original model
#     {torch.nn.Linear},    # a set of layers to dynamically quantize
#     dtype=torch.float16)  # the target dtype for quantized weights
# # res: tensor([[ 7.2050, -17.0270]])
# # Run the model; relevant calculations will happen in fp16.
# res = model_fp16(test_input)
# print("res: ", res)
# fx_g = model_fp16
# test3: quantize_fx SUCCESS, but shark_out varies across runs
# golden_out VS shark_out
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[-5.0346, -3.0476]])
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[-8.9322, -2.2289]])
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[-7.9967, -6.8403]])
import torch.quantization.quantize_fx as quantize_fx

# import copy
#
# model_to_quantize = copy.deepcopy(model)
# code from https://github.com/pytorch/pytorch/blob/master/test/quantization/fx/test_quantize_fx.py#:~:text=qconfig_mapping%20%3D%20get_default_qconfig_mapping().set_global(float16_static_qconfig)
qconfig_dict = {"": torch.quantization.qconfig.float16_static_qconfig}  # unused; prepare_fx below takes the QConfigMapping instead
qconfig_mapping_ = torch.ao.quantization.get_default_qconfig_mapping().set_global(
    torch.quantization.qconfig.float16_static_qconfig
)
model.eval()
# prepare
test_input.to(dtype=torch.float16)  # no-op: the result is never assigned, and the token ids must stay integral anyway
model_prepared = quantize_fx.prepare_fx(model, qconfig_mapping=qconfig_mapping_, example_inputs=(test_input,))
# calibrate (not shown)
# quantize
model_fp16 = quantize_fx.convert_fx(model_prepared)
# Run the model; relevant calculations will happen in fp16.
res = model_fp16(test_input)
print("res.dtype: ", res.dtype)  # res.dtype: torch.float32
fx_g = model_fp16
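# Added sketch (an assumption, not in the original gist): compare the converted
# module's eager output against the fp32 baseline before handing the graph to
# SHARK, to see how much error the fp16 conversion alone introduces.
print("max abs diff vs fp32 baseline: ", (res - actual_out).abs().max().item())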
# print("model_fp16: ", model_fp16.graph) | |
# print("########model_fp16: ", model_fp16) | |
# model_fp16.print_readable() | |
# print("model_fp16: ", model_fp16.code) | |
# Maybe try to print the weight with fp16_repr() ???? | |
# https://discuss.pytorch.org/t/how-to-print-the-weight-after-torch-quantization-convert/161801 | |
# model_fp16.layer[0].weight().int_repr().data | |
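# A minimal inspection sketch (an assumption, not part of the original flow): the
# thread above uses int_repr() for int8 models; for this fp16 path, dumping each
# state_dict tensor's dtype shows which weights were actually converted.
for name, tensor in list(model_fp16.state_dict().items())[:10]:  # first few entries only
    print(name, tensor.dtype)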
ts_g = torch.jit.script(fx_g)
# print("ts_g: ", ts_g.graph)

# import torch_mlir
# module = torch_mlir.compile(
#     ts_g, [test_input], torch_mlir.OutputType.LINALG_ON_TENSORS, use_tracing=True, verbose=True
# )
# # import pdb
# # pdb.set_trace()
# # module.dump()
# from shark.shark_inference import SharkInference
# mlir_model = module
# func_name = "forward"
# shark_module = SharkInference(
#     mlir_model, func_name, device="cpu", mlir_dialect="tm_tensor"
# )
# shark_module.compile()
# def shark_result(x):
#     x_ny = x.detach().numpy()
#     inputs = (x_ny,)
#     result = shark_module.forward(inputs)
#     return torch.from_numpy(result)
# observed_out = shark_result(test_input)
# print(actual_out, observed_out)
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[3.5580e-33, 0.0000e+00]])
# QUANTIZATION RESULTS (golden_out VS shark_out)
# test1: static quantization:
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[ 7.2043, -17.0265]])
# test2: dynamic quantization:
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[ 7.2043, -17.0265]])
# test3: quantize_fx:
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[-5.0346, -3.0476]])
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[-8.9322, -2.2289]])
# tensor([[ 7.2041, -17.0263]], grad_fn=<IndexBackward0>) tensor([[-7.9967, -6.8403]])