Skip to content

Instantly share code, notes, and snippets.

View pashu123's full-sized avatar
😇
Working from home

Prashant Kumar pashu123

😇
Working from home
View GitHub Profile
.text
.intel_syntax noprefix
.file "broadcast_dispatch_0"
.section .text.broadcast_dispatch_0_generic_Dx8640x3200_f16,"ax",@progbits
.p2align 4, 0x90
.type broadcast_dispatch_0_generic_Dx8640x3200_f16,@function
broadcast_dispatch_0_generic_Dx8640x3200_f16:
.Lfunc_begin0:
.file 1 "-"
.loc 1 1 0
import math
def fma(a, b, c):
    """Return ``a * b + c`` (a plain multiply-add; not a fused FMA)."""
    product = a * b
    return product + c
def asin_core(a):
s = a * a
q = s * s
r = 5.5579749017470502e-2
t = -6.2027913464120114e-2
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.constant_8192_64_torch.complex64 = dense_resource<__auto.constant_8192_64_torch.complex64> : tensor<8192x64xcomplex<f32>>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_o
module {
// Embedding lookup for a batch of 4 variable-length token-id sequences:
// upcast the f16 embedding table to f32, then gather rows by token id.
func.func @decode_bs4(%arg0: !torch.vtensor<[4,?],si64>, %arg1: !torch.vtensor<[128256,4096],f16>) -> !torch.vtensor<[4,?,4096],f32> {
// Boolean flags passed to aten.embedding (both false).
%false = torch.constant.bool false
%false_0 = torch.constant.bool false
// -1 is the third (padding-index) operand of aten.embedding below;
// 6 is the torch dtype code — the convert produces f32, per its result type.
%int-1 = torch.constant.int -1
%int6 = torch.constant.int 6
// Upcast the whole [128256,4096] table from f16 to f32 before the gather.
%0 = torch.prims.convert_element_type %arg1, %int6 : !torch.vtensor<[128256,4096],f16>, !torch.int -> !torch.vtensor<[128256,4096],f32>
// Row gather: result is one 4096-wide f32 row per token id in %arg0.
%1 = torch.aten.embedding %0, %arg0, %int-1, %false_0, %false : !torch.vtensor<[128256,4096],f32>, !torch.vtensor<[4,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[4,?,4096],f32>
return %1 : !torch.vtensor<[4,?,4096],f32>
}
import math
def fma(a, b, c):
    """Multiply-add: computes a*b + c with ordinary (unfused) arithmetic."""
    return c + a * b
def asin_core(a):
s = a * a
q = s * s
r = 5.5579749017470502e-2
t = -6.2027913464120114e-2
[-1.0, -0.9591836734693877, -0.9183673469387755, -0.8775510204081632, -0.8367346938775511, -0.7959183673469388, -0.7551020408163265, -0.7142857142857143, -0.6734693877551021, -0.6326530612244898, -0.5918367346938775, -0.5510204081632653, -0.5102040816326531, -0.4693877551020408, -0.4285714285714286, -0.3877551020408163, -0.34693877551020413, -0.30612244897959184, -0.26530612244897955, -0.22448979591836737, -0.18367346938775508, -0.1428571428571429, -0.10204081632653061, -0.061224489795918324, -0.020408163265306145, 0.020408163265306145, 0.061224489795918435, 0.1020408163265305, 0.1428571428571428, 0.18367346938775508, 0.22448979591836737, 0.26530612244897966, 0.30612244897959173, 0.346938775510204, 0.3877551020408163, 0.4285714285714286, 0.4693877551020409, 0.510204081632653, 0.5510204081632653, 0.5918367346938775, 0.6326530612244898, 0.6734693877551021, 0.7142857142857142, 0.7551020408163265, 0.7959183673469388, 0.8367346938775511, 0.8775510204081634, 0.9183673469387754, 0.9591836734693877, 1.0]
Actual: 3.14
This file has been truncated, but you can view the full file.
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
func.func @torch.prims.convert_element_type$fold(%arg0: !torch.vtensor<[4,?,32,100],f32>, %arg1: !torch.vtensor<[4,?],si64>) -> !torch.vtensor<[4,?,32,50,2],f32> {
%int4 = torch.constant.int 4
%int32 = torch.constant.int 32
%int-1 = torch.constant.int -1
%int2 = torch.constant.int 2
%int1 = torch.constant.int 1
// Broadcasting add of two boolean (i1) masks: [1,1,?,?] + [4,1,1,?]
// broadcasts to [4,1,?,?]. NOTE(review): on i1 operands aten.add.Tensor
// presumably acts as a logical OR — confirm against the lowering.
func.func @torch_add(%arg0: !torch.vtensor<[1,1,?,?],i1>, %arg1: !torch.vtensor<[4,1,1,?],i1>) -> !torch.vtensor<[4, 1, ?, ?],i1> {
// alpha operand of add.Tensor (scales the rhs); 1 means plain addition.
%int1 = torch.constant.int 1
%2 = torch.aten.add.Tensor %arg0, %arg1, %int1 : !torch.vtensor<[1,1,?,?],i1>, !torch.vtensor<[4,1,1,?],i1>, !torch.int -> !torch.vtensor<[4,1,?,?],i1>
return %2 : !torch.vtensor<[4,1,?,?],i1>
}
# Command-line front end for a parameter-dtype conversion script.
# Three positional arguments: source MLIR file, target dtype, output irpa path.
import argparse
import re  # NOTE(review): `re` is not used in this excerpt; presumably used below the truncation.
parser = argparse.ArgumentParser(description='Convert parameter data type')
parser.add_argument('mlir', type=str, help='MLIR file where all parameters are mentioned')
parser.add_argument('dtype', type=str, help='Required data type of parameters')
parser.add_argument('irpa', type=str, help='destination irpa file')
# Parses sys.argv at module level; `args` is consumed by code past this view.
args = parser.parse_args()
func.func @img2col(%arg0: tensor<128x1026x1026xf32>) -> tensor<128x3x3x1024x1024xbf16> {
%0 = tensor.empty() : tensor<128x3x3x1024x1024xbf16>
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c128 = arith.constant 128 : index
%c1024 = arith.constant 1024 : index
%1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x3x3x1024x1024xbf16>) {
%2 = scf.for %arg3 = %c0 to %c1024 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x3x3x1024x1024xbf16>) {
%3 = scf.for %arg5 = %c0 to %c1024 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x3x3x1024x1024xbf16>) {