@0312birdzhang, created October 29, 2025 01:47
GPU vs QCOM
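
The two runs below compile openpilot's driving_policy.onnx with tinygrad's examples/openpilot/compile3.py on a comma device, first with the QCOM backend (QCOM=1, tinygrad's direct Qualcomm/Adreno runtime, ops_qcom) and then with the OpenCL backend (GPU=1), both with IMAGE=0. The numbers to compare are the steady-state "enqueue" and "total run" timings at the end of each transcript.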
comma@comma-37975f2f:/data/openpilot/tinygrad_repo$ PYTHONPATH="." QCOM=1 IMAGE=0 python3 examples/openpilot/compile3.py ../selfdrive/modeld/models/driving_policy.onnx
loaded model
created tensors
run 0
/data/openpilot/tinygrad_repo/extra/onnx.py:375: UserWarning: input desire has mismatch on dtype. Expected dtypes.half, received dtypes.float.
if tensor.dtype is not spec.dtype: warnings.warn(f"input {name} has mismatch on dtype. Expected {spec.dtype}, received {tensor.dtype}.")
/data/openpilot/tinygrad_repo/extra/onnx.py:375: UserWarning: input traffic_convention has mismatch on dtype. Expected dtypes.half, received dtypes.float.
if tensor.dtype is not spec.dtype: warnings.warn(f"input {name} has mismatch on dtype. Expected {spec.dtype}, received {tensor.dtype}.")
/data/openpilot/tinygrad_repo/extra/onnx.py:375: UserWarning: input features_buffer has mismatch on dtype. Expected dtypes.half, received dtypes.float.
if tensor.dtype is not spec.dtype: warnings.warn(f"input {name} has mismatch on dtype. Expected {spec.dtype}, received {tensor.dtype}.")
scheduled 141 kernels in 703.91 ms
run 1
scheduled 53 kernels in 409.48 ms
JIT captured 53 kernels with 3 inputs
pruned from 53 -> 38 kernels
JIT memory reduced from 0.00 MB -> 0.01 MB, 6 -> 1 bufs
run 2
JIT GRAPHing batch with 35 kernels on device <tinygrad.runtime.ops_qcom.QCOMDevice object at 0x7f8840fbf0>
*** QCOM 1 copy 800, QCOM <- NPY arg 2 mem 0.03 GB tm 155.31us/ 0.16ms ( 0.00 GFLOPS 0.0|0.0 GB/s)
*** QCOM 2 copy 8, QCOM <- NPY arg 2 mem 0.03 GB tm 68.85us/ 0.22ms ( 0.00 GFLOPS 0.0|0.0 GB/s)
*** QCOM 3 copy 51200, QCOM <- NPY arg 2 mem 0.03 GB tm 77.13us/ 0.30ms ( 0.00 GFLOPS 0.7|0.7 GB/s)
*** QCOM 4 <batched 35> arg 3 mem 0.03 GB tm 5905.95us/ 6.21ms ( 13.02 GFLOPS 3.4|33.5 GB/s)
captured 38 kernels
jit run validated
kernel_count=35, read_image_count=0, gated_read_image_count=0
mdl size is 12.34M
pkl size is 19.87M
**** compile done ****
enqueue 48.63 ms -- total run 54.63 ms
enqueue 1.10 ms -- total run 7.02 ms
enqueue 1.02 ms -- total run 6.95 ms
enqueue 1.24 ms -- total run 7.18 ms
enqueue 1.15 ms -- total run 7.05 ms
enqueue 1.00 ms -- total run 6.95 ms
enqueue 1.26 ms -- total run 7.27 ms
enqueue 1.25 ms -- total run 7.15 ms
enqueue 1.04 ms -- total run 6.98 ms
enqueue 1.33 ms -- total run 7.39 ms
enqueue 1.27 ms -- total run 7.26 ms
enqueue 1.31 ms -- total run 7.23 ms
enqueue 1.46 ms -- total run 7.60 ms
enqueue 1.25 ms -- total run 7.36 ms
enqueue 1.39 ms -- total run 7.26 ms
enqueue 1.49 ms -- total run 7.54 ms
enqueue 0.97 ms -- total run 6.97 ms
enqueue 1.06 ms -- total run 7.25 ms
enqueue 1.89 ms -- total run 7.90 ms
enqueue 2.02 ms -- total run 7.67 ms
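
After warm-up, the QCOM backend settles at roughly 1-2 ms of enqueue and about 7 ms total per run. Note the "JIT GRAPHing batch with 35 kernels" line above: the captured JIT is submitted as one batched command stream (the "<batched 35>" entry), so the host does almost no per-kernel work.
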
comma@comma-37975f2f:/data/openpilot/tinygrad_repo$ PYTHONPATH="." GPU=1 IMAGE=0 python3 examples/openpilot/compile3.py ../selfdrive/modeld/models/driving_policy.onnx
loaded model
created tensors
run 0
/data/openpilot/tinygrad_repo/extra/onnx.py:375: UserWarning: input desire has mismatch on dtype. Expected dtypes.half, received dtypes.float.
if tensor.dtype is not spec.dtype: warnings.warn(f"input {name} has mismatch on dtype. Expected {spec.dtype}, received {tensor.dtype}.")
/data/openpilot/tinygrad_repo/extra/onnx.py:375: UserWarning: input traffic_convention has mismatch on dtype. Expected dtypes.half, received dtypes.float.
if tensor.dtype is not spec.dtype: warnings.warn(f"input {name} has mismatch on dtype. Expected {spec.dtype}, received {tensor.dtype}.")
/data/openpilot/tinygrad_repo/extra/onnx.py:375: UserWarning: input features_buffer has mismatch on dtype. Expected dtypes.half, received dtypes.float.
if tensor.dtype is not spec.dtype: warnings.warn(f"input {name} has mismatch on dtype. Expected {spec.dtype}, received {tensor.dtype}.")
scheduled 141 kernels in 693.93 ms
run 1
scheduled 53 kernels in 680.94 ms
JIT captured 53 kernels with 3 inputs
pruned from 53 -> 38 kernels
run 2
jit execs 38 kernels
*** GPU 1 copy 800, GPU <- NPY arg 2 mem 0.03 GB tm 736.42us/ 0.74ms ( 0.00 GFLOPS 0.0|0.0 GB/s)
*** GPU 2 copy 8, GPU <- NPY arg 2 mem 0.03 GB tm 581.72us/ 1.32ms ( 0.00 GFLOPS 0.0|0.0 GB/s)
*** GPU 3 copy 51200, GPU <- NPY arg 2 mem 0.03 GB tm 1287.30us/ 2.61ms ( 0.00 GFLOPS 0.0|0.0 GB/s)
*** GPU 4 E_202 arg 3 mem 0.03 GB tm 6.91us/ 2.61ms ( 0.03 GFLOPS 0.2|0.4 GB/s) ['cat']
*** GPU 5 r_3_128_4_3_25_25 arg 3 mem 0.03 GB tm 28.16us/ 2.64ms ( 33.41 GFLOPS 2.5|6.4 GB/s) ['__getitem__']
*** GPU 6 r_128_4_202 arg 4 mem 0.03 GB tm 99.07us/ 2.74ms ( 2.09 GFLOPS 2.1|3.2 GB/s) ['__matmul__', '__rmul__', '__add__']
*** GPU 7 E_10_128_4 arg 3 mem 0.03 GB tm 9.98us/ 2.75ms ( 1.54 GFLOPS 4.1|6.2 GB/s) ['cat', 'relu']
*** GPU 8 r_10_128_4_128_4 arg 4 mem 0.03 GB tm 297.98us/ 3.05ms ( 17.61 GFLOPS 3.7|44.1 GB/s) ['__add__', 'matmul']
*** GPU 9 r_10_16_32 arg 3 mem 0.03 GB tm 16.13us/ 3.06ms ( 1.44 GFLOPS 1.9|2.6 GB/s) ['layernorm', '__add__', 'relu']
*** GPU 10 r_10_16_32n1 arg 4 mem 0.03 GB tm 15.10us/ 3.08ms ( 2.25 GFLOPS 2.0|2.8 GB/s) ['rsqrt', 'add', 'mean', 'square', 'sub', '__add__', 'relu']
*** GPU 11 E_10_128_4n1 arg 7 mem 0.03 GB tm 11.01us/ 3.09ms ( 3.26 GFLOPS 5.0|9.3 GB/s) ['mul', 'add', 'layernorm', 'sub', '__add__', 'relu']
*** GPU 12 r_10_512_3_128_4 arg 4 mem 0.03 GB tm 722.94us/ 3.81ms ( 21.78 GFLOPS 4.5|58.2 GB/s) ['__add__', 'matmul']
*** GPU 13 E_8_10_16_4 arg 2 mem 0.03 GB tm 12.03us/ 3.82ms ( 0.43 GFLOPS 3.4|3.4 GB/s) ['squeeze']
*** GPU 14 E_128_10_4 arg 2 mem 0.03 GB tm 9.98us/ 3.83ms ( 0.51 GFLOPS 4.1|4.1 GB/s) ['permute']
*** GPU 15 r_8_10_10_16_4 arg 3 mem 0.03 GB tm 58.88us/ 3.89ms ( 5.22 GFLOPS 0.7|22.6 GB/s) ['matmul']
*** GPU 16 r_80_10 arg 2 mem 0.03 GB tm 8.19us/ 3.90ms ( 0.18 GFLOPS 0.4|0.4 GB/s) ['matmul']
*** GPU 17 r_80_10n1 arg 3 mem 0.03 GB tm 11.78us/ 3.91ms ( 0.27 GFLOPS 0.3|0.3 GB/s) ['softmax', 'matmul']
*** GPU 18 E_80_10 arg 4 mem 0.03 GB tm 8.96us/ 3.92ms ( 0.36 GFLOPS 0.8|1.4 GB/s) ['softmax', 'matmul']
*** GPU 19 r_8_10_16_4_10 arg 3 mem 0.03 GB tm 14.85us/ 3.94ms ( 6.55 GFLOPS 5.7|18.6 GB/s) ['matmul']
*** GPU 20 r_10_128_4_128_4n1 arg 6 mem 0.03 GB tm 297.22us/ 4.23ms ( 17.73 GFLOPS 3.8|44.3 GB/s) ['__add__', 'relu', 'matmul']
*** GPU 21 r_10_16_32n2 arg 2 mem 0.03 GB tm 11.01us/ 4.25ms ( 0.71 GFLOPS 1.9|2.9 GB/s) ['layernorm']
*** GPU 22 r_10_16_32n3 arg 3 mem 0.03 GB tm 12.03us/ 4.26ms ( 1.54 GFLOPS 1.7|2.7 GB/s) ['rsqrt', 'add', 'mean', 'square', 'sub']
*** GPU 23 E_10_128_4n2 arg 6 mem 0.03 GB tm 10.75us/ 4.27ms ( 1.90 GFLOPS 4.2|8.6 GB/s) ['mul', 'add', 'layernorm', 'sub']
*** GPU 24 r_10_512_4_128_4 arg 4 mem 0.03 GB tm 938.75us/ 5.21ms ( 22.54 GFLOPS 4.6|56.0 GB/s) ['elu', '__add__', 'matmul']
*** GPU 25 r_10_128_4_512_4 arg 5 mem 0.03 GB tm 1019.14us/ 6.23ms ( 20.59 GFLOPS 4.2|51.5 GB/s) ['__add__', 'matmul']
*** GPU 26 r_1024_16_32 arg 4 mem 0.03 GB tm 466.18us/ 6.69ms ( 2.92 GFLOPS 2.3|9.3 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 27 r_512_16_64 arg 5 mem 0.03 GB tm 462.85us/ 7.16ms ( 2.62 GFLOPS 2.3|8.2 GB/s) ['relu', '__add__', 'transpose', '__matmul__', '__rmul__']
*** GPU 28 r_1024_16_32n1 arg 4 mem 0.03 GB tm 467.20us/ 7.62ms ( 2.91 GFLOPS 2.3|9.3 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 29 r_512_16_64n1 arg 5 mem 0.03 GB tm 463.87us/ 8.09ms ( 2.61 GFLOPS 2.3|8.2 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 30 r_256_16_32 arg 4 mem 0.03 GB tm 124.16us/ 8.21ms ( 2.74 GFLOPS 2.1|8.8 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 31 r_32_16_32 arg 4 mem 0.03 GB tm 25.09us/ 8.24ms ( 1.69 GFLOPS 1.4|5.4 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 32 r_256_16_16 arg 4 mem 0.03 GB tm 70.91us/ 8.31ms ( 2.95 GFLOPS 1.9|9.8 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 33 r_32_16_2 arg 4 mem 0.03 GB tm 7.94us/ 8.31ms ( 1.48 GFLOPS 0.3|5.5 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 34 r_256_16_16n1 arg 5 mem 0.03 GB tm 69.89us/ 8.38ms ( 3.05 GFLOPS 1.9|10.2 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 35 r_32_16_2n1 arg 5 mem 0.03 GB tm 9.98us/ 8.39ms ( 1.23 GFLOPS 0.2|4.6 GB/s) ['relu', '__add__', '__matmul__', '__rmul__']
*** GPU 36 r_990_16_16 arg 5 mem 0.03 GB tm 248.06us/ 8.64ms ( 3.19 GFLOPS 2.1|11.0 GB/s) ['__add__', 'mul', '__matmul__', '__rmul__']
*** GPU 37 r_8_16_2 arg 4 mem 0.03 GB tm 9.22us/ 8.65ms ( 0.29 GFLOPS 0.1|1.2 GB/s) ['__matmul__', '__rmul__', '__add__']
*** GPU 38 E_250_4 arg 4 mem 0.03 GB tm 11.26us/ 8.66ms ( 0.13 GFLOPS 0.7|1.2 GB/s) ['cat']
captured 38 kernels
jit run validated
kernel_count=35, read_image_count=0, gated_read_image_count=0
mdl size is 12.34M
pkl size is 19.86M
**** compile done ****
enqueue 9.76 ms -- total run 13.37 ms
enqueue 7.95 ms -- total run 11.91 ms
enqueue 8.28 ms -- total run 12.10 ms
enqueue 7.61 ms -- total run 11.21 ms
enqueue 7.42 ms -- total run 10.99 ms
enqueue 7.40 ms -- total run 11.01 ms
enqueue 7.64 ms -- total run 11.22 ms
enqueue 7.49 ms -- total run 11.11 ms
enqueue 7.42 ms -- total run 10.99 ms
enqueue 7.94 ms -- total run 11.60 ms
enqueue 7.42 ms -- total run 10.99 ms
enqueue 7.45 ms -- total run 11.07 ms
enqueue 7.38 ms -- total run 11.09 ms
enqueue 7.49 ms -- total run 11.18 ms
enqueue 7.45 ms -- total run 11.01 ms
enqueue 7.82 ms -- total run 11.43 ms
enqueue 7.46 ms -- total run 11.11 ms
enqueue 7.39 ms -- total run 11.02 ms
enqueue 7.47 ms -- total run 11.07 ms
enqueue 7.86 ms -- total run 11.56 ms
<Tensor <UOp GPU (1, 1000) float ShapeTracker(views=(View(shape=(1, 1000), strides=(0, 1), offset=0, mask=None, contiguous=True),))> on GPU with grad None> (1, 1000) float32
**** test done ****
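
Bottom line: on this device the QCOM backend settles at about 7 ms total per run with 1-2 ms of host-side enqueue, while the OpenCL GPU backend settles at about 11 ms total with 7-8 ms of enqueue. Both backends run the same 38 pruned kernels; the difference is that QCOM graphs 35 of them into a single batched submission, whereas the GPU backend enqueues each kernel individually ("jit execs 38 kernels"), so per-kernel dispatch overhead dominates its host time.

Both runs also print the same dtype UserWarning because the test inputs are float32 numpy buffers while the ONNX spec declares desire, traffic_convention and features_buffer as dtypes.half. A minimal sketch of silencing it by casting on the host before wrapping the buffer in a Tensor (the shape here is a made-up placeholder, not the model's real input spec):

import numpy as np
from tinygrad import Tensor, dtypes

# Placeholder shape for illustration only; the real shape comes from
# the ONNX input spec that compile3.py reads out of the model.
desire_np = np.zeros((100, 8), dtype=np.float32)

# Cast on the host so the Tensor dtype already matches the declared
# dtypes.half, and onnx.py's dtype-mismatch warning never fires.
desire = Tensor(desire_np.astype(np.float16))
assert desire.dtype == dtypes.half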