evshiron · June 20, 2023 16:20
diff --git a/unit_test.ac1e551.log b/unit_test.ac1e551.log
 ============================= test session starts ==============================
 platform linux -- Python 3.10.6, pytest-7.3.2, pluggy-1.0.0 -- /usr/bin/python3
 cachedir: .pytest_cache
 rootdir: /root/triton/python
 collecting ... collected 4590 items

 python/test/unit/language/test_core_amd.py::test_empty_kernel[int8] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[int16] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[int32] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[int64] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[uint8] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[uint16] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[uint32] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[uint64] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[float16] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[float32] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[float64] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_empty_kernel[bfloat16] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int8-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int32-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int64-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint8-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint32-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint64-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float32-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float64-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-bfloat16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int8-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int32-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int64-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint8-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint32-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint64-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float32-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float64-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-bfloat16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int8-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int32-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int64-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint8-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint32-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint64-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float16-+] PASSED [  0%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-bfloat16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int8-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint8-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-bfloat16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int8-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint8-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-bfloat16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int8-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint8-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-bfloat16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int8-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int64-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint8-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint16-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint32-+] PASSED [  1%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-bfloat16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int8-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint8-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-bfloat16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int8-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint8-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-bfloat16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int8-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint8-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-bfloat16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int8-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int16-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int32-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int64-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint8-+] PASSED [  2%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint16-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint32-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint64-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float16-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float32-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float64-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-bfloat16-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int8-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int16-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int32-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int64-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint8-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint16-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint32-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint64-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float16-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float32-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float64-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-bfloat16-+] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int8--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int32--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int64--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint8--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint32--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint64--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float32--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float64--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-bfloat16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int8--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int32--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int64--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint8--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint32--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint64--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float32--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float64--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-bfloat16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int8--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int16--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int32--] PASSED [  3%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint8--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-bfloat16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int8--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint8--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-bfloat16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int8--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint8--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-bfloat16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int8--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint8--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float32--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float64--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-bfloat16--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int8--] PASSED [  4%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint8--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-bfloat16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int8--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint8--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-bfloat16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int8--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint8--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-bfloat16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int8--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint8--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float16--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float32--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float64--] PASSED [  5%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-bfloat16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int8--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int32--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int64--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint8--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint32--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint64--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float32--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float64--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-bfloat16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int8--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int32--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int64--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint8--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint32--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint64--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float32--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float64--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-bfloat16--] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int8-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int16-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int32-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int64-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint8-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint16-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint32-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint64-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float16-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float32-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float64-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-bfloat16-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int8-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int16-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int32-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int64-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint8-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint16-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint32-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint64-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float16-*] PASSED [  6%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-bfloat16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int8-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint8-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-bfloat16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int8-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint8-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-bfloat16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int8-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint8-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-bfloat16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int8-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int64-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint8-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint16-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint32-*] PASSED [  7%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-bfloat16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int8-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint8-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-bfloat16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int8-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint8-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-bfloat16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int8-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint8-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-bfloat16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int8-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int16-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int32-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int64-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint8-*] PASSED [  8%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint32-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint64-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float32-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float64-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-bfloat16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int8-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int32-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int64-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint8-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint32-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint64-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float32-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float64-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-bfloat16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int8-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int32-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int64-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint8-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint32-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint64-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float32-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float64-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-bfloat16-*] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int8-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int16-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int32-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int64-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint8-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint16-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint32-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint64-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float16-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float32-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float64-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-bfloat16-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int8-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int16-/] PASSED [  9%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint8-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-bfloat16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int8-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint8-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-bfloat16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int8-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint8-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-bfloat16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int8-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint8-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float32-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float64-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-bfloat16-/] PASSED [ 10%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int8-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint8-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-bfloat16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int8-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint8-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-bfloat16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int8-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint8-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-bfloat16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int8-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint8-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint64-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float16-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float32-/] PASSED [ 11%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-bfloat16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int8-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint8-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-bfloat16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int8-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint8-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-bfloat16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int8-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint8-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float32-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float64-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-bfloat16-/] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int8-%] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int16-%] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int32-%] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-int64-%] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint8-%] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint16-%] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint32-%] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-uint64-%] PASSED [ 12%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-float64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int8-bfloat16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int8-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-int64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint8-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-uint64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-float64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int16-bfloat16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int8-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-int64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint8-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-uint64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-float64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int32-bfloat16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int8-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-int64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint8-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-uint64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-float64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[int64-bfloat16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int8-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int32-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-int64-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint8-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint16-%] PASSED [ 13%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-uint64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-float64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint8-bfloat16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int8-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-int64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint8-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-uint64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-float64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint16-bfloat16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int8-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-int64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint8-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-uint64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-float64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint32-bfloat16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int8-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-int64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint8-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-uint64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-float64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[uint64-bfloat16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int8-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int16-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int32-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-int64-%] PASSED [ 14%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint8-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-uint64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-float64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float16-bfloat16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int8-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-int64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint8-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-uint64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-float64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float32-bfloat16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int8-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-int64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint8-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-uint64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-float64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[float64-bfloat16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int8-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-int64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint8-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-uint64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float32-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-float64-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_bin_op[bfloat16-bfloat16-%] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int8-int8] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int8-int16] PASSED [ 15%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int8-int32] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int8-int64] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int16-int8] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int16-int16] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int16-int32] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int16-int64] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int32-int8] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int32-int16] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int32-int32] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int32-int64] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int64-int8] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int64-int16] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int64-int32] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[int64-int64] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint8-uint8] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint8-uint16] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint8-uint32] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint8-uint64] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint16-uint8] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint16-uint16] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint16-uint32] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint16-uint64] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint32-uint8] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint32-uint16] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint32-uint32] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint32-uint64] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint64-uint8] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint64-uint16] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint64-uint32] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_floordiv[uint64-uint64] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_unsigned_name_mangling PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-&0] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-&1] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-&1] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-&1] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-&1] PASSED [ 16%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-bfloat16-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-bfloat16-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-&0] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-&1] PASSED [ 17%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-bfloat16-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-bfloat16-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-&0] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-&1] PASSED [ 18%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-bfloat16-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-bfloat16-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-&0] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-&1] PASSED [ 19%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-bfloat16-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-bfloat16-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-&0] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-&1] PASSED [ 20%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-bfloat16-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-bfloat16-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-&0] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-&1] PASSED [ 21%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-bfloat16-&0] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-bfloat16-&1] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-&2] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-&3] PASSED [ 22%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-bfloat16-&1] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-bfloat16-&1] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-&2] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-&3] PASSED [ 23%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-bfloat16-&1] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-bfloat16-&1] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-&2] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-&3] PASSED [ 24%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-bfloat16-&1] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-bfloat16-&1] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-&2] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-&3] PASSED [ 25%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-bfloat16-&1] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-bfloat16-&1] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-&2] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-&3] PASSED [ 26%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-bfloat16-&1] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-&2] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-&3] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-bfloat16-&1] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int8-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int16-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int32-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int64-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint8-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint16-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint32-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint64-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float16-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float32-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float64-&0] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int8-&1] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int16-&1] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int32-&1] PASSED [ 27%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int64-&1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint8-&1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint16-&1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint32-&1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint64-&1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float16-&1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float32-&1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float64-&1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-bfloat16-&] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-bfloat16-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-|0] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-|1] PASSED [ 28%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-bfloat16-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-bfloat16-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-|0] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-|1] PASSED [ 29%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-bfloat16-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-bfloat16-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-|0] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-|1] PASSED [ 30%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-bfloat16-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-bfloat16-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-|0] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-|1] PASSED [ 31%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-bfloat16-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-bfloat16-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-|0] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-|1] PASSED [ 32%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-bfloat16-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-|1] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-bfloat16-|0] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-|2] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-|3] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-|3] PASSED [ 33%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-bfloat16-|1] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-bfloat16-|1] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-|2] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-|3] PASSED [ 34%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-bfloat16-|1] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-bfloat16-|1] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-|2] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-|3] PASSED [ 35%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-bfloat16-|1] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-bfloat16-|1] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-|2] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-|3] PASSED [ 36%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-bfloat16-|1] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-bfloat16-|1] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-|2] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-|3] PASSED [ 37%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-bfloat16-|1] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-bfloat16-|1] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-|2] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-|3] PASSED [ 38%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-|3] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-bfloat16-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int8-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int16-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int32-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int64-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint8-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint16-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint32-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint64-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float16-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float32-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float64-|0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int8-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int16-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int32-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int64-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint8-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint16-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint32-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint64-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float16-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float32-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float64-|1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-bfloat16-|] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-^0] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-^1] PASSED [ 39%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-bfloat16-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-bfloat16-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-^0] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-^1] PASSED [ 40%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-bfloat16-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-bfloat16-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-^0] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-^1] PASSED [ 41%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-bfloat16-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-bfloat16-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-^0] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-^1] PASSED [ 42%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-bfloat16-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-bfloat16-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-^0] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-^1] PASSED [ 43%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-bfloat16-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-bfloat16-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-^0] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-^1] PASSED [ 44%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-bfloat16-^0] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int8-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int16-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int32-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-int64-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint8-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint16-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint32-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-uint64-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float16-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float32-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-float64-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int8-bfloat16-^1] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-^2] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int8-^3] PASSED [ 45%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int16-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int32-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-int64-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint8-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint16-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint32-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-uint64-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float16-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float32-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-float64-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int16-bfloat16-^1] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int8-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int16-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int32-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-int64-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint8-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint16-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint32-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-uint64-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float16-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float32-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-float64-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int32-bfloat16-^1] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-^2] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int8-^3] PASSED [ 46%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int16-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int32-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-int64-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint8-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint16-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint32-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-uint64-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float16-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float32-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-float64-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[int64-bfloat16-^1] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int8-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int16-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int32-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-int64-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint8-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint16-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint32-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-uint64-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float16-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float32-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-float64-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint8-bfloat16-^1] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-^2] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int8-^3] PASSED [ 47%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int16-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int32-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-int64-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint8-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint16-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint32-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-uint64-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float16-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float32-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-float64-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint16-bfloat16-^1] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int8-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int16-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int32-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-int64-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint8-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint16-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint32-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-uint64-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float16-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float32-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-float64-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint32-bfloat16-^1] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-^2] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int8-^3] PASSED [ 48%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int16-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int32-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-int64-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint8-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint16-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint32-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-uint64-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float16-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float32-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-float64-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[uint64-bfloat16-^1] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int8-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int16-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int32-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-int64-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint8-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint16-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint32-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-uint64-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float16-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float32-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-float64-^3] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float16-bfloat16-^1] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-^2] PASSED [ 49%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int8-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int16-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int32-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-int64-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint8-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint16-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint32-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-uint64-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float16-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float32-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-float64-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float32-bfloat16-^1] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-^2] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int8-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int16-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int32-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-int64-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint8-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint16-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint32-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-uint64-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float16-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float32-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-float64-^3] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[float64-bfloat16-^1] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int8-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int16-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int32-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int64-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint8-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint16-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint32-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint64-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float16-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float32-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float64-^0] PASSED [ 50%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int8-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int16-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int32-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-int64-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint8-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint16-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint32-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-uint64-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float16-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float32-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-float64-^1] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_bitwise_op[bfloat16-bfloat16-^] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-int8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-int16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-int32-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-int64-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-uint8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-uint16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-uint32-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-uint64-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-int8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-int16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-int32-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-int64-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-uint8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-uint16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-uint32-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-uint64-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-int8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-int16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-int32-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-int64-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-uint8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-uint16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-uint32-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-uint64-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-int8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-int16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-int32-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-int64-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-uint8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-uint16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-uint32-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-uint64-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-int8-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-int16-<<] PASSED [ 51%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-int32-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-int64-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-uint8-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-uint16-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-uint32-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-uint64-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-int8-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-int16-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-int32-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-int64-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-uint8-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-uint16-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-uint32-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-uint64-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-int8-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-int16-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-int32-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-int64-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-uint8-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-uint16-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-uint32-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-uint64-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-int8-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-int16-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-int32-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-int64-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-uint8-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-uint16-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-uint32-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-uint64-<<] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-int8->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-int16->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-int32->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-int64->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-uint8->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-uint16->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-uint32->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int8-uint64->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-int8->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-int16->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-int32->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-int64->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-uint8->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-uint16->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-uint32->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int16-uint64->>] PASSED [ 52%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-int8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-int16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-int32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-int64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-uint8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-uint16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-uint32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int32-uint64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-int8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-int16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-int32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-int64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-uint8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-uint16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-uint32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[int64-uint64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-int8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-int16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-int32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-int64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-uint8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-uint16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-uint32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint8-uint64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-int8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-int16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-int32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-int64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-uint8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-uint16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-uint32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint16-uint64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-int8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-int16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-int32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-int64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-uint8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-uint16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-uint32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint32-uint64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-int8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-int16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-int32->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-int64->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-uint8->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-uint16->>] PASSED [ 53%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-uint32->>] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_shift_op[uint64-uint64->>] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int8-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint8-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int8-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint8-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int8-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint8-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int8-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint8-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float16-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float32-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float64-==-real-real] PASSED [ 54%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float32-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float64-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int8-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int16-==-real-real] PASSED [ 55%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint8-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint16-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float16-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int8-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int16-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint8-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint16-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float16-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int8-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int16-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint8-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint16-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float16-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float32-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float64-==-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int8-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int16-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int32-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int64-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint8-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint16-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint32-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint64-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float16-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float32-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float64-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int8-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int16-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int32-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int64-!=-real-real] PASSED [ 56%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int32-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int64-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint8-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint16-!=-real-real] PASSED [ 57%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int8-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint8-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int8-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint8-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int8-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint8-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int8-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint8-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint16-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint32-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint64-!=-real-real] PASSED [ 58%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float16-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float64-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int8-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int16-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int32-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int64-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint8-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint16-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint32-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint64-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float16-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float32-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float64-!=-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int8->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int32->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int64->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint8->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint32->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint64->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float32->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float64->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int8->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int32->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int64->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint8->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint32->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint64->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float32->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float64->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int8->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int32->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int64->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint8->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint32->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint64->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float16->-real-real] PASSED [ 59%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int8->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint8->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int8->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint8->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int8->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint8->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int8->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint8->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float16->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float32->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float64->-real-real] PASSED [ 60%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int8->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint8->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int8->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint8->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int8->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint8->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int8->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint8->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float16->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float32->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float64->-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int8-<-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int16-<-real-real] PASSED [ 61%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint8-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int8-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint8-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int8-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint8-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int8-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint8-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int8-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int16-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int32-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int64-<-real-real] PASSED [ 62%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int32-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int64-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint8-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint16-<-real-real] PASSED [ 63%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint32-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint64-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float16-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float32-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float64-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int8-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int16-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int32-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int64-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint8-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint16-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint32-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint64-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float16-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float64-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int8-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int16-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int32-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int64-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint8-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint16-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint32-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint64-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float16-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float32-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float64-<-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int8->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int16->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int32->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int64->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint8->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint16->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint32->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint64->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float16->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float32->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float64->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int8->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int16->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int32->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int64->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint8->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint16->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint32->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint64->=-real-real] PASSED [ 64%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int8->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint8->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int8->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint8->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int8->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint8->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int8->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint8->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint64->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float16->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float32->=-real-real] PASSED [ 65%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float16->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float64->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int8->=-real-real] PASSED [ 66%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int16->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int32->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int64->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint8->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint16->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint32->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint64->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float16->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float32->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float64->=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int8-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-int64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint8-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-uint64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int8-float64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int8-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-int64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint8-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-uint64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int16-float64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int8-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-int64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint8-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-uint64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int32-float64-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int8-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int16-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int32-<=-real-real] PASSED [ 67%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-int64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-uint64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[int64-float64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-int64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-uint64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint8-float64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-int64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-uint64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint16-float64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-int64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-uint64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint32-float64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int16-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int32-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-int64-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint8-<=-real-real] PASSED [ 68%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-uint64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[uint64-float64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int8-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-int64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint8-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-uint64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float16-float64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int8-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-int64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint8-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-uint64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int8-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-int64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint8-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-uint64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float16-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float32-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float64-float64-<=-real-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-==-nan-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-==-real-nan] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-==-nan-nan] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-!=-nan-real] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-!=-real-nan] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-!=-nan-nan] PASSED [ 69%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32->-nan-real] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32->-real-nan] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32->-nan-nan] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-<-nan-real] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-<-real-nan] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-<-nan-nan] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32->=-nan-real] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32->=-real-nan] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32->=-nan-nan] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-<=-nan-real] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-<=-real-nan] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_compare_op[float32-float32-<=-nan-nan] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[int8] PASSED      [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[int16] PASSED     [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[int32] PASSED     [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[int64] PASSED     [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[uint8] PASSED     [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[uint16] PASSED    [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[uint32] PASSED    [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[uint64] PASSED    [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[float16] PASSED   [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[float32] PASSED   [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[float64] PASSED   [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[bfloat16] PASSED  [ 70%]
 python/test/unit/language/test_core_amd.py::test_where[*int32] PASSED    [ 70%]
 python/test/unit/language/test_core_amd.py::test_where_broadcast PASSED  [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[int8- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[int16- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[int32- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[int64- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[uint8- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[uint16- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[uint32- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[uint64- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[float16- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[float32- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[float64- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[bfloat16- -x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[int8- ~x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[int16- ~x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[int32- ~x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_unary_op[int64- ~x] PASSED [ 70%]
 python/test/unit/language/test_core_amd.py::test_math_op[exp] PASSED     [ 70%]
 python/test/unit/language/test_core_amd.py::test_math_op[log] PASSED     [ 70%]
 python/test/unit/language/test_core_amd.py::test_math_op[cos] PASSED     [ 70%]
 python/test/unit/language/test_core_amd.py::test_math_op[sin] PASSED     [ 70%]
 python/test/unit/language/test_core_amd.py::test_abs[int8] PASSED        [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[int16] PASSED       [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[int32] PASSED       [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[int64] PASSED       [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[uint8] PASSED       [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[uint16] PASSED      [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[uint32] PASSED      [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[uint64] PASSED      [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[float16] PASSED     [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[float32] PASSED     [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[float64] PASSED     [ 71%]
 python/test/unit/language/test_core_amd.py::test_abs[bfloat16] PASSED    [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[None, :]-int32] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[None, :]-uint32] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[None, :]-uint16] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[:, None]-int32] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[:, None]-uint32] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[:, None]-uint16] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[None, :, :]-int32] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[None, :, :]-uint32] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[None, :, :]-uint16] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[:, :, None]-int32] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[:, :, None]-uint32] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_index1d[x[:, :, None]-uint16] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_tuples PASSED           [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-float16-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-uint32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-int32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-float32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-uint32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-int32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-float32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-uint32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-int32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-float32-all_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-float16-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-uint32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-int32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-float32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-uint32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-int32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-float32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-uint32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-int32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-float32-all_pos] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-float16-min_neg] PASSED [ 71%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-uint32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-int32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-float32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-uint32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-int32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-float32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-uint32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-int32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-float32-min_neg] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-float16-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-uint32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-int32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[add-float32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-uint32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-int32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[max-float32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-uint32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-int32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_rmw[min-float32-max_pos] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape0-0] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape1-1] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape2-0] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape3-1] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape4-0] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape5-1] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape6-0] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape7-1] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape8-0] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw[shape9-1] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_tensor_atomic_rmw_block PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_atomic_cas PASSED       [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-int8-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-int16-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-int32-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-int64-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-uint8-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-uint16-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-uint32-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-uint64-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-float16-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-float32-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-float64-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-int8-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-int16-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-int32-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-int64-False] PASSED [ 72%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-uint8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-uint16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-uint32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-uint64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-float16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-float32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-float64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-int8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-int16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-int32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-int64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-uint8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-uint16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-uint32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-uint64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-float16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-float32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-float64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-int8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-int16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-int32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-int64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-uint8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-uint16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-uint32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-uint64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-float16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-float32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-float64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-int8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-int16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-int32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-int64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-uint8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-uint16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-uint32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-uint64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-float16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-float32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-float64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-int8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-int16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-int32-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-int64-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-uint8-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-uint16-False] PASSED [ 73%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-uint32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-uint64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-float16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-float32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-float64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-int8-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-int16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-int32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-int64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-uint8-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-uint16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-uint32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-uint64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-float16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-float32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-float64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-int8-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-int16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-int32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-int64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-uint8-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-uint16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-uint32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-uint64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-float16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-float32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-float64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-int8-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-int16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-int32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-int64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-uint8-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-uint16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-uint32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-uint64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-float16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-float32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float16-float64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-int8-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-int16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-int32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-int64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-uint8-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-uint16-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-uint32-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-uint64-False] PASSED [ 74%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-float16-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-float32-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-float64-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-int8-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-int16-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-int32-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-int64-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-uint8-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-uint16-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-uint32-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-uint64-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-float16-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-float32-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float64-float64-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-bfloat16-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[bfloat16-float32-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-int32-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[float32-int1-False] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[uint8-int8-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[uint16-int16-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[uint32-int32-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[uint64-int64-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[int8-uint8-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[int16-uint16-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[int32-uint32-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_cast[int64-uint64-True] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[bool] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[int8] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[int16] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[int32] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[int64] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[uint8] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[float16] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[float32] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[float64] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_store_constant[bfloat16] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_load_store_same_ptr PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_f8_xf16_roundtrip[out_dtype0-in_dtype0] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_f8_xf16_roundtrip[out_dtype0-in_dtype1] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_f8_xf16_roundtrip[out_dtype1-in_dtype0] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_f8_xf16_roundtrip[out_dtype1-in_dtype1] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_f16_to_f8_rounding[in_dtype0] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int8-32] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int8-64] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int8-128] FAILED [ 75%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int8-512] PASSED [ 75%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int16-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int16-64] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int16-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int16-512] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int32-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int32-64] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int32-128] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int32-512] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int64-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int64-64] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int64-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-int64-512] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint8-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint8-64] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint8-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint8-512] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint16-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint16-64] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint16-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint16-512] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint32-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint32-64] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint32-128] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint32-512] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint64-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint64-64] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint64-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint64-512] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float16-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float16-64] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float16-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float16-512] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float32-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float32-64] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float32-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float32-512] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float64-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float64-64] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float64-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-float64-512] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-bfloat16-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-bfloat16-64] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-bfloat16-128] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[min-bfloat16-512] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int8-32] PASSED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int8-64] FAILED [ 76%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int8-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int8-512] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int16-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int16-64] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int16-128] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int16-512] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int32-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int32-64] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int32-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int32-512] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int64-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int64-64] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int64-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-int64-512] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint8-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint8-64] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint8-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint8-512] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint16-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint16-64] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint16-128] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint16-512] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint32-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint32-64] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint32-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint32-512] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint64-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint64-64] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint64-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint64-512] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float16-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float16-64] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float16-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float16-512] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float32-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float32-64] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float32-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float32-512] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float64-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float64-64] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float64-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-float64-512] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-bfloat16-32] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-bfloat16-64] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-bfloat16-128] PASSED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[max-bfloat16-512] FAILED [ 77%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int8-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int8-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int8-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int8-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int16-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int16-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int16-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int16-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int32-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int32-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int32-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int32-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int64-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int64-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int64-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int64-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint8-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint8-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint8-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint8-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint16-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint16-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint16-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint16-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint32-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint32-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint32-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint32-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint64-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint64-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint64-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint64-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float16-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float16-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float16-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float16-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float32-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float32-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float32-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float32-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float64-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float64-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float64-128] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float64-512] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-bfloat16-32] PASSED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-bfloat16-64] FAILED [ 78%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-bfloat16-128] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[sum-bfloat16-512] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int8-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int8-64] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int8-128] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int8-512] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int16-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int16-64] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int16-128] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int16-512] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int32-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int32-64] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int32-128] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int32-512] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int64-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int64-64] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int64-128] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int64-512] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint8-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint8-64] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint8-128] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint8-512] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint16-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint16-64] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint16-128] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint16-512] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint32-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint32-64] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint32-128] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint32-512] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint64-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint64-64] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint64-128] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint64-512] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float16-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float16-64] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float16-128] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float16-512] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float32-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float32-64] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float32-128] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float32-512] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float64-32] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float64-64] PASSED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float64-128] FAILED [ 79%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float64-512] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-bfloat16-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-bfloat16-64] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-bfloat16-128] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-bfloat16-512] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int8-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int8-64] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int8-128] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int8-512] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int16-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int16-64] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int16-128] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int16-512] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int32-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int32-64] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int32-128] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int32-512] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int64-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int64-64] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int64-128] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int64-512] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint8-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint8-64] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint8-128] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint8-512] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint16-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint16-64] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint16-128] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint16-512] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint32-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint32-64] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint32-128] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint32-512] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint64-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint64-64] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint64-128] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint64-512] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float16-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float16-64] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float16-128] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float16-512] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float32-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float32-64] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float32-128] FAILED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float32-512] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float64-32] PASSED [ 80%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float64-64] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float64-128] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float64-512] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-bfloat16-32] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-bfloat16-64] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-bfloat16-128] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-bfloat16-512] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-int8-shape0-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-int8-shape1-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-int8-shape2-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-int16-shape3-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-int16-shape4-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-int16-shape5-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-int32-shape6-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-int32-shape7-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-int32-shape8-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-int64-shape9-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-int64-shape10-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-int64-shape11-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-uint8-shape12-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-uint8-shape13-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-uint8-shape14-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-uint16-shape15-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-uint16-shape16-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-uint16-shape17-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-uint32-shape18-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-uint32-shape19-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-uint32-shape20-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-uint64-shape21-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-uint64-shape22-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-uint64-shape23-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float16-shape24-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float16-shape25-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float16-shape26-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape27-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape28-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape29-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float64-shape30-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float64-shape31-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float64-shape32-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-bfloat16-shape33-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-bfloat16-shape34-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-bfloat16-shape35-1] FAILED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape36-0] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape37-1] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape38-0] PASSED [ 81%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape39-1] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape40-0] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape41-1] FAILED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape42-0] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape43-1] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape44-0] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape45-1] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape46-0] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape47-1] FAILED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape48-0] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape49-1] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape50-0] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape51-1] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape52-0] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape53-1] FAILED [ 82%]
 python/test/unit/language/test_core_amd.py::test_permute[float16-shape0-perm0] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_permute[float16-shape1-perm1] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_permute[float32-shape2-perm2] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_permute[float32-shape3-perm3] SKIPPED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-none-True-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-none-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-none-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-trans-True-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-trans-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-trans-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-add-matrix-True-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-add-matrix-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-2-False-False-add-matrix-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-none-True-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-none-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-none-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-trans-True-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-trans-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-trans-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-add-matrix-True-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-add-matrix-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-2-False-False-add-matrix-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-True-none-False-int8] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-True-none-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-True-none-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-False-none-False-int8] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-False-none-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-False-none-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-True-none-False-int8] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-True-none-False-float16] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-True-none-False-float32] PASSED [ 82%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-False-none-False-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-False-none-False-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-False-none-False-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-True-none-True-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-True-none-True-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-True-none-True-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-False-none-True-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-False-none-True-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-True-False-none-True-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-True-none-True-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-True-none-True-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-True-none-True-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-False-none-True-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-False-none-True-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-32-2-False-False-none-True-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-True-none-False-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-True-none-False-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-True-none-False-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-False-none-False-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-False-none-False-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-False-none-False-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-True-none-False-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-True-none-False-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-True-none-False-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-False-none-False-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-False-none-False-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-False-none-False-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-True-none-True-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-True-none-True-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-True-none-True-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-False-none-True-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-False-none-True-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-True-False-none-True-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-True-none-True-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-True-none-True-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-True-none-True-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-False-none-True-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-False-none-True-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-128-64-2-False-False-none-True-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-True-none-False-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-True-none-False-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-True-none-False-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-False-none-False-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-False-none-False-float16] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-False-none-False-float32] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-True-none-False-int8] PASSED [ 83%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-True-none-False-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-True-none-False-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-False-none-False-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-False-none-False-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-False-none-False-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-True-none-True-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-True-none-True-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-True-none-True-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-False-none-True-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-False-none-True-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-True-False-none-True-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-True-none-True-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-True-none-True-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-True-none-True-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-False-none-True-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-False-none-True-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-32-2-False-False-none-True-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-True-none-False-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-True-none-False-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-True-none-False-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-False-none-False-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-False-none-False-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-False-none-False-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-True-none-False-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-True-none-False-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-True-none-False-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-False-none-False-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-False-none-False-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-False-none-False-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-True-none-True-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-True-none-True-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-True-none-True-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-False-none-True-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-False-none-True-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-True-False-none-True-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-True-none-True-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-True-none-True-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-True-none-True-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-False-none-True-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-False-none-True-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[128-32-64-2-False-False-none-True-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-True-none-False-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-True-none-False-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-True-none-False-float32] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-False-none-False-int8] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-False-none-False-float16] PASSED [ 84%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-False-none-False-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-True-none-False-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-True-none-False-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-True-none-False-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-False-none-False-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-False-none-False-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-False-none-False-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-True-none-True-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-True-none-True-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-True-none-True-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-False-none-True-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-False-none-True-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-True-False-none-True-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-True-none-True-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-True-none-True-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-True-none-True-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-False-none-True-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-False-none-True-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-32-4-False-False-none-True-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-True-none-False-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-True-none-False-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-True-none-False-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-False-none-False-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-False-none-False-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-False-none-False-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-True-none-False-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-True-none-False-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-True-none-False-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-False-none-False-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-False-none-False-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-False-none-False-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-True-none-True-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-True-none-True-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-True-none-True-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-False-none-True-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-False-none-True-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-True-False-none-True-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-True-none-True-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-True-none-True-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-True-none-True-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-False-none-True-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-False-none-True-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-4-False-False-none-True-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-True-none-False-int8] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-True-none-False-float16] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-True-none-False-float32] PASSED [ 85%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-False-none-False-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-False-none-False-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-False-none-False-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-True-none-False-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-True-none-False-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-True-none-False-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-False-none-False-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-False-none-False-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-False-none-False-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-True-none-True-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-True-none-True-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-True-none-True-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-False-none-True-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-False-none-True-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-True-False-none-True-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-True-none-True-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-True-none-True-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-True-none-True-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-False-none-True-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-False-none-True-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-128-4-False-False-none-True-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-True-none-False-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-True-none-False-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-True-none-False-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-False-none-False-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-False-none-False-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-False-none-False-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-True-none-False-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-True-none-False-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-True-none-False-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-False-none-False-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-False-none-False-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-False-none-False-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-True-none-True-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-True-none-True-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-True-none-True-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-False-none-True-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-False-none-True-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-True-False-none-True-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-True-none-True-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-True-none-True-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-True-none-True-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-False-none-True-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-False-none-True-float16] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-256-4-False-False-none-True-float32] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-True-none-False-int8] PASSED [ 86%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-True-none-False-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-True-none-False-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-False-none-False-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-False-none-False-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-False-none-False-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-True-none-False-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-True-none-False-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-True-none-False-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-False-none-False-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-False-none-False-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-False-none-False-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-True-none-True-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-True-none-True-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-True-none-True-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-False-none-True-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-False-none-True-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-True-False-none-True-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-True-none-True-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-True-none-True-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-True-none-True-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-False-none-True-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-False-none-True-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-32-4-False-False-none-True-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-True-none-False-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-True-none-False-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-True-none-False-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-False-none-False-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-False-none-False-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-False-none-False-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-True-none-False-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-True-none-False-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-True-none-False-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-False-none-False-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-False-none-False-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-False-none-False-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-True-none-True-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-True-none-True-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-True-none-True-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-False-none-True-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-False-none-True-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-True-False-none-True-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-True-none-True-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-True-none-True-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-True-none-True-float32] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-False-none-True-int8] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-False-none-True-float16] PASSED [ 87%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-64-4-False-False-none-True-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-True-none-False-int8] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-True-none-False-float16] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-True-none-False-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-False-none-False-int8] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-False-none-False-float16] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-False-none-False-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-True-none-False-int8] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-True-none-False-float16] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-True-none-False-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-False-none-False-int8] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-False-none-False-float16] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-False-none-False-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-True-none-True-int8] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-True-none-True-float16] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-True-none-True-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-False-none-True-int8] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-False-none-True-float16] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-True-False-none-True-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-True-none-True-int8] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-True-none-True-float16] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-True-none-True-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-False-none-True-int8] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-False-none-True-float16] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[64-64-128-4-False-False-none-True-float32] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-False-int80] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-False-float160] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-False-float320] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-False-int80] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-False-float160] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-False-float320] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-False-int80] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-False-float160] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-False-float320] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-False-int80] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-False-float160] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-False-float320] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-True-int80] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-True-float160] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-True-float320] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-True-int80] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-True-float160] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-True-float320] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-True-int80] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-True-float160] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-True-float320] PASSED [ 88%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-True-int80] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-True-float160] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-True-float320] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-False-int80] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-False-float160] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-False-float320] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-False-int80] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-False-float160] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-False-float320] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-False-int80] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-False-float160] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-False-float320] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-False-int80] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-False-float160] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-False-float320] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-True-int80] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-True-float160] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-True-float320] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-True-int80] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-True-float160] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-True-float320] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-True-int80] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-True-float160] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-True-float320] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-True-int80] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-True-float160] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-True-float320] SKIPPED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-True-none-False-int8] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-True-none-False-float16] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-True-none-False-float32] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-False-none-False-int8] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-False-none-False-float16] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-False-none-False-float32] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-True-none-False-int8] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-True-none-False-float16] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-True-none-False-float32] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-False-none-False-int8] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-False-none-False-float16] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-False-none-False-float32] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-True-none-True-int8] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-True-none-True-float16] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-True-none-True-float32] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-False-none-True-int8] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-False-none-True-float16] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-True-False-none-True-float32] PASSED [ 89%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-True-none-True-int8] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-True-none-True-float16] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-True-none-True-float32] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-False-none-True-int8] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-False-none-True-float16] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-32-64-1-False-False-none-True-float32] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-False-int81] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-False-float161] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-False-float321] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-False-int81] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-False-float161] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-False-float321] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-False-int81] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-False-float161] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-False-float321] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-False-int81] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-False-float161] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-False-float321] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-True-int81] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-True-float161] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-True-none-True-float321] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-True-int81] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-True-float161] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-True-False-none-True-float321] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-True-int81] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-True-float161] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-True-none-True-float321] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-True-int81] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-True-float161] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[32-128-64-2-False-False-none-True-float321] PASSED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-False-int81] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-False-float161] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-False-float321] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-False-int81] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-False-float161] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-False-float321] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-False-int81] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-False-float161] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-False-float321] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-False-int81] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-False-float161] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-False-float321] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-True-int81] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-True-float161] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-True-none-True-float321] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-True-int81] SKIPPED [ 90%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-True-float161] SKIPPED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-True-False-none-True-float321] SKIPPED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-True-int81] SKIPPED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-True-float161] SKIPPED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-True-none-True-float321] SKIPPED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-True-int81] SKIPPED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-True-float161] SKIPPED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[64-128-128-2-False-False-none-True-float321] SKIPPED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-True-none-False-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-True-none-False-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-True-none-False-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-False-none-False-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-False-none-False-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-False-none-False-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-True-none-False-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-True-none-False-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-True-none-False-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-False-none-False-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-False-none-False-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-False-none-False-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-True-none-True-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-True-none-True-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-True-none-True-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-False-none-True-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-False-none-True-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-True-False-none-True-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-True-none-True-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-True-none-True-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-True-none-True-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-False-none-True-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-False-none-True-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-8-False-False-none-True-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-True-none-False-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-True-none-False-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-True-none-False-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-False-none-False-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-False-none-False-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-False-none-False-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-True-none-False-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-True-none-False-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-True-none-False-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-False-none-False-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-False-none-False-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-False-none-False-float32] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-True-none-True-int8] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-True-none-True-float16] PASSED [ 91%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-True-none-True-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-False-none-True-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-False-none-True-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-True-False-none-True-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-True-none-True-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-True-none-True-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-True-none-True-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-False-none-True-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-False-none-True-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[32-256-32-2-False-False-none-True-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-True-none-False-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-True-none-False-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-True-none-False-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-False-none-False-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-False-none-False-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-False-none-False-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-True-none-False-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-True-none-False-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-True-none-False-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-False-none-False-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-False-none-False-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-False-none-False-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-True-none-True-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-True-none-True-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-True-none-True-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-False-none-True-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-False-none-True-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-True-False-none-True-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-True-none-True-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-True-none-True-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-True-none-True-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-False-none-True-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-False-none-True-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[64-32-32-2-False-False-none-True-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-True-none-False-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-True-none-False-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-True-none-False-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-False-none-False-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-False-none-False-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-False-none-False-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-True-none-False-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-True-none-False-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-True-none-False-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-False-none-False-int8] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-False-none-False-float16] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-False-none-False-float32] PASSED [ 92%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-True-none-True-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-True-none-True-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-True-none-True-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-False-none-True-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-False-none-True-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-True-False-none-True-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-True-none-True-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-True-none-True-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-True-none-True-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-False-none-True-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-False-none-True-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-2-False-False-none-True-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-True-none-False-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-True-none-False-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-True-none-False-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-False-none-False-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-False-none-False-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-False-none-False-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-True-none-False-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-True-none-False-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-True-none-False-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-False-none-False-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-False-none-False-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-False-none-False-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-True-none-True-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-True-none-True-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-True-none-True-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-False-none-True-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-False-none-True-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-True-False-none-True-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-True-none-True-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-True-none-True-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-True-none-True-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-False-none-True-int8] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-False-none-True-float16] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_dot[256-32-32-4-False-False-none-True-float32] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_full[int8] PASSED       [ 93%]
 python/test/unit/language/test_core_amd.py::test_full[int16] PASSED      [ 93%]
 python/test/unit/language/test_core_amd.py::test_full[int32] PASSED      [ 93%]
 python/test/unit/language/test_core_amd.py::test_full[int64] PASSED      [ 93%]
 python/test/unit/language/test_core_amd.py::test_full[float16] PASSED    [ 93%]
 python/test/unit/language/test_core_amd.py::test_full[float32] PASSED    [ 93%]
 python/test/unit/language/test_core_amd.py::test_full[float64] PASSED    [ 93%]
 python/test/unit/language/test_core_amd.py::test_full[bfloat16] PASSED   [ 93%]
 python/test/unit/language/test_core_amd.py::test_gemm[64-32-128-4-64-32-64] PASSED [ 93%]
 python/test/unit/language/test_core_amd.py::test_arange[0] PASSED        [ 93%]
 python/test/unit/language/test_core_amd.py::test_arange[1] PASSED        [ 94%]
 python/test/unit/language/test_core_amd.py::test_arange[7] PASSED        [ 94%]
 python/test/unit/language/test_core_amd.py::test_arange[16] PASSED       [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-128-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-128-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-128-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-128-3] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-128-4] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-512-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-512-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-512-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-512-3] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bool-512-4] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-128-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-128-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-128-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-128-3] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-128-4] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-512-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-512-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-512-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-512-3] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int8-512-4] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-128-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-128-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-128-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-128-3] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-128-4] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-512-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-512-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-512-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-512-3] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int16-512-4] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-128-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-128-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-128-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-128-3] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-128-4] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-512-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-512-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-512-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-512-3] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int32-512-4] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-128-0] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-128-1] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-128-2] PASSED [ 94%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-128-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-128-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-512-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-512-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-512-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-512-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[int64-512-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-128-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-128-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-128-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-128-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-128-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-512-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-512-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-512-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-512-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[uint8-512-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-128-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-128-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-128-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-128-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-128-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-512-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-512-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-512-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-512-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float16-512-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-128-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-128-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-128-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-128-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-128-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-512-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-512-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-512-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-512-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float32-512-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-128-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-128-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-128-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-128-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-128-4] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-512-0] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-512-1] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-512-2] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-512-3] PASSED [ 95%]
 python/test/unit/language/test_core_amd.py::test_masked_load[float64-512-4] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-128-0] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-128-1] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-128-2] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-128-3] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-128-4] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-512-0] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-512-1] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-512-2] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-512-3] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load[bfloat16-512-4] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load_shared_memory[dtype0] SKIPPED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load_shared_memory[dtype1] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_masked_load_shared_memory[dtype2] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_default PASSED          [ 96%]
 python/test/unit/language/test_core_amd.py::test_noop PASSED             [ 96%]
 python/test/unit/language/test_core_amd.py::test_pointer_arguments[cuda] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_pointer_arguments[cpu] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_pointer_arguments[cpu_pinned] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[-1-i32] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[0-i32] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[-2147483648-i32] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[2147483647-i32] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[2147483648-i64] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[4294967295-i64] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[4294967296-i64] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[9223372036854775807-i64] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[-9223372036854775808-i64] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[9223372036854775808-u64] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization[18446744073709551615-u64] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization_overflow[18446744073709551615-False] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization_overflow[18446744073709551616-True] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization_overflow[-9223372036854775808-False] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_value_specialization_overflow[-9223372036854775809-True] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-+] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False--] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-*] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-/] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-%] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-<] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False->] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-<<] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False->>] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-&] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-^] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-False-|] PASSED [ 96%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-+] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True--] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-*] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-/] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-%] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-<] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True->] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-<<] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True->>] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-&] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-^] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[True-True-|] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-+] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False--] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-*] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-/] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-%] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-<] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False->] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-<<] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False->>] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-&] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-^] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-False-|] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-+] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True--] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-*] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-/] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-%] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-<] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True->] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-<<] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True->>] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-&] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-^] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_bin_op_constexpr[False-True-|] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_constexpr_shape PASSED  [ 97%]
 python/test/unit/language/test_core_amd.py::test_constexpr_scalar_shape PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_call PASSED             [ 97%]
 python/test/unit/language/test_core_amd.py::test_if[if] PASSED           [ 97%]
 python/test/unit/language/test_core_amd.py::test_if[if_exp] PASSED       [ 97%]
 python/test/unit/language/test_core_amd.py::test_num_warps_pow2 PASSED   [ 97%]
 python/test/unit/language/test_core_amd.py::test_math_tensor[int32-math.ffs-] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_math_tensor[float32-math.log2-] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_math_tensor[float32-math.scalbn-] SKIPPED [ 97%]
 python/test/unit/language/test_core_amd.py::test_math_tensor[float32-math.pow-/root/triton/python/triton/language/../third_party/rocm/lib/bitcode/cuda2gcn.bc] PASSED [ 97%]
 python/test/unit/language/test_core_amd.py::test_math_tensor[float64-math.pow_dtype-/root/triton/python/triton/language/../third_party/rocm/lib/bitcode/cuda2gcn.bc] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_math_tensor[float64-math.norm4d-] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_math_scalar[float32-math.pow-] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_math_scalar[float64-math.pow_dtype-] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_math_scalar[float64-math.pow-/root/triton/python/triton/language/../third_party/rocm/lib/bitcode/cuda2gcn.bc] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_for_iv[34359738368-34359738388-1] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_for_iv[34359738368-34359738388-2] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_for_iv[34359738368-34359738388-3] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_for_iv[15--16--1] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_for_iv[15--16--2] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_for_iv[15--16--3] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_for_iv[-18--22--1] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_for_iv[22-18--1] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_if_else PASSED          [ 98%]
 python/test/unit/language/test_core_amd.py::test_if_return PASSED        [ 98%]
 python/test/unit/language/test_core_amd.py::test_nested_if_else_return[True-True-True] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_nested_if_else_return[True-True-False] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_nested_if_else_return[True-False-True] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_nested_if_else_return[True-False-False] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_nested_if_else_return[False-True-True] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_nested_if_else_return[False-True-False] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_nested_if_else_return[False-False-True] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_nested_if_else_return[False-False-False] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_while PASSED            [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout0-src_layout0-float16-shape0] SKIPPED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout0-src_layout1-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout0-src_layout2-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout0-src_layout3-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout0-src_layout4-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout0-src_layout5-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout0-src_layout6-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout1-src_layout0-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout1-src_layout1-float16-shape0] SKIPPED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout1-src_layout2-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout1-src_layout3-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout1-src_layout4-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout1-src_layout5-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout1-src_layout6-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout2-src_layout0-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout2-src_layout1-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout2-src_layout2-float16-shape0] SKIPPED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout2-src_layout3-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout2-src_layout4-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout2-src_layout5-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout2-src_layout6-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout3-src_layout0-float16-shape0] PASSED [ 98%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout3-src_layout1-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout3-src_layout2-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout3-src_layout3-float16-shape0] SKIPPED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout3-src_layout4-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout3-src_layout5-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout3-src_layout6-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout4-src_layout0-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout4-src_layout1-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout4-src_layout2-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout4-src_layout3-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout4-src_layout4-float16-shape0] SKIPPED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout4-src_layout5-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout4-src_layout6-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout5-src_layout0-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout5-src_layout1-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout5-src_layout2-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout5-src_layout3-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout5-src_layout4-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout5-src_layout5-float16-shape0] SKIPPED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout5-src_layout6-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout6-src_layout0-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout6-src_layout1-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout6-src_layout2-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout6-src_layout3-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout6-src_layout4-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout6-src_layout5-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_convert2d[dst_layout6-src_layout6-float16-shape0] SKIPPED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[0-src_layout0-128-32] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[0-src_layout0-128-128] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[0-src_layout0-32-128] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[0-src_layout0-64-64] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[0-src_layout1-128-32] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[0-src_layout1-128-128] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[0-src_layout1-32-128] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[0-src_layout1-64-64] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[1-src_layout0-128-32] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[1-src_layout0-128-128] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[1-src_layout0-32-128] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[1-src_layout0-64-64] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[1-src_layout1-128-32] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[1-src_layout1-128-128] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[1-src_layout1-32-128] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_reduce_layouts[1-src_layout1-64-64] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_make_range[dst_layout0-src_layout0-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_make_range[dst_layout0-src_layout1-float16-shape0] PASSED [ 99%]
 python/test/unit/language/test_core_amd.py::test_load_scalar_with_mask PASSED [100%]

 =================================== FAILURES ===================================
 _________________________ test_reduce1d[min-int8-128] __________________________

 op = 'min', dtype_str = 'int8', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -128, array([-124], dtype=int8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 4
 E           Max relative difference: 0.03225806
 E            x: array(-128, dtype=int8)
 E            y: array([-124], dtype=int8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[min-int16-128] _________________________

 op = 'min', dtype_str = 'int16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -32695, array([-30994], dtype=int16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1701
 E           Max relative difference: 0.05488159
 E            x: array(-32695, dtype=int16)
 E            y: array([-30994], dtype=int16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[min-int32-64] __________________________

 op = 'min', dtype_str = 'int32', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -2142656538, array([-2000109605], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 142546933
 E           Max relative difference: 0.07126956
 E            x: array(-2142656538, dtype=int32)
 E            y: array([-2000109605], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[min-int32-512] _________________________

 op = 'min', dtype_str = 'int32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -2142656538, array([-2139159289], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 3497249
 E           Max relative difference: 0.00163487
 E            x: array(-2142656538, dtype=int32)
 E            y: array([-2139159289], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[min-int64-128] _________________________

 op = 'min', dtype_str = 'int64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -8933828387023419676, array([-8685520378400575711]))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 248308008622843965
 E           Max relative difference: 0.02858873
 E            x: array(-8933828387023419676)
 E            y: array([-8685520378400575711])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[min-int64-512] _________________________

 op = 'min', dtype_str = 'int64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -9182695313683611414, array([-9181561913479119462]))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1133400204491952
 E           Max relative difference: 0.00012344
 E            x: array(-9182695313683611414)
 E            y: array([-9181561913479119462])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[min-uint8-64] __________________________

 op = 'min', dtype_str = 'uint8', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 4, array([7], dtype=uint8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 3
 E           Max relative difference: 0.42857143
 E            x: array(4, dtype=uint8)
 E            y: array([7], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[min-uint8-128] _________________________

 op = 'min', dtype_str = 'uint8', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 1, array([4], dtype=uint8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 3
 E           Max relative difference: 0.75
 E            x: array(1, dtype=uint8)
 E            y: array([4], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-uint16-128] _________________________

 op = 'min', dtype_str = 'uint16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 73, array([1774], dtype=uint16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1701
 E           Max relative difference: 0.95885006
 E            x: array(73, dtype=uint16)
 E            y: array([1774], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-uint32-512] _________________________

 op = 'min', dtype_str = 'uint32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 4827110, array([8324359], dtype=uint32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 3497249
 E           Max relative difference: 0.42012232
 E            x: array(4827110, dtype=uint32)
 E            y: array([8324359], dtype=uint32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-uint64-128] _________________________

 op = 'min', dtype_str = 'uint64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 289543649831356132, array([537851658454200097], dtype=uint64))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 248308008622843965
 E           Max relative difference: 0.46166634
 E            x: array(289543649831356132, dtype=uint64)
 E            y: array([537851658454200097], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-uint64-512] _________________________

 op = 'min', dtype_str = 'uint64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 40676723171164394, array([41810123375656346], dtype=uint64))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1133400204491952
 E           Max relative difference: 0.02710827
 E            x: array(40676723171164394, dtype=uint64)
 E            y: array([41810123375656346], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-float16-64] _________________________

 op = 'min', dtype_str = 'float16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -2.96, array([-1.8545], dtype=float16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.106
 E           Max relative difference: 0.5967
 E            x: array(-2.96, dtype=float16)
 E            y: array([-1.8545], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-float16-128] ________________________

 op = 'min', dtype_str = 'float16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -3.4, array([-2.96], dtype=float16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.4395
 E           Max relative difference: 0.1484
 E            x: array(-3.4, dtype=float16)
 E            y: array([-2.96], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-float32-128] ________________________

 op = 'min', dtype_str = 'float32', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -3.3999307, array([-2.9601226], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.43980813
 E           Max relative difference: 0.14857768
 E            x: array(-3.399931, dtype=float32)
 E            y: array([-2.960123], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-float64-128] ________________________

 op = 'min', dtype_str = 'float64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -3.3999307792224305, array([-2.96012262]))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.43980816
 E           Max relative difference: 0.14857768
 E            x: array(-3.399931)
 E            y: array([-2.960123])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[min-bfloat16-64] ________________________

 op = 'min', dtype_str = 'bfloat16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -2.953125, array([-1.8515625], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.1015625
 E           Max relative difference: 0.5949367
 E            x: array(-2.953125, dtype=float32)
 E            y: array([-1.851562], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[min-bfloat16-128] ________________________

 op = 'min', dtype_str = 'bfloat16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -3.390625, array([-2.953125], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.4375
 E           Max relative difference: 0.14814815
 E            x: array(-3.390625, dtype=float32)
 E            y: array([-2.953125], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 __________________________ test_reduce1d[max-int8-64] __________________________

 op = 'max', dtype_str = 'int8', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 124, array([97], dtype=int8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 27
 E           Max relative difference: 0.27835052
 E            x: array(124, dtype=int8)
 E            y: array([97], dtype=int8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[max-int16-128] _________________________

 op = 'max', dtype_str = 'int16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 31985, array([31761], dtype=int16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 224
 E           Max relative difference: 0.00705267
 E            x: array(31985, dtype=int16)
 E            y: array([31761], dtype=int16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[max-int32-512] _________________________

 op = 'max', dtype_str = 'int32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 2146553285, array([2145576731], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 976554
 E           Max relative difference: 0.00045515
 E            x: array(2146553285, dtype=int32)
 E            y: array([2145576731], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[max-int64-512] _________________________

 op = 'max', dtype_str = 'int64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 9219376159251228126, array([9002134033872679366]))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 217242125378548760
 E           Max relative difference: 0.02413229
 E            x: array(9219376159251228126)
 E            y: array([9002134033872679366])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[max-uint8-64] __________________________

 op = 'max', dtype_str = 'uint8', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 252, array([225], dtype=uint8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 27
 E           Max relative difference: 0.12
 E            x: array(252, dtype=uint8)
 E            y: array([225], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[max-uint16-64] _________________________

 op = 'max', dtype_str = 'uint16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 64753, array([64529], dtype=uint16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 224
 E           Max relative difference: 0.00347131
 E            x: array(64753, dtype=uint16)
 E            y: array([64529], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[max-uint16-128] _________________________

 op = 'max', dtype_str = 'uint16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 64753, array([64529], dtype=uint16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 224
 E           Max relative difference: 0.00347131
 E            x: array(64753, dtype=uint16)
 E            y: array([64529], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[max-uint64-512] _________________________

 op = 'max', dtype_str = 'uint64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 18442748196106003934, array([18423896903575150537], dtype=uint64))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 18851292530853397
 E           Max relative difference: 0.0010232
 E            x: array(18442748196106003934, dtype=uint64)
 E            y: array([18423896903575150537], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[max-float16-512] ________________________

 op = 'max', dtype_str = 'float16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.346, array([3.215], dtype=float16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.1309
 E           Max relative difference: 0.0407
 E            x: array(3.346, dtype=float16)
 E            y: array([3.215], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[max-float32-512] ________________________

 op = 'max', dtype_str = 'float32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.3451574, array([2.871127], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.4740305
 E           Max relative difference: 0.16510259
 E            x: array(3.345157, dtype=float32)
 E            y: array([2.871127], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[max-float64-512] ________________________

 op = 'max', dtype_str = 'float64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.3451573949029427, array([3.21471706]))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.13044034
 E           Max relative difference: 0.04057599
 E            x: array(3.345157)
 E            y: array([3.214717])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[max-bfloat16-64] ________________________

 op = 'max', dtype_str = 'bfloat16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.34375, array([2.15625], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.1875
 E           Max relative difference: 0.5507246
 E            x: array(3.34375, dtype=float32)
 E            y: array([2.15625], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[max-bfloat16-512] ________________________

 op = 'max', dtype_str = 'bfloat16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                np.testing.assert_equal(x[z_ref], x[z_tri])
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1039: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.34375, array([3.203125], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.140625
 E           Max relative difference: 0.04390244
 E            x: array(3.34375, dtype=float32)
 E            y: array([3.203125], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 __________________________ test_reduce1d[sum-int8-64] __________________________

 op = 'sum', dtype_str = 'int8', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8aab90>, array(-78, dtype=int8), array([-66], dtype=int8))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 12
 E           Max relative difference: 0.18181818
 E            x: array(-78, dtype=int8)
 E            y: array([-66], dtype=int8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int8-128] __________________________

 op = 'sum', dtype_str = 'int8', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8a9480>, array(-107, dtype=int8), array([-26], dtype=int8))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 81
 E           Max relative difference: 3.11538462
 E            x: array(-107, dtype=int8)
 E            y: array([-26], dtype=int8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int8-512] __________________________

 op = 'sum', dtype_str = 'int8', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0bdf30>, array(-22, dtype=int8), array([36], dtype=int8))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 58
 E           Max relative difference: 1.61111111
 E            x: array(-22, dtype=int8)
 E            y: array([36], dtype=int8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int16-64] __________________________

 op = 'sum', dtype_str = 'int16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8aa290>, array(-29407, dtype=int16), array([9912], dtype=int16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 26217
 E           Max relative difference: 2.64497579
 E            x: array(-29407, dtype=int16)
 E            y: array([9912], dtype=int16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int16-128] _________________________

 op = 'sum', dtype_str = 'int16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8a96c0>, array(-17919, dtype=int16), array([-13800], dtype=int16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 4119
 E           Max relative difference: 0.29847826
 E            x: array(-17919, dtype=int16)
 E            y: array([-13800], dtype=int16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int16-512] _________________________

 op = 'sum', dtype_str = 'int16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8ab520>, array(-6584, dtype=int16), array([-20552], dtype=int16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 13968
 E           Max relative difference: 0.67964188
 E            x: array(-6584, dtype=int16)
 E            y: array([-20552], dtype=int16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int32-64] __________________________

 op = 'sum', dtype_str = 'int32', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8a9480>, array(-160447564, dtype=int32), array([-883732716], dtype=int32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 723285152
 E           Max relative difference: 0.81844334
 E            x: array(-160447564, dtype=int32)
 E            y: array([-883732716], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int32-128] _________________________

 op = 'sum', dtype_str = 'int32', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0bdf30>, array(1229457160, dtype=int32), array([1908174762], dtype=int32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 678717602
 E           Max relative difference: 0.35568943
 E            x: array(1229457160, dtype=int32)
 E            y: array([1908174762], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int32-512] _________________________

 op = 'sum', dtype_str = 'int32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0bec20>, array(1785885296, dtype=int32), array([-2124286430], dtype=int32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 384795570
 E           Max relative difference: 0.18114109
 E            x: array(1785885296, dtype=int32)
 E            y: array([-2124286430], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int64-64] __________________________

 op = 'sum', dtype_str = 'int64', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8ab5b0>, array(-5543674200944337546), array([-5258876914028064716]))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 284797286916272830
 E           Max relative difference: 0.05415553
 E            x: array(-5543674200944337546)
 E            y: array([-5258876914028064716])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int64-128] _________________________

 op = 'sum', dtype_str = 'int64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0bca60>, array(6579261182149427719), array([-7410345778827245330]))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 4457137112732878567
 E           Max relative difference: 0.60147492
 E            x: array(6579261182149427719)
 E            y: array([-7410345778827245330])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-int64-512] _________________________

 op = 'sum', dtype_str = 'int64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0bdb40>, array(256239199074544210), array([7337259309851043010]))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 7081020110776498800
 E           Max relative difference: 0.96507699
 E            x: array(256239199074544210)
 E            y: array([7337259309851043010])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-uint8-64] __________________________

 op = 'sum', dtype_str = 'uint8', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0be8c0>, array(178, dtype=uint8), array([166], dtype=uint8))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 12
 E           Max relative difference: 0.07228916
 E            x: array(178, dtype=uint8)
 E            y: array([166], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-uint8-128] _________________________

 op = 'sum', dtype_str = 'uint8', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0bfc70>, array(150, dtype=uint8), array([232], dtype=uint8))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 82
 E           Max relative difference: 0.35344828
 E            x: array(150, dtype=uint8)
 E            y: array([232], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-uint8-512] _________________________

 op = 'sum', dtype_str = 'uint8', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8a9b40>, array(236, dtype=uint8), array([74], dtype=uint8))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 94
 E           Max relative difference: 1.27027027
 E            x: array(236, dtype=uint8)
 E            y: array([74], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-uint16-64] _________________________

 op = 'sum', dtype_str = 'uint16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0be680>, array(36129, dtype=uint16), array([62346], dtype=uint16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 26217
 E           Max relative difference: 0.42050813
 E            x: array(36129, dtype=uint16)
 E            y: array([62346], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-uint16-128] _________________________

 op = 'sum', dtype_str = 'uint16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e7c70>, array(47617, dtype=uint16), array([56600], dtype=uint16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 8983
 E           Max relative difference: 0.15871025
 E            x: array(47617, dtype=uint16)
 E            y: array([56600], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-uint16-512] _________________________

 op = 'sum', dtype_str = 'uint16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0bedd0>, array(58952, dtype=uint16), array([2634], dtype=uint16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 9218
 E           Max relative difference: 3.49962035
 E            x: array(58952, dtype=uint16)
 E            y: array([2634], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-uint32-64] _________________________

 op = 'sum', dtype_str = 'uint32', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e5240>, array(4134519732, dtype=uint32), array([3411234580], dtype=uint32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 723285152
 E           Max relative difference: 0.21203032
 E            x: array(4134519732, dtype=uint32)
 E            y: array([3411234580], dtype=uint32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-uint32-128] _________________________

 op = 'sum', dtype_str = 'uint32', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c2a0bf010>, array(1229457160, dtype=uint32), array([1908174762], dtype=uint32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 678717602
 E           Max relative difference: 0.35568943
 E            x: array(1229457160, dtype=uint32)
 E            y: array([1908174762], dtype=uint32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-uint32-512] _________________________

 op = 'sum', dtype_str = 'uint32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e4d30>, array(1785885296, dtype=uint32), array([2170680866], dtype=uint32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 384795570
 E           Max relative difference: 0.17726953
 E            x: array(1785885296, dtype=uint32)
 E            y: array([2170680866], dtype=uint32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _________________________ test_reduce1d[sum-uint64-64] _________________________

 op = 'sum', dtype_str = 'uint64', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e4b80>, array(12903069872765214070, dtype=uint64), array([13187867159681486900], dtype=uint64))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 284797286916272830
 E           Max relative difference: 0.0215954
 E            x: array(12903069872765214070, dtype=uint64)
 E            y: array([13187867159681486900], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-uint64-128] _________________________

 op = 'sum', dtype_str = 'uint64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e5b40>, array(6579261182149427719, dtype=uint64), array([11036398294882306286], dtype=uint64))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 4457137112732878567
 E           Max relative difference: 0.40385794
 E            x: array(6579261182149427719, dtype=uint64)
 E            y: array([11036398294882306286], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-uint64-512] _________________________

 op = 'sum', dtype_str = 'uint64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e56c0>, array(256239199074544210, dtype=uint64), array([7337259309851043010], dtype=uint64))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 7081020110776498800
 E           Max relative difference: 0.96507699
 E            x: array(256239199074544210, dtype=uint64)
 E            y: array([7337259309851043010], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float16-64] _________________________

 op = 'sum', dtype_str = 'float16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e6e60>, array(-0.9727, dtype=float16), array([-3.516], dtype=float16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 2.543
 E           Max relative difference: 0.723
 E            x: array(-0.9727, dtype=float16)
 E            y: array([-3.516], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float16-128] ________________________

 op = 'sum', dtype_str = 'float16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e64d0>, array(15.69, dtype=float16), array([16.88], dtype=float16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.1875
 E           Max relative difference: 0.0704
 E            x: array(15.69, dtype=float16)
 E            y: array([16.88], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float16-512] ________________________

 op = 'sum', dtype_str = 'float16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e49d0>, array(5.164, dtype=float16), array([19.86], dtype=float16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 14.695
 E           Max relative difference: 0.7397
 E            x: array(5.164, dtype=float16)
 E            y: array([19.86], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float32-64] _________________________

 op = 'sum', dtype_str = 'float32', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e4ee0>, array(-0.9746847, dtype=float32), array([-3.513041], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 2.5383563
 E           Max relative difference: 0.7225524
 E            x: array(-0.974685, dtype=float32)
 E            y: array([-3.513041], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float32-128] ________________________

 op = 'sum', dtype_str = 'float32', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e7eb0>, array(15.681145, dtype=float32), array([9.412781], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 6.268364
 E           Max relative difference: 0.6659418
 E            x: array(15.681145, dtype=float32)
 E            y: array([9.412781], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float32-512] ________________________

 op = 'sum', dtype_str = 'float32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e5630>, array(5.160534, dtype=float32), array([25.73103], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 20.570496
 E           Max relative difference: 0.7994432
 E            x: array(5.160534, dtype=float32)
 E            y: array([25.73103], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float64-64] _________________________

 op = 'sum', dtype_str = 'float64', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e5ab0>, array(-0.97468467), array([-3.51304062]))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 2.53835595
 E           Max relative difference: 0.7225524
 E            x: array(-0.974685)
 E            y: array([-3.513041])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float64-128] ________________________

 op = 'sum', dtype_str = 'float64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e5000>, array(15.68114559), array([9.41278196]))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 6.26836363
 E           Max relative difference: 0.66594166
 E            x: array(15.681146)
 E            y: array([9.412782])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-float64-512] ________________________

 op = 'sum', dtype_str = 'float64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e5240>, array(5.16053454), array([8.03491013]))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 2.87437559
 E           Max relative difference: 0.35773587
 E            x: array(5.160535)
 E            y: array([8.03491])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[sum-bfloat16-64] ________________________

 op = 'sum', dtype_str = 'bfloat16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e6d40>, array(-0.9765625, dtype=float32), array([1.53125], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 2.5078125
 E           Max relative difference: 1.6377552
 E            x: array(-0.976562, dtype=float32)
 E            y: array([1.53125], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[sum-bfloat16-128] ________________________

 op = 'sum', dtype_str = 'bfloat16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e7490>, array(15.625, dtype=float32), array([9.375], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 6.25
 E           Max relative difference: 0.6666667
 E            x: array(15.625, dtype=float32)
 E            y: array([9.375], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[sum-bfloat16-512] ________________________

 op = 'sum', dtype_str = 'bfloat16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1032: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c3b8e6290>, array(5.125, dtype=float32), array([37.25], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 32.125
 E           Max relative difference: 0.8624161
 E            x: array(5.125, dtype=float32)
 E            y: array([37.25], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[argmin-int8-512] ________________________

 op = 'argmin', dtype_str = 'int8', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -128, array([-124], dtype=int8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 4
 E           Max relative difference: 0.03225806
 E            x: array(-128, dtype=int8)
 E            y: array([-124], dtype=int8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[argmin-int16-64] ________________________

 op = 'argmin', dtype_str = 'int16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -30994, array([-28319], dtype=int16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 2675
 E           Max relative difference: 0.09445955
 E            x: array(-30994, dtype=int16)
 E            y: array([-28319], dtype=int16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmin-int16-128] ________________________

 op = 'argmin', dtype_str = 'int16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -32695, array([-28319], dtype=int16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 4376
 E           Max relative difference: 0.15452523
 E            x: array(-32695, dtype=int16)
 E            y: array([-28319], dtype=int16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmin-int32-512] ________________________

 op = 'argmin', dtype_str = 'int32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -2142656538, array([-2105014772], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 37641766
 E           Max relative difference: 0.01788195
 E            x: array(-2142656538, dtype=int32)
 E            y: array([-2105014772], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[argmin-uint8-64] ________________________

 op = 'argmin', dtype_str = 'uint8', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 4, array([7], dtype=uint8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 3
 E           Max relative difference: 0.42857143
 E            x: array(4, dtype=uint8)
 E            y: array([7], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmin-uint8-512] ________________________

 op = 'argmin', dtype_str = 'uint8', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 1, array([4], dtype=uint8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 3
 E           Max relative difference: 0.75
 E            x: array(1, dtype=uint8)
 E            y: array([4], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmin-uint16-64] ________________________

 op = 'argmin', dtype_str = 'uint16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 1774, array([4449], dtype=uint16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 2675
 E           Max relative difference: 0.60125871
 E            x: array(1774, dtype=uint16)
 E            y: array([4449], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmin-uint16-128] _______________________

 op = 'argmin', dtype_str = 'uint16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 73, array([4449], dtype=uint16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 4376
 E           Max relative difference: 0.98359182
 E            x: array(73, dtype=uint16)
 E            y: array([4449], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmin-uint64-128] _______________________

 op = 'argmin', dtype_str = 'uint64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 289543649831356132, array([537851658454200097], dtype=uint64))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 248308008622843965
 E           Max relative difference: 0.46166634
 E            x: array(289543649831356132, dtype=uint64)
 E            y: array([537851658454200097], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmin-float16-128] _______________________

 op = 'argmin', dtype_str = 'float16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -3.4, array([-1.8545], dtype=float16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.546
 E           Max relative difference: 0.8335
 E            x: array(-3.4, dtype=float16)
 E            y: array([-1.8545], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmin-float64-128] _______________________

 op = 'argmin', dtype_str = 'float64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -3.3999307792224305, array([-2.96012262]))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.43980816
 E           Max relative difference: 0.14857768
 E            x: array(-3.399931)
 E            y: array([-2.960123])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmin-bfloat16-64] _______________________

 op = 'argmin', dtype_str = 'bfloat16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -2.953125, array([-1.8515625], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.1015625
 E           Max relative difference: 0.5949367
 E            x: array(-2.953125, dtype=float32)
 E            y: array([-1.851562], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmin-bfloat16-128] ______________________

 op = 'argmin', dtype_str = 'bfloat16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, -3.390625, array([-2.953125], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.4375
 E           Max relative difference: 0.14814815
 E            x: array(-3.390625, dtype=float32)
 E            y: array([-2.953125], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[argmax-int8-128] ________________________

 op = 'argmax', dtype_str = 'int8', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 124, array([104], dtype=int8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 20
 E           Max relative difference: 0.19230769
 E            x: array(124, dtype=int8)
 E            y: array([104], dtype=int8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[argmax-int8-512] ________________________

 op = 'argmax', dtype_str = 'int8', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 125, array([124], dtype=int8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1
 E           Max relative difference: 0.00806452
 E            x: array(125, dtype=int8)
 E            y: array([124], dtype=int8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-int16-512] ________________________

 op = 'argmax', dtype_str = 'int16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 32505, array([32287], dtype=int16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 218
 E           Max relative difference: 0.00675194
 E            x: array(32505, dtype=int16)
 E            y: array([32287], dtype=int16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[argmax-int32-64] ________________________

 op = 'argmax', dtype_str = 'int32', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 1914194698, array([1423197210], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 490997488
 E           Max relative difference: 0.3449961
 E            x: array(1914194698, dtype=int32)
 E            y: array([1423197210], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-int32-128] ________________________

 op = 'argmax', dtype_str = 'int32', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 2080411454, array([1991032693], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 89378761
 E           Max relative difference: 0.04489065
 E            x: array(2080411454, dtype=int32)
 E            y: array([1991032693], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-int32-512] ________________________

 op = 'argmax', dtype_str = 'int32', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 2146553285, array([2145576731], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 976554
 E           Max relative difference: 0.00045515
 E            x: array(2146553285, dtype=int32)
 E            y: array([2145576731], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-int64-512] ________________________

 op = 'argmax', dtype_str = 'int64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 9219376159251228126, array([9200524866720374729]))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 18851292530853397
 E           Max relative difference: 0.00204894
 E            x: array(9219376159251228126)
 E            y: array([9200524866720374729])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ________________________ test_reduce1d[argmax-uint8-64] ________________________

 op = 'argmax', dtype_str = 'uint8', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 252, array([225], dtype=uint8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 27
 E           Max relative difference: 0.12
 E            x: array(252, dtype=uint8)
 E            y: array([225], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-uint8-128] ________________________

 op = 'argmax', dtype_str = 'uint8', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 252, array([232], dtype=uint8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 20
 E           Max relative difference: 0.0862069
 E            x: array(252, dtype=uint8)
 E            y: array([232], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-uint8-512] ________________________

 op = 'argmax', dtype_str = 'uint8', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 253, array([252], dtype=uint8))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1
 E           Max relative difference: 0.00396825
 E            x: array(253, dtype=uint8)
 E            y: array([252], dtype=uint8)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-uint16-512] _______________________

 op = 'argmax', dtype_str = 'uint16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 65273, array([65055], dtype=uint16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 218
 E           Max relative difference: 0.00335101
 E            x: array(65273, dtype=uint16)
 E            y: array([65055], dtype=uint16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-uint32-64] ________________________

 op = 'argmax', dtype_str = 'uint32', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 4061678346, array([3570680858], dtype=uint32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 490997488
 E           Max relative difference: 0.13750809
 E            x: array(4061678346, dtype=uint32)
 E            y: array([3570680858], dtype=uint32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-uint32-128] _______________________

 op = 'argmax', dtype_str = 'uint32', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 4227895102, array([4138516341], dtype=uint32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 89378761
 E           Max relative difference: 0.02159681
 E            x: array(4227895102, dtype=uint32)
 E            y: array([4138516341], dtype=uint32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-uint64-128] _______________________

 op = 'argmax', dtype_str = 'uint64', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 18225506070727455174, array([18158671194054521614], dtype=uint64))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 66834876672933560
 E           Max relative difference: 0.0036806
 E            x: array(18225506070727455174, dtype=uint64)
 E            y: array([18158671194054521614], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmax-float16-128] _______________________

 op = 'argmax', dtype_str = 'float16', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.346, array([2.172], dtype=float16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.174
 E           Max relative difference: 0.5405
 E            x: array(3.346, dtype=float16)
 E            y: array([2.172], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmax-float16-512] _______________________

 op = 'argmax', dtype_str = 'float16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.346, array([2.871], dtype=float16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.4746
 E           Max relative difference: 0.1653
 E            x: array(3.346, dtype=float16)
 E            y: array([2.871], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce1d[argmax-float32-64] _______________________

 op = 'argmax', dtype_str = 'float32', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.3451574, array([2.171257], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.1739004
 E           Max relative difference: 0.5406547
 E            x: array(3.345157, dtype=float32)
 E            y: array([2.171257], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmax-float32-128] _______________________

 op = 'argmax', dtype_str = 'float32', shape = 128, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.3451574, array([2.171257], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.1739004
 E           Max relative difference: 0.5406547
 E            x: array(3.345157, dtype=float32)
 E            y: array([2.171257], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmax-float64-512] _______________________

 op = 'argmax', dtype_str = 'float64', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.3451573949029427, array([2.87112692]))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.47403047
 E           Max relative difference: 0.16510258
 E            x: array(3.345157)
 E            y: array([2.871127])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmax-bfloat16-64] _______________________

 op = 'argmax', dtype_str = 'bfloat16', shape = 64, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.34375, array([2.15625], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.1875
 E           Max relative difference: 0.5507246
 E            x: array(3.34375, dtype=float32)
 E            y: array([2.15625], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce1d[argmax-bfloat16-512] ______________________

 op = 'argmax', dtype_str = 'bfloat16', shape = 512, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape",
                             [(op, dtype, shape)
                              for op in ['min', 'max', 'sum', 'argmin', 'argmax']
                              for dtype in dtypes_with_bfloat16
                              for shape in [32, 64, 128, 512]])
    def test_reduce1d(op, dtype_str, shape, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK: tl.constexpr):
            x = tl.load(X + tl.arange(0, BLOCK))
            tl.store(Z, GENERATE_TEST_HERE)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=0)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x, device=device)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        # numpy result
        z_dtype_str = 'int32' if op in ('argmin', 'argmax') else dtype_str
        z_tri_dtype_str = z_dtype_str
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
            z_tri_dtype_str = 'bfloat16'
        else:
            z_ref = numpy_op(x).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((1,), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK=shape)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
 >               np.testing.assert_equal(x[z_ref], x[z_tri])

 python/test/unit/language/test_core_amd.py:1037: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, 3.34375, array([2.859375], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.484375
 E           Max relative difference: 0.1693989
 E            x: array(3.34375, dtype=float32)
 E            y: array([2.859375], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _______________________ test_reduce2d[sum-int8-shape2-1] _______________________

 op = 'sum', dtype_str = 'int8', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c385137f0>, array([-2749], dtype=int32), array([-3738], dtype=int32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 989
 E           Max relative difference: 0.26457999
 E            x: array([-2749], dtype=int32)
 E            y: array([-3738], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce2d[min-int16-shape3-1] _______________________

 op = 'min', dtype_str = 'int16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([-32695], dtype=int32), array([-32364], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 331
 E           Max relative difference: 0.01022741
 E            x: array([-32695], dtype=int32)
 E            y: array([-32364], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce2d[sum-int16-shape5-1] _______________________

 op = 'sum', dtype_str = 'int16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c38512b00>, array([646106], dtype=int32), array([992578], dtype=int32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 346472
 E           Max relative difference: 0.34906274
 E            x: array([646106], dtype=int32)
 E            y: array([992578], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce2d[min-int32-shape6-1] _______________________

 op = 'min', dtype_str = 'int32', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([-2142656538], dtype=int32), array([-2140975822], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1680716
 E           Max relative difference: 0.00078502
 E            x: array([-2142656538], dtype=int32)
 E            y: array([-2140975822], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce2d[sum-int32-shape8-1] _______________________

 op = 'sum', dtype_str = 'int32', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c38511510>, array([-1444101757], dtype=int32), array([226532592], dtype=int32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1670634349
 E           Max relative difference: 7.3748079
 E            x: array([-1444101757], dtype=int32)
 E            y: array([226532592], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ______________________ test_reduce2d[sum-int64-shape11-1] ______________________

 op = 'sum', dtype_str = 'int64', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c10f748b0>, array([-170495476646359235]), array([3037587888588208864]))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 3208083365234568099
 E           Max relative difference: 1.05612857
 E            x: array([-170495476646359235])
 E            y: array([3037587888588208864])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[min-uint16-shape15-1] ______________________

 op = 'min', dtype_str = 'uint16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([73], dtype=int32), array([127], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 54
 E           Max relative difference: 0.42519685
 E            x: array([73], dtype=int32)
 E            y: array([127], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[max-uint16-shape16-1] ______________________

 op = 'max', dtype_str = 'uint16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([65521], dtype=int32), array([65242], dtype=int32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 279
 E           Max relative difference: 0.00427639
 E            x: array([65521], dtype=int32)
 E            y: array([65242], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[sum-uint16-shape17-1] ______________________

 op = 'sum', dtype_str = 'uint16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c38512dd0>, array([34200538], dtype=int32), array([34547010], dtype=int32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 346472
 E           Max relative difference: 0.010029
 E            x: array([34200538], dtype=int32)
 E            y: array([34547010], dtype=int32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[min-uint32-shape18-1] ______________________

 op = 'min', dtype_str = 'uint32', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([4827110], dtype=uint32), array([6507826], dtype=uint32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1680716
 E           Max relative difference: 0.25826075
 E            x: array([4827110], dtype=uint32)
 E            y: array([6507826], dtype=uint32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[sum-uint32-shape20-1] ______________________

 op = 'sum', dtype_str = 'uint32', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c10f75360>, array([2850865539], dtype=uint32), array([1961247718], dtype=uint32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 889617821
 E           Max relative difference: 0.45359789
 E            x: array([2850865539], dtype=uint32)
 E            y: array([1961247718], dtype=uint32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[sum-uint64-shape23-1] ______________________

 op = 'sum', dtype_str = 'uint64', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c38513370>, array([18276248597063192381], dtype=uint64), array([3037587888588208864], dtype=uint64))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 3208083365234568099
 E           Max relative difference: 1.05612857
 E            x: array([18276248597063192381], dtype=uint64)
 E            y: array([3037587888588208864], dtype=uint64)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[min-float16-shape24-1] _____________________

 op = 'min', dtype_str = 'float16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([-3.4], dtype=float16), array([-3.252], dtype=float16))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.1484
 E           Max relative difference: 0.04565
 E            x: array([-3.4], dtype=float16)
 E            y: array([-3.252], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[sum-float16-shape26-1] _____________________

 op = 'sum', dtype_str = 'float16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c385124d0>, array([16.64], dtype=float16), array([-8.64], dtype=float16))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 25.28
 E           Max relative difference: 2.926
 E            x: array([16.64], dtype=float16)
 E            y: array([-8.64], dtype=float16)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[min-float32-shape27-1] _____________________

 op = 'min', dtype_str = 'float32', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([-3.3999307], dtype=float32), array([-3.252068], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.14786267
 E           Max relative difference: 0.04546728
 E            x: array([-3.399931], dtype=float32)
 E            y: array([-3.252068], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[max-float32-shape28-1] _____________________

 op = 'max', dtype_str = 'float32', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([3.7074769], dtype=float32), array([2.697435], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 1.010042
 E           Max relative difference: 0.37444535
 E            x: array([3.707477], dtype=float32)
 E            y: array([2.697435], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[sum-float32-shape29-1] _____________________

 op = 'sum', dtype_str = 'float32', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c38510670>, array([16.647068], dtype=float32), array([70.16619], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 53.519123
 E           Max relative difference: 0.762748
 E            x: array([16.647068], dtype=float32)
 E            y: array([70.16619], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[sum-float64-shape32-1] _____________________

 op = 'sum', dtype_str = 'float64', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c10f74790>, array([16.6470651]), array([50.39062963]))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 33.74356453
 E           Max relative difference: 0.66963967
 E            x: array([16.647065])
 E            y: array([50.39063])

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ____________________ test_reduce2d[min-bfloat16-shape33-1] _____________________

 op = 'min', dtype_str = 'bfloat16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([-3.390625], dtype=float32), array([-3.25], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.140625
 E           Max relative difference: 0.04326923
 E            x: array([-3.390625], dtype=float32)
 E            y: array([-3.25], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ____________________ test_reduce2d[max-bfloat16-shape34-1] _____________________

 op = 'max', dtype_str = 'bfloat16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([3.703125], dtype=float32), array([3.203125], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 0.5
 E           Max relative difference: 0.15609756
 E            x: array([3.703125], dtype=float32)
 E            y: array([3.203125], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 ____________________ test_reduce2d[sum-bfloat16-shape35-1] _____________________

 op = 'sum', dtype_str = 'bfloat16', shape = (1, 1024), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c38510ee0>, array([16.625], dtype=float32), array([-66.5], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 1 / 1 (100%)
 E           Max absolute difference: 83.125
 E           Max relative difference: 1.25
 E            x: array([16.625], dtype=float32)
 E            y: array([-66.5], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[min-float32-shape41-1] _____________________

 op = 'min', dtype_str = 'float32', shape = (4, 128), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([-3.3999307, -2.2161243, -2.759191 , -3.252068 ], dtype=float32), array([-3.3999307, -1.9640115, -2.759191 , -3.252068 ], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 1 / 4 (25%)
 E           Max absolute difference: 0.25211275
 E           Max relative difference: 0.12836622
 E            x: array([-3.399931, -2.216124, -2.759191, -3.252068], dtype=float32)
 E            y: array([-3.399931, -1.964012, -2.759191, -3.252068], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[max-float32-shape47-1] _____________________

 op = 'max', dtype_str = 'float32', shape = (4, 128), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
            np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)
        else:
            if op in ('argmin', 'argmax'):
                # argmin and argmax can have multiple valid indices.
                # so instead we compare the values pointed by indices
                z_ref_index = np.expand_dims(z_ref, axis=axis)
                z_tri_index = np.expand_dims(z_tri, axis=axis)
                z_ref_value = np.take_along_axis(x, z_ref_index, axis=axis)
                z_tri_value = np.take_along_axis(x, z_tri_index, axis=axis)
                np.testing.assert_equal(z_ref_value, z_tri_value)
            else:
 >               np.testing.assert_equal(z_ref, z_tri)

 python/test/unit/language/test_core_amd.py:1120: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<built-in function eq>, array([3.3451574, 2.052304 , 2.697435 , 3.2147171], dtype=float32), array([3.3451574, 1.9552755, 1.6326001, 3.2147171], dtype=float32))
 kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Arrays are not equal
 E           
 E           Mismatched elements: 2 / 4 (50%)
 E           Max absolute difference: 1.0648348
 E           Max relative difference: 0.6522325
 E            x: array([3.345157, 2.052304, 2.697435, 3.214717], dtype=float32)
 E            y: array([3.345157, 1.955276, 1.6326  , 3.214717], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 _____________________ test_reduce2d[sum-float32-shape53-1] _____________________

 op = 'sum', dtype_str = 'float32', shape = (4, 128), axis = 1, device = 'cuda'

    @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
    def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
        check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
    
        # triton kernel
        @triton.jit
        def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, AXIS: tl.constexpr):
            range_m = tl.arange(0, BLOCK_M)
            range_n = tl.arange(0, BLOCK_N)
            x = tl.load(X + range_m[:, None] * BLOCK_N + range_n[None, :])
            z = GENERATE_TEST_HERE
            if AXIS == 1:
                tl.store(Z + range_m, z)
            else:
                tl.store(Z + range_n, z)
    
        kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.{op}(x, axis=AXIS)'})
        # input
        rs = RandomState(17)
        # limit the range of integers so that the sum does not overflow
        x = numpy_random(shape, dtype_str=dtype_str, rs=rs)
        x_tri = to_triton(x)
        numpy_op = {'sum': np.sum, 'max': np.max, 'min': np.min,
                    'argmin': np.argmin, 'argmax': np.argmax}[op]
        z_dtype_str = get_reduced_dtype(dtype_str, op)
        z_tri_dtype_str = z_dtype_str
        # numpy result
        if op not in ['argmin', 'argmax'] and dtype_str == 'bfloat16':
            z_dtype_str = 'float32'
            z_tri_dtype_str = 'bfloat16'
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
            # trunc mantissa for a fair comparison of accuracy
            z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
        else:
            z_ref = numpy_op(x, axis=axis).astype(getattr(np, z_dtype_str))
        # triton result
        z_tri = to_triton(numpy_random((shape[1 - axis],), dtype_str=z_dtype_str, rs=rs),
                          device=device, dst_type=z_tri_dtype_str)
        kernel[(1,)](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], AXIS=axis)
        z_tri = to_numpy(z_tri)
        # compare
        if op == 'sum':
 >           np.testing.assert_allclose(z_ref, z_tri, rtol=0.01)

 python/test/unit/language/test_core_amd.py:1109: 
 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

 args = (<function assert_allclose.<locals>.compare at 0x7f4c1086a830>, array([ 15.681145  ,  -0.02656412, -19.793098  ,   9.299051  ],
      dtype=float32), array([ 33.31166 ,   9.564828, -10.945043,  12.982364], dtype=float32))
 kwds = {'equal_nan': True, 'err_msg': '', 'header': 'Not equal to tolerance rtol=0.01, atol=0', 'verbose': True}

    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
 >           return func(*args, **kwds)
 E           AssertionError: 
 E           Not equal to tolerance rtol=0.01, atol=0
 E           
 E           Mismatched elements: 4 / 4 (100%)
 E           Max absolute difference: 17.630516
 E           Max relative difference: 1.0027773
 E            x: array([ 15.681145,  -0.026564, -19.793098,   9.299051], dtype=float32)
 E            y: array([ 33.31166 ,   9.564828, -10.945043,  12.982364], dtype=float32)

 /usr/lib/python3.10/contextlib.py:79: AssertionError
 =============================== warnings summary ===============================
 test/unit/language/test_core_amd.py: 38 warnings
  /root/triton/python/test/unit/language/test_core_amd.py:211: RuntimeWarning: overflow encountered in cast
    z_ref = z_ref.astype(dtype_z)

 test/unit/language/test_core_amd.py::test_atomic_rmw[add-uint32-min_neg]
 test/unit/language/test_core_amd.py::test_atomic_rmw[max-uint32-min_neg]
 test/unit/language/test_core_amd.py::test_atomic_rmw[min-uint32-min_neg]
  /root/triton/python/test/unit/language/test_core_amd.py:697: RuntimeWarning: overflow encountered in scalar negative
    x[idx] = -np.max(np.abs(x)) - 1

 test/unit/language/test_core_amd.py: 100 warnings
  /root/triton/python/test/unit/language/test_core_amd.py:1552: FutureWarning: `torch.testing.assert_allclose()` is deprecated since 1.12 and will be removed in a future release. Please use `torch.testing.assert_close()` instead. You can find detailed upgrade instructions in https://github.com/pytorch/pytorch/issues/61844.
    torch.testing.assert_allclose(output, reference_out)

 test/unit/language/test_core_amd.py::test_masked_load_shared_memory[dtype1]
 test/unit/language/test_core_amd.py::test_masked_load_shared_memory[dtype2]
  /root/triton/python/test/unit/language/test_core_amd.py:1606: FutureWarning: `torch.testing.assert_allclose()` is deprecated since 1.12 and will be removed in a future release. Please use `torch.testing.assert_close()` instead. You can find detailed upgrade instructions in https://github.com/pytorch/pytorch/issues/61844.
    torch.testing.assert_allclose(out, reference_out, atol=1e-2, rtol=0)

 -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
 =========================== short test summary info ============================
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-int8-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-int16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-int32-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-int32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-int64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-int64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint8-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint8-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-uint64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-float16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-float16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-float32-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-float64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-bfloat16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[min-bfloat16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-int8-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-int16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-int32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-int64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint8-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-uint64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-float16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-float32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-float64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-bfloat16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[max-bfloat16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int8-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int8-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int8-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int32-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int32-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int64-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-int64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint8-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint8-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint8-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint32-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint32-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint64-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-uint64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float32-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float32-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float64-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-float64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-bfloat16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-bfloat16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[sum-bfloat16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int8-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-int32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint8-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint8-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-uint64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-float64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-bfloat16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmin-bfloat16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int8-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int8-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int32-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int32-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int32-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-int64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint8-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint8-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint8-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint32-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint32-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-uint64-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float16-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float32-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float32-128]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-float64-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-bfloat16-64]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce1d[argmax-bfloat16-512]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-int8-shape2-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[min-int16-shape3-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-int16-shape5-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[min-int32-shape6-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-int32-shape8-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-int64-shape11-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[min-uint16-shape15-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[max-uint16-shape16-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-uint16-shape17-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[min-uint32-shape18-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-uint32-shape20-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-uint64-shape23-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[min-float16-shape24-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float16-shape26-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape27-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape28-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape29-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float64-shape32-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[min-bfloat16-shape33-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[max-bfloat16-shape34-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-bfloat16-shape35-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[min-float32-shape41-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[max-float32-shape47-1]
 FAILED python/test/unit/language/test_core_amd.py::test_reduce2d[sum-float32-shape53-1]
 SKIPPED [1] python/test/unit/language/test_core_amd.py:1137: Not supported: memory out of resource.
 SKIPPED [48] python/test/unit/language/test_core_amd.py:1257: Not supported: memory out of resource.
 SKIPPED [1] python/test/unit/language/test_core_amd.py:1562: test_masked_load_shared_memory[bfloat16] is only supported on AMDGPU
 SKIPPED [1] python/test/unit/language/test_core_amd.py:1893: math.scalbn has mismatch issues
 SKIPPED [7] python/test/unit/language/test_core_amd.py:2181: Skipped
 === 125 failed, 4407 passed, 58 skipped, 143 warnings in 1125.83s (0:18:45) ====