Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created February 22, 2023 17:07
Show Gist options
  • Save pashu123/a4f7b1cc30a81da85ecc35df3c99ac53 to your computer and use it in GitHub Desktop.
Save pashu123/a4f7b1cc30a81da85ecc35df3c99ac53 to your computer and use it in GitHub Desktop.
Args: {'output_path': '.', 'quant_device': 'cuda', 'batch_size': 2, 'bit_width': 8, 'conv_group_size': 16, 'linear_group_size': 16, 'layer_type': 'conv', 'weight_quant': 'group_scale_param', 'input_quant': 'group_dynamic_sym', 'float16': True, 'inline_ts_graph': False, 'onnx_export': False, 'make_fx': True, 'quantize': True}
Moving model to cuda...
Run model quantization...
Quantize conv2d: conv
Tracing to FX functional representation...
FX graph model...
graph():
%arg0_1 : [#users=2] = placeholder[target=arg0_1]
%_tensor_constant0 : [#users=1] = get_attr[target=_tensor_constant0]
%lift_fresh_copy : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant0,), kwargs = {})
%view : [#users=1] = call_function[target=torch.ops.aten.view](args = (%arg0_1, [2, 2, 16, 5, 5]), kwargs = {})
%abs_1 : [#users=1] = call_function[target=torch.ops.aten.abs](args = (%view,), kwargs = {})
%max_1 : [#users=2] = call_function[target=torch.ops.aten.max](args = (%abs_1, 2, True), kwargs = {})
%getitem : [#users=1] = call_function[target=operator.getitem](args = (%max_1, 0), kwargs = {})
%getitem_1 : [#users=0] = call_function[target=operator.getitem](args = (%max_1, 1), kwargs = {})
%view_1 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%getitem, [2, 2, 1, 5, 5]), kwargs = {})
%expand : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%view_1, [2, 2, 16, 5, 5]), kwargs = {})
%clone : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%expand,), kwargs = {memory_format: torch.contiguous_format})
%view_2 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%clone, [2, 32, 5, 5]), kwargs = {})
%div : [#users=2] = call_function[target=torch.ops.aten.div](args = (%view_2, 128), kwargs = {})
%_tensor_constant1 : [#users=1] = get_attr[target=_tensor_constant1]
%detach : [#users=1] = call_function[target=torch.ops.aten.detach](args = (%_tensor_constant1,), kwargs = {})
%detach_1 : [#users=2] = call_function[target=torch.ops.aten.detach](args = (%detach,), kwargs = {})
%div_1 : [#users=1] = call_function[target=torch.ops.aten.div](args = (%arg0_1, %div), kwargs = {})
%add : [#users=1] = call_function[target=torch.ops.aten.add](args = (%div_1, %detach_1), kwargs = {})
%round_1 : [#users=1] = call_function[target=torch.ops.aten.round](args = (%add,), kwargs = {})
%clamp : [#users=1] = call_function[target=torch.ops.aten.clamp](args = (%round_1, -128, 127), kwargs = {})
%sub : [#users=1] = call_function[target=torch.ops.aten.sub](args = (%clamp, %detach_1), kwargs = {})
%mul : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%sub, %div), kwargs = {})
%_tensor_constant2 : [#users=1] = get_attr[target=_tensor_constant2]
%lift_fresh_copy_1 : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant2,), kwargs = {})
%_tensor_constant3 : [#users=1] = get_attr[target=_tensor_constant3]
%lift_fresh_copy_2 : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant3,), kwargs = {})
%_tensor_constant4 : [#users=1] = get_attr[target=_tensor_constant4]
%lift_fresh_copy_3 : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant4,), kwargs = {})
%_tensor_constant5 : [#users=1] = get_attr[target=_tensor_constant5]
%expand_1 : [#users=1] = call_function[target=torch.ops.aten.expand](args = (%_tensor_constant5, [2, 2, 16, 3, 3]), kwargs = {})
%clone_1 : [#users=1] = call_function[target=torch.ops.aten.clone](args = (%expand_1,), kwargs = {memory_format: torch.contiguous_format})
%view_3 : [#users=1] = call_function[target=torch.ops.aten.view](args = (%clone_1, [2, 32, 3, 3]), kwargs = {})
%_tensor_constant6 : [#users=1] = get_attr[target=_tensor_constant6]
%mul_1 : [#users=1] = call_function[target=torch.ops.aten.mul](args = (%_tensor_constant6, %view_3), kwargs = {})
%_tensor_constant7 : [#users=1] = get_attr[target=_tensor_constant7]
%lift_fresh_copy_4 : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant7,), kwargs = {})
%_tensor_constant8 : [#users=1] = get_attr[target=_tensor_constant8]
%lift_fresh_copy_5 : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant8,), kwargs = {})
%_tensor_constant9 : [#users=1] = get_attr[target=_tensor_constant9]
%lift_fresh_copy_6 : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant9,), kwargs = {})
%_param_constant0 : [#users=1] = get_attr[target=_param_constant0]
%convolution : [#users=1] = call_function[target=torch.ops.aten.convolution](args = (%mul, %mul_1, %_param_constant0, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
%_tensor_constant10 : [#users=1] = get_attr[target=_tensor_constant10]
%lift_fresh_copy_7 : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant10,), kwargs = {})
%_tensor_constant11 : [#users=1] = get_attr[target=_tensor_constant11]
%lift_fresh_copy_8 : [#users=0] = call_function[target=torch.ops.aten.lift_fresh_copy](args = (%_tensor_constant11,), kwargs = {})
return convolution
# Linalg IR
// Linalg-on-tensors lowering of a group-wise dynamically-quantized conv2d
// (batch=2, C_in=32 split into 2 groups of 16, C_out=2, 3x3 kernel).
// Pipeline visible below:
//   1. input: per-group abs-max -> scale = max/128 (dynamic symmetric int8 fake-quant)
//   2. fake-quantize input: div by scale, add zero-point (0), roundeven, clamp [-128, 127]
//   3. dequantize: sub zero-point, mul scale
//   4. weights: stored int8 (%cst_0), dequantized with per-group f16 scales (%cst_1)
//   5. conv_2d_nchw_fchw with per-output-channel bias (%cst) broadcast as the init tensor
// Identity map over the 5-D grouped view (b, group, ch-in-group, h, w).
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
// Same as #map but pinned to index 0 on d2: used for the reduction over the
// group-channel dimension and for broadcasting the reduced max back out.
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, 0, d3, d4)>
// Identity map over the collapsed 4-D NCHW view.
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
// Scalar (rank-0) broadcast, used for the zero-point tensor<f16>.
#map3 = affine_map<(d0, d1, d2, d3) -> ()>
// Broadcast a per-output-channel vector (indexed by d1) over NCHW output.
#map4 = affine_map<(d0, d1, d2, d3) -> (d1)>
module attributes {torch.debug_module_name = "_lambda"} {
// RNG seed state emitted by the torch-mlir lowering; not read in this function.
ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
func.func @forward(%arg0: tensor<2x32x5x5xf16>) -> tensor<2x2x3x3xf16> {
// Conv bias, one f16 per output channel (broadcast via #map4 into the conv init).
%cst = arith.constant dense<[4.827880e-02, 6.393430e-03]> : tensor<2xf16>
// Quantized conv weights as raw int8 values (2x32x3x3).
%cst_0 = arith.constant dense<"0xD0285B67B6D9816E05B2B2AFDEADBC320D42510BF670057FA034A01AEFFE9666C02F2F4038CEE2E5E79C2F49400E467F8B7F775C187F8949FF81A009C4591DAD33C20FF7D356852E5CE1C863C80A81BB98B653D575158688471D25EE05D41EEEF81B108181FD716948E3B8253A436CA26CE0690437B089E6ED9CB7DAFF0FA1684BDD2BF70D12F5F25DB09DB77489812D50122CE520178481C0757FE17C075FBD9CEF8B47245CCA6721950E301C7F33CC727F88FBA7D3FB65BA7F6510C1D4A6972CAD53587181415FD3240E5E5D4A344678D70A3A8438639AEE75428A7FE143B86BF1B3C7E365DE25F5DA195C4D0E696E9A683A4A821D782926742C4414ECBAE18E20484E2B832DA3C294DCE2FFBC7F841F5E9251C0B703A8A2D681D3503B5A3BCD94983D7F7F988EF20FED3DAF8C07A54C0455053E4D8B5F757F98091C0495BF7BAE3E466B93ED81745C72C2F756ED04FB681FFA57FA828A942F39E6D9117FF3A842AF2454A9C4E5567FFF67E4446163F614EBCDEBDDE819F9B91FE47FF4B1701382FF35BE7E704DCC7FEC033C9FDBEA7FAD5451787F7A043D9CCCE9DC5828D4B1B9DA1F38492630E66923D72FE582BF4271F7230481818123DCD844B3C06B68A4DC708133C93F3C7BAC0F3D044AD130130FDE174881E01C843991042D2A69A8A8C456BCD089914810EE4C852DD43DE0937667C181DBF4C6DA3CC57FD10A18A4447F74241C2618CACD912DE2D80967E3FB8959838128FCFB61F8392B578D76DC7FF1E3B10BB839A52F9C8C32D11FFFF6761646B7BC0E0739FE93D52DFB0A40299F"> : tensor<2x32x3x3xi8>
// Per-group weight dequantization scales (group dim d2 has extent 1,
// broadcast to the 16 channels of each group via #map1 below).
%cst_1 = arith.constant dense<[[[[[4.537110e-04, 4.565720e-04, 4.570480e-04], [4.465580e-04, 4.248620e-04, 4.377370e-04], [4.544260e-04, 4.503730e-04, 4.482270e-04]]], [[[4.372600e-04, 4.158020e-04, 4.422660e-04], [4.563330e-04, 4.177090e-04, 4.560950e-04], [4.410740e-04, 4.377370e-04, 4.374980e-04]]]], [[[[4.606250e-04, 4.541870e-04, 4.444120e-04], [4.489420e-04, 4.236700e-04, 3.855230e-04], [4.563330e-04, 4.637240e-04, 4.191400e-04]]], [[[4.277230e-04, 3.893380e-04, 4.627700e-04], [4.460810e-04, 4.467960e-04, 4.429820e-04], [4.401210e-04, 4.448890e-04, 4.282000e-04]]]]]> : tensor<2x2x1x3x3xf16>
// Zero-point = 0 (symmetric quantization; added then subtracted below).
%cst_2 = arith.constant dense<0.000000e+00> : tensor<f16>
// -65504 = most-negative finite f16; init value for the max reduction.
%cst_3 = arith.constant -6.550400e+04 : f16
// Scale divisor: abs-max / 128 gives the per-group input scale.
%cst_4 = arith.constant 1.280000e+02 : f16
// int8 clamp bounds.
%cst_5 = arith.constant -1.280000e+02 : f16
%cst_6 = arith.constant 1.270000e+02 : f16
%c0_i64 = arith.constant 0 : i64
// Reshape NCHW input into (batch, group, ch-in-group, h, w) = 2x2x16x5x5.
%expanded = tensor.expand_shape %arg0 [[0], [1, 2], [3], [4]] : tensor<2x32x5x5xf16> into tensor<2x2x16x5x5xf16>
%0 = tensor.empty() : tensor<2x2x16x5x5xf16>
// |x| elementwise (aten.abs).
%1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<2x2x16x5x5xf16>) outs(%0 : tensor<2x2x16x5x5xf16>) {
^bb0(%in: f16, %out: f16):
%23 = math.absf %in : f16
linalg.yield %23 : f16
} -> tensor<2x2x16x5x5xf16>
%2 = tensor.empty() : tensor<2x2x1x5x5xi64>
%3 = linalg.fill ins(%c0_i64 : i64) outs(%2 : tensor<2x2x1x5x5xi64>) -> tensor<2x2x1x5x5xi64>
%4 = tensor.empty() : tensor<2x2x1x5x5xf16>
%5 = linalg.fill ins(%cst_3 : f16) outs(%4 : tensor<2x2x1x5x5xf16>) -> tensor<2x2x1x5x5xf16>
// aten.max(dim=2, keepdim=True): reduces the 16-channel group dim, yielding
// both the max value (%6#0) and its index (%6#1); the index result is unused.
%6:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel"]} ins(%1 : tensor<2x2x16x5x5xf16>) outs(%5, %3 : tensor<2x2x1x5x5xf16>, tensor<2x2x1x5x5xi64>) {
^bb0(%in: f16, %out: f16, %out_8: i64):
%23 = linalg.index 2 : index
%24 = arith.index_cast %23 : index to i64
%25 = arith.maxf %in, %out : f16
%26 = arith.cmpf ogt, %in, %out : f16
%27 = arith.select %26, %24, %out_8 : i64
linalg.yield %25, %27 : f16, i64
} -> (tensor<2x2x1x5x5xf16>, tensor<2x2x1x5x5xi64>)
// Broadcast the per-group max back over the 16 group channels (aten.expand).
%7 = linalg.generic {indexing_maps = [#map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6#0 : tensor<2x2x1x5x5xf16>) outs(%0 : tensor<2x2x16x5x5xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x2x16x5x5xf16>
// Back to the flat NCHW layout.
%collapsed = tensor.collapse_shape %7 [[0], [1, 2], [3], [4]] : tensor<2x2x16x5x5xf16> into tensor<2x32x5x5xf16>
%8 = tensor.empty() : tensor<2x32x5x5xf16>
// Input scale = abs-max / 128; reused both to quantize (%10) and dequantize (%15).
%9 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<2x32x5x5xf16>) outs(%8 : tensor<2x32x5x5xf16>) {
^bb0(%in: f16, %out: f16):
%23 = arith.divf %in, %cst_4 : f16
linalg.yield %23 : f16
} -> tensor<2x32x5x5xf16>
// x / scale.
%10 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %9 : tensor<2x32x5x5xf16>, tensor<2x32x5x5xf16>) outs(%8 : tensor<2x32x5x5xf16>) {
^bb0(%in: f16, %in_8: f16, %out: f16):
%23 = arith.divf %in, %in_8 : f16
linalg.yield %23 : f16
} -> tensor<2x32x5x5xf16>
// + zero-point (0).
%11 = linalg.generic {indexing_maps = [#map2, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %cst_2 : tensor<2x32x5x5xf16>, tensor<f16>) outs(%8 : tensor<2x32x5x5xf16>) {
^bb0(%in: f16, %in_8: f16, %out: f16):
%23 = arith.addf %in, %in_8 : f16
linalg.yield %23 : f16
} -> tensor<2x32x5x5xf16>
// Round to nearest even (aten.round).
%12 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11 : tensor<2x32x5x5xf16>) outs(%8 : tensor<2x32x5x5xf16>) {
^bb0(%in: f16, %out: f16):
%23 = math.roundeven %in : f16
linalg.yield %23 : f16
} -> tensor<2x32x5x5xf16>
// clamp(x, -128, 127): max with -128 then min with 127.
%13 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12 : tensor<2x32x5x5xf16>) outs(%8 : tensor<2x32x5x5xf16>) {
^bb0(%in: f16, %out: f16):
%23 = arith.cmpf ult, %in, %cst_5 : f16
%24 = arith.select %23, %cst_5, %in : f16
%25 = arith.cmpf ugt, %24, %cst_6 : f16
%26 = arith.select %25, %cst_6, %24 : f16
linalg.yield %26 : f16
} -> tensor<2x32x5x5xf16>
// - zero-point (0): dequantization step 1.
%14 = linalg.generic {indexing_maps = [#map2, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13, %cst_2 : tensor<2x32x5x5xf16>, tensor<f16>) outs(%8 : tensor<2x32x5x5xf16>) {
^bb0(%in: f16, %in_8: f16, %out: f16):
%23 = arith.subf %in, %in_8 : f16
linalg.yield %23 : f16
} -> tensor<2x32x5x5xf16>
// * scale: dequantization step 2 -> fake-quantized input for the conv.
%15 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%14, %9 : tensor<2x32x5x5xf16>, tensor<2x32x5x5xf16>) outs(%8 : tensor<2x32x5x5xf16>) {
^bb0(%in: f16, %in_8: f16, %out: f16):
%23 = arith.mulf %in, %in_8 : f16
linalg.yield %23 : f16
} -> tensor<2x32x5x5xf16>
%16 = tensor.empty() : tensor<2x2x16x3x3xf16>
// Broadcast the per-group weight scales over the 16 channels of each group.
%17 = linalg.generic {indexing_maps = [#map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%cst_1 : tensor<2x2x1x3x3xf16>) outs(%16 : tensor<2x2x16x3x3xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x2x16x3x3xf16>
%collapsed_7 = tensor.collapse_shape %17 [[0], [1, 2], [3], [4]] : tensor<2x2x16x3x3xf16> into tensor<2x32x3x3xf16>
%18 = tensor.empty() : tensor<2x32x3x3xf16>
// Dequantize int8 weights: sitofp then multiply by the broadcast scales.
%19 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_0, %collapsed_7 : tensor<2x32x3x3xi8>, tensor<2x32x3x3xf16>) outs(%18 : tensor<2x32x3x3xf16>) {
^bb0(%in: i8, %in_8: f16, %out: f16):
%23 = arith.sitofp %in : i8 to f16
%24 = arith.mulf %23, %in_8 : f16
linalg.yield %24 : f16
} -> tensor<2x32x3x3xf16>
%20 = tensor.empty() : tensor<2x2x3x3xf16>
// Materialize the conv init as the bias broadcast over (n, h, w);
// conv_2d_nchw_fchw accumulates into its outs, so this applies the bias.
%21 = linalg.generic {indexing_maps = [#map4, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : tensor<2xf16>) outs(%20 : tensor<2x2x3x3xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<2x2x3x3xf16>
// Stride-1, dilation-1 NCHW convolution: 2x32x5x5 * 2x32x3x3 -> 2x2x3x3.
%22 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%15, %19 : tensor<2x32x5x5xf16>, tensor<2x32x3x3xf16>) outs(%21 : tensor<2x2x3x3xf16>) -> tensor<2x2x3x3xf16>
return %22 : tensor<2x2x3x3xf16>
}
}
PyTorch result:
tensor([[[[ 0.0169, 0.9688, 0.1151],
[ 0.5601, -0.8652, 1.0654],
[ 0.3960, -0.8550, -0.6348]],
[[-0.4011, -0.2404, 0.0652],
[ 0.3621, -0.6089, -0.0455],
[ 0.6440, 0.1541, 0.9761]]],
[[[ 0.4189, 0.2705, -0.2681],
[-0.7920, -0.5820, 0.1659],
[-0.5610, 0.0285, 0.2966]],
[[-0.1343, -1.5732, -0.1599],
[-0.5366, 0.6650, 0.1564],
[ 1.2070, -0.4768, 0.0576]]]], device='cuda:0', dtype=torch.float16,
grad_fn=<ConvolutionBackward0>)
Results: Refbackend torch-mlir
[[[[ 0.01743 0.967 0.1157 ]
[ 0.561 -0.863 1.067 ]
[ 0.3943 -0.854 -0.637 ]]
[[-0.4004 -0.2391 0.0644 ]
[ 0.3618 -0.608 -0.0454 ]
[ 0.645 0.1539 0.9746 ]]]
[[[ 0.4197 0.2705 -0.266 ]
[-0.792 -0.5776 0.1655 ]
[-0.562 0.02806 0.2966 ]]
[[-0.133 -1.575 -0.1598 ]
[-0.5366 0.666 0.1559 ]
[ 1.21 -0.475 0.05792]]]]
Results: IREE-CPU
Target triple found: x86_64-linux-gnu
[[[[ 0.01678 0.969 0.1163 ]
[ 0.5596 -0.8657 1.063 ]
[ 0.3967 -0.8545 -0.6377 ]]
[[-0.4004 -0.2394 0.067 ]
[ 0.3616 -0.6094 -0.0459 ]
[ 0.646 0.154 0.974 ]]]
[[[ 0.4175 0.2703 -0.2666 ]
[-0.795 -0.5854 0.165 ]
[-0.5605 0.02707 0.2961 ]]
[[-0.1333 -1.576 -0.1599 ]
[-0.5366 0.6685 0.1552 ]
[ 1.208 -0.477 0.0583 ]]]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment