AlexeySachkov · August 25, 2021 13:58
diff --git a/README.md b/README.md
diff --git a/benchmark.cpp b/benchmark.cpp
 #include <CL/sycl.hpp>

 #include <iostream>
 #include <fstream>

 using namespace sycl;

 using coeff_t = std::array<std::array<float, 3>, 3>;

 // Read coefficients from somewhere.
 coeff_t get_coefficients() {
  std::ifstream f("coefficients.txt");
  assert(f.good() && "unable to open coefficients.txt");
  coeff_t res;
  for (unsigned i = 0; i < res.size(); ++i) {
    for (unsigned j = 0; j < res[0].size(); ++j) {
      f >> res[i][j];
    }
  }

  return res;
 }

 float convolution(item<2> item_id,
                  const accessor<float, 2, access_mode::read> &in_acc,
                  coeff_t coeff) {
  float res = 0;
  for (int i = -1; i <= 1; i++) {
    if (item_id[0] + i < 0 || item_id[0] + i >= in_acc.get_range()[0])
      continue;

    for (int j = -1; j <= 1; j++) {
      if (item_id[1] + j < 0 || item_id[1] + j >= in_acc.get_range()[1])
        continue;

      res += coeff[i + 1][j + 1] * in_acc[item_id[0] + i][item_id[1] + j];
    }
  }
  return res;
 }

 // Identify the specialization constant.
 constexpr specialization_id<coeff_t> coeff_id;

 class RegularConvolutionKernel;
 class SpecConstantsConvolutionKernel;

 int main(int argc, char *argv[]) {
  // Enable queue profiling to be able to precisely measure kernel execution
  // time
  queue q(gpu_selector{}, property_list{property::queue::enable_profiling()});
  std::cout << "Launching on " << q.get_device().get_info<info::device::name>()
            << std::endl;
  std::cout << "Driver version: "
            << q.get_device().get_info<info::device::driver_version>()
            << std::endl;

  int H = 3840, W = 2160;

  if (argc > 1) {
    H = std::atoi(argv[1]);
  }
  if (argc > 2) {
    W = std::atoi(argv[2]);
  }

  std::cout << "Using buffer of size " << H << " x " << W << std::endl;

  // FIXME: we are using uninitialized data just to demonstrate the performance
  buffer<float, 2> in(range<2>(H, W));
  buffer<float, 2> out(range<2>(H, W));

  auto coefficients = get_coefficients();
  std::cout << "Using filter: " << std::endl;
  for (unsigned i = 0; i < coefficients.size(); ++i) {
    for (unsigned j = 0; j < coefficients[0].size(); ++j) {
      std::cout << coefficients[i][j] << " ";
    }
    std::cout << std::endl;
  }

  auto regular_kernel_event = q.submit([&](handler &cgh) {
    accessor in_acc{in, cgh, read_only};
    accessor out_acc{out, cgh, write_only};

    auto coeff = coefficients;
    cgh.parallel_for<RegularConvolutionKernel>(
        in.get_range(), [=](item<2> item_id) {
      out_acc[item_id] = convolution(item_id, in_acc, coeff);
    });
  });

  q.wait();

  auto spec_constants_kernel_event = q.submit([&](handler &cgh) {
    accessor in_acc{in, cgh, read_only};
    accessor out_acc{out, cgh, write_only};

    cgh.set_specialization_constant<coeff_id>(coefficients);

    cgh.parallel_for<SpecConstantsConvolutionKernel>(
        in.get_range(), [=](item<2> item_id, kernel_handler kh) {
      auto coeff = kh.get_specialization_constant<coeff_id>();
      out_acc[item_id] = convolution(item_id, in_acc, coeff);
    });
  });

  q.wait();

  {
    std::cout << "Regular kernel took: ";
    auto start = regular_kernel_event.template get_profiling_info<
        info::event_profiling::command_start>();
    auto end =
        regular_kernel_event
            .template get_profiling_info<info::event_profiling::command_end>();
    std::cout << (end - start) / 1000000000.0 << " seconds" << std::endl;
  }
  {
    std::cout << "Spec constants kernel took: ";
    auto start = spec_constants_kernel_event.template get_profiling_info<
        info::event_profiling::command_start>();
    auto end =
        spec_constants_kernel_event
            .template get_profiling_info<info::event_profiling::command_end>();
    std::cout << (end - start) / 1000000000.0 << " seconds" << std::endl;
  }

  return 0;
 }
diff --git a/coefficients.txt b/coefficients.txt
 -1 -2 -1
 0 0 0
 1 2 1
	#include <CL/sycl.hpp>

	#include <iostream>
	#include <fstream>

	using namespace sycl;

	using coeff_t = std::array<std::array<float, 3>, 3>;

	// Read coefficients from somewhere.
	coeff_t get_coefficients() {
	std::ifstream f("coefficients.txt");
	assert(f.good() && "unable to open coefficients.txt");
	coeff_t res;
	for (unsigned i = 0; i < res.size(); ++i) {
	for (unsigned j = 0; j < res[0].size(); ++j) {
	f >> res[i][j];
	}
	}

	return res;
	}

	float convolution(item<2> item_id,
	const accessor<float, 2, access_mode::read> &in_acc,
	coeff_t coeff) {
	float res = 0;
	for (int i = -1; i <= 1; i++) {
	if (item_id[0] + i < 0 \|\| item_id[0] + i >= in_acc.get_range()[0])
	continue;

	for (int j = -1; j <= 1; j++) {
	if (item_id[1] + j < 0 \|\| item_id[1] + j >= in_acc.get_range()[1])
	continue;

	res += coeff[i + 1][j + 1] * in_acc[item_id[0] + i][item_id[1] + j];
	}
	}
	return res;
	}

	// Identify the specialization constant.
	constexpr specialization_id<coeff_t> coeff_id;

	class RegularConvolutionKernel;
	class SpecConstantsConvolutionKernel;

	int main(int argc, char *argv[]) {
	// Enable queue profiling to be able to precisely measure kernel execution
	// time
	queue q(gpu_selector{}, property_list{property::queue::enable_profiling()});
	std::cout << "Launching on " << q.get_device().get_info<info::device::name>()
	<< std::endl;
	std::cout << "Driver version: "
	<< q.get_device().get_info<info::device::driver_version>()
	<< std::endl;

	int H = 3840, W = 2160;

	if (argc > 1) {
	H = std::atoi(argv[1]);
	}
	if (argc > 2) {
	W = std::atoi(argv[2]);
	}

	std::cout << "Using buffer of size " << H << " x " << W << std::endl;

	// FIXME: we are using uninitialized data just to demonstrate the performance
	buffer<float, 2> in(range<2>(H, W));
	buffer<float, 2> out(range<2>(H, W));

	auto coefficients = get_coefficients();
	std::cout << "Using filter: " << std::endl;
	for (unsigned i = 0; i < coefficients.size(); ++i) {
	for (unsigned j = 0; j < coefficients[0].size(); ++j) {
	std::cout << coefficients[i][j] << " ";
	}
	std::cout << std::endl;
	}

	auto regular_kernel_event = q.submit([&](handler &cgh) {
	accessor in_acc{in, cgh, read_only};
	accessor out_acc{out, cgh, write_only};

	auto coeff = coefficients;
	cgh.parallel_for<RegularConvolutionKernel>(
	in.get_range(), [=](item<2> item_id) {
	out_acc[item_id] = convolution(item_id, in_acc, coeff);
	});
	});

	q.wait();

	auto spec_constants_kernel_event = q.submit([&](handler &cgh) {
	accessor in_acc{in, cgh, read_only};
	accessor out_acc{out, cgh, write_only};

	cgh.set_specialization_constant<coeff_id>(coefficients);

	cgh.parallel_for<SpecConstantsConvolutionKernel>(
	in.get_range(), [=](item<2> item_id, kernel_handler kh) {
	auto coeff = kh.get_specialization_constant<coeff_id>();
	out_acc[item_id] = convolution(item_id, in_acc, coeff);
	});
	});

	q.wait();

	{
	std::cout << "Regular kernel took: ";
	auto start = regular_kernel_event.template get_profiling_info<
	info::event_profiling::command_start>();
	auto end =
	regular_kernel_event
	.template get_profiling_info<info::event_profiling::command_end>();
	std::cout << (end - start) / 1000000000.0 << " seconds" << std::endl;
	}
	{
	std::cout << "Spec constants kernel took: ";
	auto start = spec_constants_kernel_event.template get_profiling_info<
	info::event_profiling::command_start>();
	auto end =
	spec_constants_kernel_event
	.template get_profiling_info<info::event_profiling::command_end>();
	std::cout << (end - start) / 1000000000.0 << " seconds" << std::endl;
	}

	return 0;
	}