|
#include <CL/sycl.hpp> |
|
|
|
#include <iostream> |
|
#include <fstream> |
|
|
|
using namespace sycl; |
|
|
|
using coeff_t = std::array<std::array<float, 3>, 3>; |
|
|
|
// Read coefficients from somewhere. |
|
coeff_t get_coefficients() { |
|
std::ifstream f("coefficients.txt"); |
|
assert(f.good() && "unable to open coefficients.txt"); |
|
coeff_t res; |
|
for (unsigned i = 0; i < res.size(); ++i) { |
|
for (unsigned j = 0; j < res[0].size(); ++j) { |
|
f >> res[i][j]; |
|
} |
|
} |
|
|
|
return res; |
|
} |
|
|
|
float convolution(item<2> item_id, |
|
const accessor<float, 2, access_mode::read> &in_acc, |
|
coeff_t coeff) { |
|
float res = 0; |
|
for (int i = -1; i <= 1; i++) { |
|
if (item_id[0] + i < 0 || item_id[0] + i >= in_acc.get_range()[0]) |
|
continue; |
|
|
|
for (int j = -1; j <= 1; j++) { |
|
if (item_id[1] + j < 0 || item_id[1] + j >= in_acc.get_range()[1]) |
|
continue; |
|
|
|
res += coeff[i + 1][j + 1] * in_acc[item_id[0] + i][item_id[1] + j]; |
|
} |
|
} |
|
return res; |
|
} |
|
|
|
// Identify the specialization constant. |
|
constexpr specialization_id<coeff_t> coeff_id; |
|
|
|
class RegularConvolutionKernel; |
|
class SpecConstantsConvolutionKernel; |
|
|
|
int main(int argc, char *argv[]) { |
|
// Enable queue profiling to be able to precisely measure kernel execution |
|
// time |
|
queue q(gpu_selector{}, property_list{property::queue::enable_profiling()}); |
|
std::cout << "Launching on " << q.get_device().get_info<info::device::name>() |
|
<< std::endl; |
|
std::cout << "Driver version: " |
|
<< q.get_device().get_info<info::device::driver_version>() |
|
<< std::endl; |
|
|
|
int H = 3840, W = 2160; |
|
|
|
if (argc > 1) { |
|
H = std::atoi(argv[1]); |
|
} |
|
if (argc > 2) { |
|
W = std::atoi(argv[2]); |
|
} |
|
|
|
std::cout << "Using buffer of size " << H << " x " << W << std::endl; |
|
|
|
// FIXME: we are using uninitialized data just to demonstrate the performance |
|
buffer<float, 2> in(range<2>(H, W)); |
|
buffer<float, 2> out(range<2>(H, W)); |
|
|
|
auto coefficients = get_coefficients(); |
|
std::cout << "Using filter: " << std::endl; |
|
for (unsigned i = 0; i < coefficients.size(); ++i) { |
|
for (unsigned j = 0; j < coefficients[0].size(); ++j) { |
|
std::cout << coefficients[i][j] << " "; |
|
} |
|
std::cout << std::endl; |
|
} |
|
|
|
auto regular_kernel_event = q.submit([&](handler &cgh) { |
|
accessor in_acc{in, cgh, read_only}; |
|
accessor out_acc{out, cgh, write_only}; |
|
|
|
auto coeff = coefficients; |
|
cgh.parallel_for<RegularConvolutionKernel>( |
|
in.get_range(), [=](item<2> item_id) { |
|
out_acc[item_id] = convolution(item_id, in_acc, coeff); |
|
}); |
|
}); |
|
|
|
q.wait(); |
|
|
|
auto spec_constants_kernel_event = q.submit([&](handler &cgh) { |
|
accessor in_acc{in, cgh, read_only}; |
|
accessor out_acc{out, cgh, write_only}; |
|
|
|
cgh.set_specialization_constant<coeff_id>(coefficients); |
|
|
|
cgh.parallel_for<SpecConstantsConvolutionKernel>( |
|
in.get_range(), [=](item<2> item_id, kernel_handler kh) { |
|
auto coeff = kh.get_specialization_constant<coeff_id>(); |
|
out_acc[item_id] = convolution(item_id, in_acc, coeff); |
|
}); |
|
}); |
|
|
|
q.wait(); |
|
|
|
{ |
|
std::cout << "Regular kernel took: "; |
|
auto start = regular_kernel_event.template get_profiling_info< |
|
info::event_profiling::command_start>(); |
|
auto end = |
|
regular_kernel_event |
|
.template get_profiling_info<info::event_profiling::command_end>(); |
|
std::cout << (end - start) / 1000000000.0 << " seconds" << std::endl; |
|
} |
|
{ |
|
std::cout << "Spec constants kernel took: "; |
|
auto start = spec_constants_kernel_event.template get_profiling_info< |
|
info::event_profiling::command_start>(); |
|
auto end = |
|
spec_constants_kernel_event |
|
.template get_profiling_info<info::event_profiling::command_end>(); |
|
std::cout << (end - start) / 1000000000.0 << " seconds" << std::endl; |
|
} |
|
|
|
return 0; |
|
} |