Last active
February 24, 2022 19:55
-
-
Save TApplencourt/cdb8d5b3de9899cab7ce2da7529d9e94 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define MAD_4(x, y) \ | |
x = y * x + y; \ | |
y = x * y + x; \ | |
x = y * x + y; \ | |
y = x * y + x; | |
#define MAD_16(x, y) \ | |
MAD_4(x, y); \ | |
MAD_4(x, y); \ | |
MAD_4(x, y); \ | |
MAD_4(x, y); | |
#define MAD_64(x, y) \ | |
MAD_16(x, y); \ | |
MAD_16(x, y); \ | |
MAD_16(x, y); \ | |
MAD_16(x, y); | |
#include <chrono> | |
#include <iostream> | |
#include <sycl/sycl.hpp> | |
template <class T> T busy_wait(long N, T i) { | |
T x = 1.3f; | |
T y = (T)i; | |
for (long j = 0; j < N; j++) { | |
MAD_64(x, y); | |
} | |
return y; | |
} | |
template <class T> | |
void out_of_order_queue(std::vector<std::string> modes, long C_tripcount, bool enable_profiling, | |
bool in_order, int n_queues, bool serial, long *total_cpu_time, | |
int *max_index_cpu_tine_command = NULL, long *max_cpu_time_command = NULL, | |
bool *concurent = NULL) { | |
const sycl::device D{sycl::gpu_selector()}; | |
const int globalWIs = D.get_info<sycl::info::device::sub_group_sizes>()[0]; | |
const int N = D.get_info<sycl::info::device::max_mem_alloc_size>() / sizeof(T); | |
const sycl::context C(D); | |
if (n_queues == -1) | |
n_queues = modes.size(); | |
sycl::property_list pl; | |
if (enable_profiling && in_order) | |
pl = sycl::property_list{sycl::property::queue::in_order{}, | |
sycl::property::queue::enable_profiling{}}; | |
else if (enable_profiling) | |
pl = sycl::property_list{sycl::property::queue::enable_profiling{}}; | |
else if (in_order) | |
pl = sycl::property_list{sycl::property::queue::in_order{}}; | |
std::vector<sycl::queue> Qs; | |
for (int i = 0; i < n_queues; i++) | |
Qs.push_back(sycl::queue(C, D, pl)); | |
std::vector<std::vector<T *>> buffers; | |
for (auto &mode : modes) { | |
std::vector<T *> buffer; | |
for (const char &t : mode) { | |
T *b; | |
if (t == 'D') { | |
b = sycl::malloc_device<T>(N, D, C); | |
} else if (t == 'H') { | |
b = sycl::malloc_host<T>(N, C); | |
} else if (t == 'M') { | |
b = static_cast<T *>(malloc(N * sizeof(T))); | |
std::fill(b, b + N, 0); | |
} else if (t == 'C') { | |
b = sycl::malloc_device<T>(globalWIs, D, C); | |
} | |
buffer.push_back(b); | |
} | |
buffers.push_back(buffer); | |
} | |
std::vector<long> cpu_times; | |
std::vector<sycl::event> events; | |
std::chrono::system_clock::time_point s0; | |
for (int r = 0; r < 2; r++) { | |
cpu_times.clear(); | |
events.clear(); | |
s0 = std::chrono::high_resolution_clock::now(); | |
for (int i = 0; i < modes.size(); i++) { | |
const auto s = std::chrono::high_resolution_clock::now(); | |
sycl::queue Q = Qs[i % n_queues]; | |
sycl::event event; | |
if (modes[i] == "C") { | |
T *ptr = buffers[i][0]; | |
event = Q.parallel_for(globalWIs, [ptr, C_tripcount](sycl::item<1> j) { | |
ptr[j] = busy_wait(C_tripcount, (T)j); | |
}); | |
} else { | |
event = Q.copy(buffers[i][1], buffers[i][0], N); | |
} | |
if (enable_profiling) | |
events.push_back(event); | |
if (serial) { | |
Q.wait(); | |
const auto e = std::chrono::high_resolution_clock::now(); | |
cpu_times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(e - s).count()); | |
} | |
} | |
for (auto &Q : Qs) | |
Q.wait(); | |
} | |
const auto e0 = std::chrono::high_resolution_clock::now(); | |
for (const auto &buffer : buffers) | |
for (auto &ptr : buffer) | |
(sycl::get_pointer_type(ptr, C) != sycl::usm::alloc::unknown) ? sycl::free(ptr, C) | |
: free(ptr); | |
*total_cpu_time = {std::chrono::duration_cast<std::chrono::microseconds>(e0 - s0).count()}; | |
if (serial) { | |
*max_index_cpu_tine_command = | |
std::distance(cpu_times.begin(), std::max_element(cpu_times.begin(), cpu_times.end())); | |
*max_cpu_time_command = cpu_times[*max_index_cpu_tine_command]; | |
} | |
if (enable_profiling) { | |
*concurent = false; | |
for (int i = 0; i < events.size(); i++) { | |
for (int j = 0; j < events.size(); j++) { | |
if (i == j) | |
continue; | |
const auto starti = | |
events[i].get_profiling_info<sycl::info::event_profiling::command_start>(); | |
const auto startj = | |
events[j].get_profiling_info<sycl::info::event_profiling::command_start>(); | |
const auto endj = events[j].get_profiling_info<sycl::info::event_profiling::command_end>(); | |
if ((startj <= starti) && (starti <= endj)) { | |
*concurent = true; | |
return; | |
} | |
} | |
} | |
} | |
} | |
int main(int argc, char *argv[]) { | |
long C_tripcount = atoi(argv[1]); | |
bool enable_profiling = (std::string{argv[2]} == "enable_profiling"); | |
bool in_order = (std::string{argv[3]} == "in_order"); | |
int n_queues = atoi(argv[4]); | |
std::vector<std::string> modes(argv + 5, argv + argc); | |
long serial_total_cpu_time; | |
int serial_max_cpu_time_index_command; | |
long serial_max_cpu_time_command; | |
bool serial_concurent; | |
out_of_order_queue<float>(modes, C_tripcount, enable_profiling, in_order, n_queues, true, | |
&serial_total_cpu_time, &serial_max_cpu_time_index_command, | |
&serial_max_cpu_time_command, &serial_concurent); | |
std::cout << "Total serial (us): " << serial_total_cpu_time << " (max commands (us) was " | |
<< modes[serial_max_cpu_time_index_command] << ": " << serial_max_cpu_time_command | |
<< ")" << std::endl; | |
long concurent_total_cpu_time; | |
bool concurent; | |
out_of_order_queue<float>(modes, C_tripcount, enable_profiling, in_order, n_queues, false, | |
&concurent_total_cpu_time, NULL, NULL, &concurent); | |
std::cout << "Total // (us): " << concurent_total_cpu_time << std::endl; | |
std::cout << "Got " << 1. * serial_total_cpu_time / concurent_total_cpu_time | |
<< "x speed-up relative to serial," << std::endl; | |
std::cout << "was expecting (assuming maximun concurency and negligeable runtime overhead) " | |
<< 1. * serial_total_cpu_time / serial_max_cpu_time_command << "x" << std::endl; | |
if (enable_profiling && concurent) { | |
std::cout << "SUCCESS: SYCL Event show concurentcy" << std::endl; | |
return 0; | |
} | |
if (concurent_total_cpu_time <= 1.20 * serial_max_cpu_time_command) { | |
std::cout << "SUCCESS: Possible Concurent execution" << std::endl; | |
return 0; | |
} | |
std::cout << "FAILURE: No Concurent Execution" << std::endl; | |
return 1; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment