TApplencourt · February 24, 2022 19:55
diff --git a/concurent_sycl.cpp b/concurent_sycl.cpp
 #define MAD_4(x, y)                                                                                \
  x = y * x + y;                                                                                   \
  y = x * y + x;                                                                                   \
  x = y * x + y;                                                                                   \
  y = x * y + x;
 #define MAD_16(x, y)                                                                               \
  MAD_4(x, y);                                                                                     \
  MAD_4(x, y);                                                                                     \
  MAD_4(x, y);                                                                                     \
  MAD_4(x, y);
 #define MAD_64(x, y)                                                                               \
  MAD_16(x, y);                                                                                    \
  MAD_16(x, y);                                                                                    \
  MAD_16(x, y);                                                                                    \
  MAD_16(x, y);

 #include <chrono>
 #include <iostream>
 #include <sycl/sycl.hpp>

 template <class T> T busy_wait(long N, T i) {
  T x = 1.3f;
  T y = (T)i;
  for (long j = 0; j < N; j++) {
    MAD_64(x, y);
  }
  return y;
 }

 template <class T>
 void out_of_order_queue(std::vector<std::string> modes, long C_tripcount, bool enable_profiling,
                        bool in_order, int n_queues, bool serial, long *total_cpu_time,
                        int *max_index_cpu_tine_command = NULL, long *max_cpu_time_command = NULL,
                        bool *concurent = NULL) {
  const sycl::device D{sycl::gpu_selector()};
  const int globalWIs = D.get_info<sycl::info::device::sub_group_sizes>()[0];
  const int N = D.get_info<sycl::info::device::max_mem_alloc_size>() / sizeof(T);

  const sycl::context C(D);

  if (n_queues == -1)
    n_queues = modes.size();

  sycl::property_list pl;
  if (enable_profiling && in_order)
    pl = sycl::property_list{sycl::property::queue::in_order{},
                             sycl::property::queue::enable_profiling{}};
  else if (enable_profiling)
    pl = sycl::property_list{sycl::property::queue::enable_profiling{}};
  else if (in_order)
    pl = sycl::property_list{sycl::property::queue::in_order{}};

  std::vector<sycl::queue> Qs;
  for (int i = 0; i < n_queues; i++)
    Qs.push_back(sycl::queue(C, D, pl));

  std::vector<std::vector<T *>> buffers;
  for (auto &mode : modes) {
    std::vector<T *> buffer;
    for (const char &t : mode) {
      T *b;
      if (t == 'D') {
        b = sycl::malloc_device<T>(N, D, C);
      } else if (t == 'H') {
        b = sycl::malloc_host<T>(N, C);
      } else if (t == 'M') {
        b = static_cast<T *>(malloc(N * sizeof(T)));
        std::fill(b, b + N, 0);
      } else if (t == 'C') {
        b = sycl::malloc_device<T>(globalWIs, D, C);
      }
      buffer.push_back(b);
    }
    buffers.push_back(buffer);
  }

  std::vector<long> cpu_times;
  std::vector<sycl::event> events;
  std::chrono::system_clock::time_point s0;

  for (int r = 0; r < 2; r++) {
    cpu_times.clear();
    events.clear();

    s0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < modes.size(); i++) {
      const auto s = std::chrono::high_resolution_clock::now();
      sycl::queue Q = Qs[i % n_queues];
      sycl::event event;
      if (modes[i] == "C") {
        T *ptr = buffers[i][0];
        event = Q.parallel_for(globalWIs, [ptr, C_tripcount](sycl::item<1> j) {
          ptr[j] = busy_wait(C_tripcount, (T)j);
        });
      } else {
        event = Q.copy(buffers[i][1], buffers[i][0], N);
      }

      if (enable_profiling)
        events.push_back(event);

      if (serial) {
        Q.wait();
        const auto e = std::chrono::high_resolution_clock::now();
        cpu_times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(e - s).count());
      }
    }

    for (auto &Q : Qs)
      Q.wait();
  }

  const auto e0 = std::chrono::high_resolution_clock::now();

  for (const auto &buffer : buffers)
    for (auto &ptr : buffer)
      (sycl::get_pointer_type(ptr, C) != sycl::usm::alloc::unknown) ? sycl::free(ptr, C)
                                                                    : free(ptr);

  *total_cpu_time = {std::chrono::duration_cast<std::chrono::microseconds>(e0 - s0).count()};
  if (serial) {
    *max_index_cpu_tine_command =
        std::distance(cpu_times.begin(), std::max_element(cpu_times.begin(), cpu_times.end()));
    *max_cpu_time_command = cpu_times[*max_index_cpu_tine_command];
  }

  if (enable_profiling) {
    *concurent = false;
    for (int i = 0; i < events.size(); i++) {
      for (int j = 0; j < events.size(); j++) {
        if (i == j)
          continue;
        const auto starti =
            events[i].get_profiling_info<sycl::info::event_profiling::command_start>();
        const auto startj =
            events[j].get_profiling_info<sycl::info::event_profiling::command_start>();
        const auto endj = events[j].get_profiling_info<sycl::info::event_profiling::command_end>();
        if ((startj <= starti) && (starti <= endj)) {
          *concurent = true;
          return;
        }
      }
    }
  }
 }

 int main(int argc, char *argv[]) {
  long C_tripcount = atoi(argv[1]);
  bool enable_profiling = (std::string{argv[2]} == "enable_profiling");
  bool in_order = (std::string{argv[3]} == "in_order");
  int n_queues = atoi(argv[4]);
  std::vector<std::string> modes(argv + 5, argv + argc);

  long serial_total_cpu_time;
  int serial_max_cpu_time_index_command;
  long serial_max_cpu_time_command;
  bool serial_concurent;
  out_of_order_queue<float>(modes, C_tripcount, enable_profiling, in_order, n_queues, true,
                            &serial_total_cpu_time, &serial_max_cpu_time_index_command,
                            &serial_max_cpu_time_command, &serial_concurent);
  std::cout << "Total serial (us): " << serial_total_cpu_time << " (max commands (us) was "
            << modes[serial_max_cpu_time_index_command] << ": " << serial_max_cpu_time_command
            << ")" << std::endl;

  long concurent_total_cpu_time;
  bool concurent;

  out_of_order_queue<float>(modes, C_tripcount, enable_profiling, in_order, n_queues, false,
                            &concurent_total_cpu_time, NULL, NULL, &concurent);
  std::cout << "Total // (us):     " << concurent_total_cpu_time << std::endl;
  std::cout << "Got " << 1. * serial_total_cpu_time / concurent_total_cpu_time
            << "x speed-up relative to serial," << std::endl;
  std::cout << "was expecting (assuming maximun concurency and negligeable runtime overhead) "
            << 1. * serial_total_cpu_time / serial_max_cpu_time_command << "x" << std::endl;

  if (enable_profiling && concurent) {
    std::cout << "SUCCESS: SYCL Event show concurentcy" << std::endl;
    return 0;
  }

  if (concurent_total_cpu_time <= 1.20 * serial_max_cpu_time_command) {
    std::cout << "SUCCESS: Possible Concurent execution" << std::endl;
    return 0;
  }
  std::cout << "FAILURE: No Concurent Execution" << std::endl;
  return 1;
 }
	#define MAD_4(x, y) \
	x = y * x + y; \
	y = x * y + x; \
	x = y * x + y; \
	y = x * y + x;
	#define MAD_16(x, y) \
	MAD_4(x, y); \
	MAD_4(x, y); \
	MAD_4(x, y); \
	MAD_4(x, y);
	#define MAD_64(x, y) \
	MAD_16(x, y); \
	MAD_16(x, y); \
	MAD_16(x, y); \
	MAD_16(x, y);

	#include <chrono>
	#include <iostream>
	#include <sycl/sycl.hpp>

	template <class T> T busy_wait(long N, T i) {
	T x = 1.3f;
	T y = (T)i;
	for (long j = 0; j < N; j++) {
	MAD_64(x, y);
	}
	return y;
	}

	template <class T>
	void out_of_order_queue(std::vector<std::string> modes, long C_tripcount, bool enable_profiling,
	bool in_order, int n_queues, bool serial, long *total_cpu_time,
	int max_index_cpu_tine_command = NULL, long max_cpu_time_command = NULL,
	bool *concurent = NULL) {
	const sycl::device D{sycl::gpu_selector()};
	const int globalWIs = D.get_info<sycl::info::device::sub_group_sizes>()[0];
	const int N = D.get_info<sycl::info::device::max_mem_alloc_size>() / sizeof(T);

	const sycl::context C(D);

	if (n_queues == -1)
	n_queues = modes.size();

	sycl::property_list pl;
	if (enable_profiling && in_order)
	pl = sycl::property_list{sycl::property::queue::in_order{},
	sycl::property::queue::enable_profiling{}};
	else if (enable_profiling)
	pl = sycl::property_list{sycl::property::queue::enable_profiling{}};
	else if (in_order)
	pl = sycl::property_list{sycl::property::queue::in_order{}};

	std::vector<sycl::queue> Qs;
	for (int i = 0; i < n_queues; i++)
	Qs.push_back(sycl::queue(C, D, pl));

	std::vector<std::vector<T *>> buffers;
	for (auto &mode : modes) {
	std::vector<T *> buffer;
	for (const char &t : mode) {
	T *b;
	if (t == 'D') {
	b = sycl::malloc_device<T>(N, D, C);
	} else if (t == 'H') {
	b = sycl::malloc_host<T>(N, C);
	} else if (t == 'M') {
	b = static_cast<T >(malloc(N sizeof(T)));
	std::fill(b, b + N, 0);
	} else if (t == 'C') {
	b = sycl::malloc_device<T>(globalWIs, D, C);
	}
	buffer.push_back(b);
	}
	buffers.push_back(buffer);
	}

	std::vector<long> cpu_times;
	std::vector<sycl::event> events;
	std::chrono::system_clock::time_point s0;

	for (int r = 0; r < 2; r++) {
	cpu_times.clear();
	events.clear();

	s0 = std::chrono::high_resolution_clock::now();
	for (int i = 0; i < modes.size(); i++) {
	const auto s = std::chrono::high_resolution_clock::now();
	sycl::queue Q = Qs[i % n_queues];
	sycl::event event;
	if (modes[i] == "C") {
	T *ptr = buffers[i][0];
	event = Q.parallel_for(globalWIs, [ptr, C_tripcount](sycl::item<1> j) {
	ptr[j] = busy_wait(C_tripcount, (T)j);
	});
	} else {
	event = Q.copy(buffers[i][1], buffers[i][0], N);
	}

	if (enable_profiling)
	events.push_back(event);

	if (serial) {
	Q.wait();
	const auto e = std::chrono::high_resolution_clock::now();
	cpu_times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(e - s).count());
	}
	}

	for (auto &Q : Qs)
	Q.wait();
	}

	const auto e0 = std::chrono::high_resolution_clock::now();

	for (const auto &buffer : buffers)
	for (auto &ptr : buffer)
	(sycl::get_pointer_type(ptr, C) != sycl::usm::alloc::unknown) ? sycl::free(ptr, C)
	: free(ptr);

	*total_cpu_time = {std::chrono::duration_cast<std::chrono::microseconds>(e0 - s0).count()};
	if (serial) {
	*max_index_cpu_tine_command =
	std::distance(cpu_times.begin(), std::max_element(cpu_times.begin(), cpu_times.end()));
	max_cpu_time_command = cpu_times[max_index_cpu_tine_command];
	}

	if (enable_profiling) {
	*concurent = false;
	for (int i = 0; i < events.size(); i++) {
	for (int j = 0; j < events.size(); j++) {
	if (i == j)
	continue;
	const auto starti =
	events[i].get_profiling_info<sycl::info::event_profiling::command_start>();
	const auto startj =
	events[j].get_profiling_info<sycl::info::event_profiling::command_start>();
	const auto endj = events[j].get_profiling_info<sycl::info::event_profiling::command_end>();
	if ((startj <= starti) && (starti <= endj)) {
	*concurent = true;
	return;
	}
	}
	}
	}
	}

	int main(int argc, char *argv[]) {
	long C_tripcount = atoi(argv[1]);
	bool enable_profiling = (std::string{argv[2]} == "enable_profiling");
	bool in_order = (std::string{argv[3]} == "in_order");
	int n_queues = atoi(argv[4]);
	std::vector<std::string> modes(argv + 5, argv + argc);

	long serial_total_cpu_time;
	int serial_max_cpu_time_index_command;
	long serial_max_cpu_time_command;
	bool serial_concurent;
	out_of_order_queue<float>(modes, C_tripcount, enable_profiling, in_order, n_queues, true,
	&serial_total_cpu_time, &serial_max_cpu_time_index_command,
	&serial_max_cpu_time_command, &serial_concurent);
	std::cout << "Total serial (us): " << serial_total_cpu_time << " (max commands (us) was "
	<< modes[serial_max_cpu_time_index_command] << ": " << serial_max_cpu_time_command
	<< ")" << std::endl;

	long concurent_total_cpu_time;
	bool concurent;

	out_of_order_queue<float>(modes, C_tripcount, enable_profiling, in_order, n_queues, false,
	&concurent_total_cpu_time, NULL, NULL, &concurent);
	std::cout << "Total // (us): " << concurent_total_cpu_time << std::endl;
	std::cout << "Got " << 1. * serial_total_cpu_time / concurent_total_cpu_time
	<< "x speed-up relative to serial," << std::endl;
	std::cout << "was expecting (assuming maximun concurency and negligeable runtime overhead) "
	<< 1. * serial_total_cpu_time / serial_max_cpu_time_command << "x" << std::endl;

	if (enable_profiling && concurent) {
	std::cout << "SUCCESS: SYCL Event show concurentcy" << std::endl;
	return 0;
	}

	if (concurent_total_cpu_time <= 1.20 * serial_max_cpu_time_command) {
	std::cout << "SUCCESS: Possible Concurent execution" << std::endl;
	return 0;
	}
	std::cout << "FAILURE: No Concurent Execution" << std::endl;
	return 1;
	}