@malfet
Created September 28, 2021 13:49
clang-13 crash using omp critical pragma
// c++ -std=c++14 -O2 -fPIC -Xpreprocessor -fopenmp mup-smaller.cpp
//
#include <omp.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <exception>

#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
namespace at {

inline int64_t divup(int64_t x, int64_t y) {
  return (x + y - 1) / y;
}

// Forward declarations only; definitions are intentionally omitted in this reproducer.
void init_num_threads();
int get_thread_num();

namespace internal {

void set_thread_num(int);

// RAII guard that sets the current thread id and restores the previous one on scope exit.
class ThreadIdGuard {
 public:
  ThreadIdGuard(int new_id) : old_id_(at::get_thread_num()) {
    set_thread_num(new_id);
  }
  ~ThreadIdGuard() {
    set_thread_num(old_id_);
  }

 private:
  int old_id_;
};

// One-time, per-thread initialization of the threading state.
inline void lazy_init_num_threads() {
  thread_local bool init = false;
  if (C10_UNLIKELY(!init)) {
    at::init_num_threads();
    init = true;
  }
}
// Splits [begin, end) into per-thread chunks inside an OpenMP parallel region and
// invokes f on each chunk; only the first exception thrown is captured and rethrown.
template <typename F>
inline void invoke_parallel(int64_t begin, int64_t end, int64_t grain_size, const F& f) {
  std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
  std::exception_ptr eptr;
#pragma omp parallel
  {
    // choose number of tasks based on grain size and number of threads
    // can't use num_threads clause due to bugs in GOMP's thread pool (See #32008)
    int64_t num_threads = omp_get_num_threads();
    if (grain_size > 0) {
      num_threads = std::min(num_threads, divup((end - begin), grain_size));
    }
    int64_t tid = omp_get_thread_num();
    int64_t chunk_size = divup((end - begin), num_threads);
    int64_t begin_tid = begin + tid * chunk_size;
    if (begin_tid < end) {
      try {
        internal::ThreadIdGuard tid_guard(tid);
        f(begin_tid, std::min(end, chunk_size + begin_tid));
      } catch (...) {
        if (!err_flag.test_and_set()) {
          eptr = std::current_exception();
        }
      }
    }
  }
  if (eptr) {
    std::rethrow_exception(eptr);
  }
}

} // namespace internal
// Runs f over [begin, end), either serially or via invoke_parallel, depending on the
// iteration count, grain size, and whether we are already inside a parallel region.
template <class F>
inline void parallel_for(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size,
    const F& f) {
  if (begin >= end) {
    return;
  }
  at::internal::lazy_init_num_threads();
  const auto numiter = end - begin;
  const bool use_parallel = (
      numiter > grain_size && numiter > 1 &&
      omp_get_max_threads() > 1 && !omp_in_parallel());
  if (!use_parallel) {
    internal::ThreadIdGuard tid_guard(0);
    f(begin, end);
    return;
  }
  internal::invoke_parallel(begin, end, grain_size, f);
}
namespace native {

// The lambda passed to parallel_for below contains the `#pragma omp critical`
// section referenced in the gist title.
void cpu_max_unpool(
    float* input_data,
    int64_t* indices_data,
    float* output_data,
    int64_t numel,
    int64_t channels,
    int64_t output_depth,
    int64_t output_height,
    int64_t output_width) {
  int64_t output_image_size = numel / channels;
  bool has_error = false;
  int64_t error_index = 0;
  // parallel on dim N, C, D, H, W: [channels, input_image_size]
  at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
    for (int64_t i = begin; i < end; i++) {
      float* output_ptr = output_data;
      int64_t maxp = indices_data[i];
      if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
        {
          has_error = true;
          error_index = maxp;
        }
      } else {
        output_ptr[maxp] = input_data[i];
      }
    }
  });
}

} // namespace native
} // namespace at
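
The shape the reproducer exercises is a lambda containing an `#pragma omp critical` block that is invoked from inside an `#pragma omp parallel` region. The sketch below is only an illustration of that pattern; `run_parallel` and `main` are my own names, not part of the gist, and this stripped-down form has not been verified to trigger the same clang-13 crash.

// Hypothetical, stripped-down sketch of the pattern above: an `omp critical`
// section inside a lambda invoked from an `omp parallel` region.
// Not verified to reproduce the clang-13 crash.
#include <omp.h>

template <typename F>
void run_parallel(const F& f) {
#pragma omp parallel
  {
    f(omp_get_thread_num());
  }
}

int main() {
  bool has_error = false;
  run_parallel([&](int tid) {
    if (tid < 0) {
#pragma omp critical
      {
        has_error = true; // serialized update of shared state across threads
      }
    }
  });
  return has_error ? 1 : 0;
}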