Mine02C4 · September 30, 2016 18:02 · Mine02C4 · Jul 23, 2015
diff --git a/LightsOutAVX2.cpp b/LightsOutAVX2.cpp
 #include <iostream>
 #include <vector>
 #include <algorithm>
 #include <mutex>

 #define SIMD // AVX2を利用する。
 //#define S64 // テーブルを64bitにする。SIZEを6以上にするために必要

 #ifdef S64
 typedef int64_t table;
 #else
 typedef int table;
 #endif
 const int SIZE = 5;
 const table BASE = 1;
 const table N_BASE = -1;
 const table NUMBER_OF_PATTERN = BASE << (SIZE * SIZE);
 const table TABLE_MASK = ~(N_BASE << (SIZE * SIZE));
 table RIGHT_MASK = 0;
 table LEFT_MASK = 0;

 std::vector<table> results;
 std::mutex mtx;

 #ifdef SIMD
 #include <emmintrin.h>
 typedef __m256i ymm_table;
 const ymm_table YMM_ZERO = _mm256_setzero_si256();
 ymm_table YMM_LEFT_NMASK;
 ymm_table YMM_RIGHT_NMASK;
 #ifdef S64
 const int simd_align = 64;
 const int simd_concurrency = 4;
 const ymm_table YMM_TABLE_MASK = _mm256_set1_epi64x(TABLE_MASK);

 inline int simd64_validate(ymm_table p) {
  ymm_table t = YMM_TABLE_MASK;
  t = _mm256_xor_si256(t, p);
  t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_srli_epi64(p, 1), YMM_LEFT_NMASK));
  t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_slli_epi64(p, 1), YMM_RIGHT_NMASK));
  t = _mm256_xor_si256(t, _mm256_slli_epi64(p, SIZE));
  t = _mm256_xor_si256(t, _mm256_srli_epi64(p, SIZE));
  t = _mm256_and_si256(t, YMM_TABLE_MASK);
  return _mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(t, YMM_ZERO)));
 }
 #else
 const int simd_align = 32;
 const int simd_concurrency = 8;
 const ymm_table YMM_TABLE_MASK = _mm256_set1_epi32(TABLE_MASK);

 inline int simd32_validate(ymm_table p) {
  ymm_table t = YMM_TABLE_MASK;
  t = _mm256_xor_si256(t, p);
  t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_srli_epi32(p, 1), YMM_LEFT_NMASK));
  t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_slli_epi32(p, 1), YMM_RIGHT_NMASK));
  t = _mm256_xor_si256(t, _mm256_slli_epi32(p, SIZE));
  t = _mm256_xor_si256(t, _mm256_srli_epi32(p, SIZE));
  t = _mm256_and_si256(t, YMM_TABLE_MASK);
  return _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(t, YMM_ZERO)));
 }
 #endif
 #else
 bool validate(table p) {
  table t = TABLE_MASK;
  t ^= p;
  t ^= (p >> 1) & (~LEFT_MASK);
  t ^= (p << 1) & (~RIGHT_MASK);
  t ^= p << SIZE;
  t ^= p >> SIZE;
  return (t & TABLE_MASK) == 0;
 }
 #endif

 int main() {
  for (int i = 0; i < SIZE; i++) {
    RIGHT_MASK |= BASE << (i * SIZE);
    LEFT_MASK |= (BASE << (SIZE - 1)) << (i * SIZE);
  }
 #ifdef SIMD
 #ifdef S64
  YMM_LEFT_NMASK = _mm256_set1_epi64x(~LEFT_MASK);
  YMM_RIGHT_NMASK = _mm256_set1_epi64x(~RIGHT_MASK);
 #else
  YMM_LEFT_NMASK = _mm256_set1_epi32(~LEFT_MASK);
  YMM_RIGHT_NMASK = _mm256_set1_epi32(~RIGHT_MASK);
 #endif
 #pragma omp parallel for
  for (table i = 0; i < NUMBER_OF_PATTERN; i += simd_concurrency) {
    int m;
 #ifdef S64
    if ((m = simd64_validate(_mm256_set_epi64x(i, i + 1, i + 2, i + 3))) != 0) {
 #else
    if ((m = simd32_validate(_mm256_set_epi32(i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7))) != 0) {
 #endif
      for (int j = 0; j < simd_concurrency; j++) {
        if ((m & (1 << (simd_concurrency - 1 - j))) != 0) {
 #ifdef _OPENMP
          mtx.lock();
 #endif
          results.push_back(i + j);
 #ifdef _OPENMP
          mtx.unlock();
 #endif
        }
      }
    }
    }
 #else
 #pragma omp parallel for
  for (table i = 0; i < NUMBER_OF_PATTERN; i++) {
    if (validate(i)) {
 #ifdef _OPENMP
      mtx.lock();
 #endif
      results.push_back(i);
 #ifdef _OPENMP
      mtx.unlock();
 #endif
    }
  }
 #endif
  std::for_each(results.begin(), results.end(), [](table r)
  {
    for (int i = SIZE - 1; i >= 0; i--) {
      for (int j = SIZE - 1; j >= 0; j--) {
        if ((r & (BASE << (i * SIZE + j))) == 0)
          std::cout << "0";
        else
          std::cout << "1";
      }
      std::cout << std::endl;
    }
    std::cout << std::endl;
  });
  return 0;
 }
	#include <iostream>
	#include <vector>
	#include <algorithm>
	#include <mutex>

	#define SIMD // AVX2を利用する。
	//#define S64 // テーブルを64bitにする。SIZEを6以上にするために必要

	#ifdef S64
	typedef int64_t table;
	#else
	typedef int table;
	#endif
	const int SIZE = 5;
	const table BASE = 1;
	const table N_BASE = -1;
	const table NUMBER_OF_PATTERN = BASE << (SIZE * SIZE);
	const table TABLE_MASK = ~(N_BASE << (SIZE * SIZE));
	table RIGHT_MASK = 0;
	table LEFT_MASK = 0;

	std::vector<table> results;
	std::mutex mtx;

	#ifdef SIMD
	#include <emmintrin.h>
	typedef __m256i ymm_table;
	const ymm_table YMM_ZERO = _mm256_setzero_si256();
	ymm_table YMM_LEFT_NMASK;
	ymm_table YMM_RIGHT_NMASK;
	#ifdef S64
	const int simd_align = 64;
	const int simd_concurrency = 4;
	const ymm_table YMM_TABLE_MASK = _mm256_set1_epi64x(TABLE_MASK);

	inline int simd64_validate(ymm_table p) {
	ymm_table t = YMM_TABLE_MASK;
	t = _mm256_xor_si256(t, p);
	t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_srli_epi64(p, 1), YMM_LEFT_NMASK));
	t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_slli_epi64(p, 1), YMM_RIGHT_NMASK));
	t = _mm256_xor_si256(t, _mm256_slli_epi64(p, SIZE));
	t = _mm256_xor_si256(t, _mm256_srli_epi64(p, SIZE));
	t = _mm256_and_si256(t, YMM_TABLE_MASK);
	return _mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(t, YMM_ZERO)));
	}
	#else
	const int simd_align = 32;
	const int simd_concurrency = 8;
	const ymm_table YMM_TABLE_MASK = _mm256_set1_epi32(TABLE_MASK);

	inline int simd32_validate(ymm_table p) {
	ymm_table t = YMM_TABLE_MASK;
	t = _mm256_xor_si256(t, p);
	t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_srli_epi32(p, 1), YMM_LEFT_NMASK));
	t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_slli_epi32(p, 1), YMM_RIGHT_NMASK));
	t = _mm256_xor_si256(t, _mm256_slli_epi32(p, SIZE));
	t = _mm256_xor_si256(t, _mm256_srli_epi32(p, SIZE));
	t = _mm256_and_si256(t, YMM_TABLE_MASK);
	return _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(t, YMM_ZERO)));
	}
	#endif
	#else
	bool validate(table p) {
	table t = TABLE_MASK;
	t ^= p;
	t ^= (p >> 1) & (~LEFT_MASK);
	t ^= (p << 1) & (~RIGHT_MASK);
	t ^= p << SIZE;
	t ^= p >> SIZE;
	return (t & TABLE_MASK) == 0;
	}
	#endif

	int main() {
	for (int i = 0; i < SIZE; i++) {
	RIGHT_MASK \|= BASE << (i * SIZE);
	LEFT_MASK \|= (BASE << (SIZE - 1)) << (i * SIZE);
	}
	#ifdef SIMD
	#ifdef S64
	YMM_LEFT_NMASK = _mm256_set1_epi64x(~LEFT_MASK);
	YMM_RIGHT_NMASK = _mm256_set1_epi64x(~RIGHT_MASK);
	#else
	YMM_LEFT_NMASK = _mm256_set1_epi32(~LEFT_MASK);
	YMM_RIGHT_NMASK = _mm256_set1_epi32(~RIGHT_MASK);
	#endif
	#pragma omp parallel for
	for (table i = 0; i < NUMBER_OF_PATTERN; i += simd_concurrency) {
	int m;
	#ifdef S64
	if ((m = simd64_validate(_mm256_set_epi64x(i, i + 1, i + 2, i + 3))) != 0) {
	#else
	if ((m = simd32_validate(_mm256_set_epi32(i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7))) != 0) {
	#endif
	for (int j = 0; j < simd_concurrency; j++) {
	if ((m & (1 << (simd_concurrency - 1 - j))) != 0) {
	#ifdef _OPENMP
	mtx.lock();
	#endif
	results.push_back(i + j);
	#ifdef _OPENMP
	mtx.unlock();
	#endif
	}
	}
	}
	}
	#else
	#pragma omp parallel for
	for (table i = 0; i < NUMBER_OF_PATTERN; i++) {
	if (validate(i)) {
	#ifdef _OPENMP
	mtx.lock();
	#endif
	results.push_back(i);
	#ifdef _OPENMP
	mtx.unlock();
	#endif
	}
	}
	#endif
	std::for_each(results.begin(), results.end(), [](table r)
	{
	for (int i = SIZE - 1; i >= 0; i--) {
	for (int j = SIZE - 1; j >= 0; j--) {
	if ((r & (BASE << (i * SIZE + j))) == 0)
	std::cout << "0";
	else
	std::cout << "1";
	}
	std::cout << std::endl;
	}
	std::cout << std::endl;
	});
	return 0;
	}