Last active
September 30, 2016 18:02
-
-
Save Mine02C4/c2a58fbf4dc1270166cb to your computer and use it in GitHub Desktop.
情報数学概論: Intel AVX2のSIMD命令を使ったLIGHTS OUTの全探索プログラム
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <vector> | |
#include <algorithm> | |
#include <mutex> | |
#define SIMD // AVX2を利用する。 | |
//#define S64 // テーブルを64bitにする。SIZEを6以上にするために必要 | |
#ifdef S64 | |
typedef int64_t table; | |
#else | |
typedef int table; | |
#endif | |
const int SIZE = 5; | |
const table BASE = 1; | |
const table N_BASE = -1; | |
const table NUMBER_OF_PATTERN = BASE << (SIZE * SIZE); | |
const table TABLE_MASK = ~(N_BASE << (SIZE * SIZE)); | |
table RIGHT_MASK = 0; | |
table LEFT_MASK = 0; | |
std::vector<table> results; | |
std::mutex mtx; | |
#ifdef SIMD | |
#include <emmintrin.h> | |
typedef __m256i ymm_table; | |
const ymm_table YMM_ZERO = _mm256_setzero_si256(); | |
ymm_table YMM_LEFT_NMASK; | |
ymm_table YMM_RIGHT_NMASK; | |
#ifdef S64 | |
const int simd_align = 64; | |
const int simd_concurrency = 4; | |
const ymm_table YMM_TABLE_MASK = _mm256_set1_epi64x(TABLE_MASK); | |
inline int simd64_validate(ymm_table p) { | |
ymm_table t = YMM_TABLE_MASK; | |
t = _mm256_xor_si256(t, p); | |
t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_srli_epi64(p, 1), YMM_LEFT_NMASK)); | |
t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_slli_epi64(p, 1), YMM_RIGHT_NMASK)); | |
t = _mm256_xor_si256(t, _mm256_slli_epi64(p, SIZE)); | |
t = _mm256_xor_si256(t, _mm256_srli_epi64(p, SIZE)); | |
t = _mm256_and_si256(t, YMM_TABLE_MASK); | |
return _mm256_movemask_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(t, YMM_ZERO))); | |
} | |
#else | |
const int simd_align = 32; | |
const int simd_concurrency = 8; | |
const ymm_table YMM_TABLE_MASK = _mm256_set1_epi32(TABLE_MASK); | |
inline int simd32_validate(ymm_table p) { | |
ymm_table t = YMM_TABLE_MASK; | |
t = _mm256_xor_si256(t, p); | |
t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_srli_epi32(p, 1), YMM_LEFT_NMASK)); | |
t = _mm256_xor_si256(t, _mm256_and_si256(_mm256_slli_epi32(p, 1), YMM_RIGHT_NMASK)); | |
t = _mm256_xor_si256(t, _mm256_slli_epi32(p, SIZE)); | |
t = _mm256_xor_si256(t, _mm256_srli_epi32(p, SIZE)); | |
t = _mm256_and_si256(t, YMM_TABLE_MASK); | |
return _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpeq_epi32(t, YMM_ZERO))); | |
} | |
#endif | |
#else | |
bool validate(table p) { | |
table t = TABLE_MASK; | |
t ^= p; | |
t ^= (p >> 1) & (~LEFT_MASK); | |
t ^= (p << 1) & (~RIGHT_MASK); | |
t ^= p << SIZE; | |
t ^= p >> SIZE; | |
return (t & TABLE_MASK) == 0; | |
} | |
#endif | |
int main() { | |
for (int i = 0; i < SIZE; i++) { | |
RIGHT_MASK |= BASE << (i * SIZE); | |
LEFT_MASK |= (BASE << (SIZE - 1)) << (i * SIZE); | |
} | |
#ifdef SIMD | |
#ifdef S64 | |
YMM_LEFT_NMASK = _mm256_set1_epi64x(~LEFT_MASK); | |
YMM_RIGHT_NMASK = _mm256_set1_epi64x(~RIGHT_MASK); | |
#else | |
YMM_LEFT_NMASK = _mm256_set1_epi32(~LEFT_MASK); | |
YMM_RIGHT_NMASK = _mm256_set1_epi32(~RIGHT_MASK); | |
#endif | |
#pragma omp parallel for | |
for (table i = 0; i < NUMBER_OF_PATTERN; i += simd_concurrency) { | |
int m; | |
#ifdef S64 | |
if ((m = simd64_validate(_mm256_set_epi64x(i, i + 1, i + 2, i + 3))) != 0) { | |
#else | |
if ((m = simd32_validate(_mm256_set_epi32(i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7))) != 0) { | |
#endif | |
for (int j = 0; j < simd_concurrency; j++) { | |
if ((m & (1 << (simd_concurrency - 1 - j))) != 0) { | |
#ifdef _OPENMP | |
mtx.lock(); | |
#endif | |
results.push_back(i + j); | |
#ifdef _OPENMP | |
mtx.unlock(); | |
#endif | |
} | |
} | |
} | |
} | |
#else | |
#pragma omp parallel for | |
for (table i = 0; i < NUMBER_OF_PATTERN; i++) { | |
if (validate(i)) { | |
#ifdef _OPENMP | |
mtx.lock(); | |
#endif | |
results.push_back(i); | |
#ifdef _OPENMP | |
mtx.unlock(); | |
#endif | |
} | |
} | |
#endif | |
std::for_each(results.begin(), results.end(), [](table r) | |
{ | |
for (int i = SIZE - 1; i >= 0; i--) { | |
for (int j = SIZE - 1; j >= 0; j--) { | |
if ((r & (BASE << (i * SIZE + j))) == 0) | |
std::cout << "0"; | |
else | |
std::cout << "1"; | |
} | |
std::cout << std::endl; | |
} | |
std::cout << std::endl; | |
}); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
5x5がCore i7-4702MQで実時間30ms-40ms前後で算出可能っぽい。もちろん実時間なので、同時に動作させている他のアプリケーションとの兼ね合いで大きな差が出る。ここまで短い時間だと使用する測定方法やプロセス起動の時間などが無視できなくなってくる。