Created
March 31, 2017 15:22
-
-
Save kaityo256/76b2c0e74e3991cee04abe2a08753f25 to your computer and use it in GitHub Desktop.
sample code using both v4df and OpenMP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <immintrin.h> | |
| #include <iostream> | |
| #include <fstream> | |
| #include <random> | |
| #include <math.h> | |
| #include <assert.h> | |
| #include <sys/stat.h> | |
| #include <sys/time.h> | |
| const double density = 1.0; | |
| const int N = 400000; | |
| const int MAX_PAIRS = 100 * N; | |
| double L = 50.0; | |
| const double dt = 0.001; | |
| const int D = 4; | |
| enum {X, Y, Z}; | |
| double q[N][D]; | |
| double p[N][D]; | |
| int particle_number = 0; | |
| int number_of_pairs = 0; | |
| int number_of_partners[N]; | |
| int i_particles[MAX_PAIRS]; | |
| int j_particles[MAX_PAIRS]; | |
| int pointer[N], pointer2[N]; | |
| int sorted_list[MAX_PAIRS]; | |
| const double CUTOFF_LENGTH = 3.0; | |
| const double SEARCH_LENGTH = 3.3; | |
| const double CL2 = CUTOFF_LENGTH * CUTOFF_LENGTH; | |
| //---------------------------------------------------------------------- | |
| typedef double v4df __attribute__((vector_size(32))); | |
| //---------------------------------------------------------------------- | |
| void | |
| add_particle(double x, double y, double z) { | |
| static std::mt19937 mt(2); | |
| std::uniform_real_distribution<double> ud(0.0, 0.1); | |
| q[particle_number][X] = x + ud(mt); | |
| q[particle_number][Y] = y + ud(mt); | |
| q[particle_number][Z] = z + ud(mt); | |
| particle_number++; | |
| } | |
| //---------------------------------------------------------------------- | |
| void | |
| register_pair(int index1, int index2) { | |
| int i, j; | |
| if (index1 < index2) { | |
| i = index1; | |
| j = index2; | |
| } else { | |
| i = index2; | |
| j = index1; | |
| } | |
| i_particles[number_of_pairs] = i; | |
| j_particles[number_of_pairs] = j; | |
| number_of_partners[i]++; | |
| number_of_pairs++; | |
| i_particles[number_of_pairs] = j; | |
| j_particles[number_of_pairs] = i; | |
| number_of_partners[j]++; | |
| number_of_pairs++; | |
| } | |
| //---------------------------------------------------------------------- | |
| void | |
| sortpair(void) { | |
| const int pn = particle_number; | |
| int pos = 0; | |
| pointer[0] = 0; | |
| for (int i = 0; i < pn - 1; i++) { | |
| pos += number_of_partners[i]; | |
| pointer[i + 1] = pos; | |
| } | |
| for (int i = 0; i < pn; i++) { | |
| pointer2[i] = 0; | |
| } | |
| const int s = number_of_pairs; | |
| for (int k = 0; k < s; k++) { | |
| int i = i_particles[k]; | |
| int j = j_particles[k]; | |
| int index = pointer[i] + pointer2[i]; | |
| sorted_list[index] = j; | |
| pointer2[i] ++; | |
| } | |
| } | |
| //---------------------------------------------------------------------- | |
| void | |
| makepair(void) { | |
| const double SL2 = SEARCH_LENGTH * SEARCH_LENGTH; | |
| const int pn = particle_number; | |
| for (int i = 0; i < pn; i++) { | |
| number_of_partners[i] = 0; | |
| } | |
| for (int i = 0; i < particle_number - 1; i++) { | |
| for (int j = i + 1; j < particle_number; j++) { | |
| const double dx = q[i][X] - q[j][X]; | |
| const double dy = q[i][Y] - q[j][Y]; | |
| const double dz = q[i][Z] - q[j][Z]; | |
| const double r2 = dx * dx + dy * dy + dz * dz; | |
| if (r2 < SL2) { | |
| register_pair(i, j); | |
| } | |
| } | |
| } | |
| } | |
| //---------------------------------------------------------------------- | |
| void | |
| init(void) { | |
| const double s = 1.0 / pow(density * 0.25, 1.0 / 3.0); | |
| const double hs = s * 0.5; | |
| int sx = static_cast<int>(L / s); | |
| int sy = static_cast<int>(L / s); | |
| int sz = static_cast<int>(L / s); | |
| for (int iz = 0; iz < sz; iz++) { | |
| for (int iy = 0; iy < sy; iy++) { | |
| for (int ix = 0; ix < sx; ix++) { | |
| double x = ix * s; | |
| double y = iy * s; | |
| double z = iz * s; | |
| add_particle(x , y , z); | |
| add_particle(x , y + hs, z + hs); | |
| add_particle(x + hs , y , z + hs); | |
| add_particle(x + hs , y + hs, z); | |
| } | |
| } | |
| } | |
| for (int i = 0; i < particle_number; i++) { | |
| p[i][X] = 0.0; | |
| p[i][Y] = 0.0; | |
| p[i][Z] = 0.0; | |
| } | |
| } | |
| //---------------------------------------------------------------------- | |
| __attribute__((noinline)) | |
| void | |
| mdacp_intrin_reactless() { | |
| #pragma omp parallel | |
| { | |
| const V4DF vc24 = _mm256_set1_pd(24.0 * dt); | |
| const V4DF vc48 = _mm256_set1_pd(48.0 * dt); | |
| const V4DF vcl2 = _mm256_set1_pd(CL2); | |
| const V4DF vzero = _mm256_setzero_pd(); | |
| const int pn = particle_number; | |
| #pragma omp for nowait | |
| for (int i = 0; i < pn; i++) { | |
| const V4DF vqi = _mm256_loadu_pd((double*)(q + i)); | |
| V4DF vpi = _mm256_loadu_pd((double*)(p + i)); | |
| const int np = number_of_partners[i]; | |
| const int kp = pointer[i]; | |
| for (int k = 0; k < (np / 4) * 4; k += 4) { | |
| const int j_a = sorted_list[kp + k]; | |
| const V4DF vqj_a = _mm256_loadu_pd((double*)(q + j_a)); | |
| V4DF vdq_a = vqj_a - vqi; | |
| const int j_b = sorted_list[kp + k + 1]; | |
| const V4DF vqj_b = _mm256_loadu_pd((double*)(q + j_b)); | |
| V4DF vdq_b = vqj_b - vqi; | |
| const int j_c = sorted_list[kp + k + 2]; | |
| const V4DF vqj_c = _mm256_loadu_pd((double*)(q + j_c)); | |
| V4DF vdq_c = vqj_c - vqi; | |
| const int j_d = sorted_list[kp + k + 3]; | |
| const V4DF vqj_d = _mm256_loadu_pd((double*)(q + j_d)); | |
| V4DF vdq_d = vqj_d - vqi; | |
| V4DF tmp0 = _mm256_unpacklo_pd(vdq_a, vdq_b); | |
| V4DF tmp1 = _mm256_unpackhi_pd(vdq_a, vdq_b); | |
| V4DF tmp2 = _mm256_unpacklo_pd(vdq_c, vdq_d); | |
| V4DF tmp3 = _mm256_unpackhi_pd(vdq_c, vdq_d); | |
| V4DF vdx = _mm256_permute2f128_pd(tmp0, tmp2, 0x20); | |
| V4DF vdy = _mm256_permute2f128_pd(tmp1, tmp3, 0x20); | |
| V4DF vdz = _mm256_permute2f128_pd(tmp0, tmp2, 0x31); | |
| V4DF vr2 = vdx * vdx + vdy * vdy + vdz * vdz; | |
| V4DF vr6 = vr2 * vr2 * vr2; | |
| V4DF vdf = (vc24 * vr6 - vc48) / (vr6 * vr6 * vr2); | |
| V4DF mask = vcl2 - vr2; | |
| vdf = _mm256_blendv_pd(vdf, vzero, mask); | |
| V4DF vdf_a = _mm256_permute4x64_pd(vdf, 0); | |
| V4DF vdf_b = _mm256_permute4x64_pd(vdf, 85); | |
| V4DF vdf_c = _mm256_permute4x64_pd(vdf, 170); | |
| V4DF vdf_d = _mm256_permute4x64_pd(vdf, 255); | |
| vpi += vdq_a * vdf_a; | |
| vpi += vdq_b * vdf_b; | |
| vpi += vdq_c * vdf_c; | |
| vpi += vdq_d * vdf_d; | |
| } | |
| _mm256_storeu_pd((double*)(p + i), vpi); | |
| double pfx = 0.0, pfy = 0.0, pfz = 0.0; | |
| const double qix = q[i][X], qiy = q[i][Y], qiz = q[i][Z]; | |
| for (int k = (np / 4) * 4; k < np; k++) { | |
| const int j = sorted_list[kp + k]; | |
| const double dx = q[j][X] - qix; | |
| const double dy = q[j][Y] - qiy; | |
| const double dz = q[j][Z] - qiz; | |
| const double r2 = (dx * dx + dy * dy + dz * dz); | |
| const double r6 = r2 * r2 * r2; | |
| double df = ((24.0 * r6 - 48.0) / (r6 * r6 * r2)) * dt; | |
| if (r2 > CL2) df = 0.0; | |
| pfx += df * dx; | |
| pfy += df * dy; | |
| pfz += df * dz; | |
| } | |
| p[i][X] += pfx; | |
| p[i][Y] += pfy; | |
| p[i][Z] += pfz; | |
| } | |
| } | |
| } | |
| //---------------------------------------------------------------------- | |
| void | |
| loadpair(void) { | |
| std::ifstream ifs("pair.dat", std::ios::binary); | |
| ifs.read((char*)&number_of_pairs, sizeof(int)); | |
| ifs.read((char*)number_of_partners, sizeof(int)*N); | |
| ifs.read((char*)i_particles, sizeof(int)*MAX_PAIRS); | |
| ifs.read((char*)j_particles, sizeof(int)*MAX_PAIRS); | |
| } | |
| //---------------------------------------------------------------------- | |
| void | |
| savepair(void) { | |
| makepair(); | |
| std::ofstream ofs("pair.dat", std::ios::binary); | |
| ofs.write((char*)&number_of_pairs, sizeof(int)); | |
| ofs.write((char*)number_of_partners, sizeof(int)*N); | |
| ofs.write((char*)i_particles, sizeof(int)*MAX_PAIRS); | |
| ofs.write((char*)j_particles, sizeof(int)*MAX_PAIRS); | |
| } | |
| //---------------------------------------------------------------------- | |
| void | |
| print_result(void) { | |
| for (int i = 0; i < 5; i++) { | |
| printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]); | |
| } | |
| for (int i = particle_number - 5; i < particle_number; i++) { | |
| printf("%.10f %.10f %.10f\n", p[i][X], p[i][Y], p[i][Z]); | |
| } | |
| } | |
| //---------------------------------------------------------------------- | |
| int | |
| main(void) { | |
| init(); | |
| struct stat st; | |
| int ret = stat("pair.dat", &st); | |
| if (ret == 0) { | |
| std::cerr << "A pair-file is found. I use it." << std::endl; | |
| loadpair(); | |
| } else { | |
| std::cerr << "Make pairlist." << std::endl; | |
| savepair(); | |
| } | |
| std::cerr << "Number of pairs: " << number_of_pairs << std::endl; | |
| sortpair(); | |
| mdacp_intrin_reactless(); | |
| print_result(); | |
| } | |
| //---------------------------------------------------------------------- |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
TARGET=icpc_v4df.out icpc_auto.out gcc_v4df.out gcc_auto.out
ICCOPT=-O3 -xHOST -std=c++11 -qopenmp
GCCOPT=-O3 -march=native -std=c++11 -fopenmp
all: $(TARGET)
icpc_auto.out: test.cpp$(ICCOPT) -DV4DF=auto $ < -o $@
icpc
icpc_v4df.out: test.cpp$(ICCOPT) -DV4DF=v4df $ < -o $@
icpc
gcc_auto.out: test.cpp$(GCCOPT) -DV4DF=auto $ < -o $@
g++
gcc_v4df.out: test.cpp$(GCCOPT) -DV4DF=v4df $ < -o $@
g++
clean:
rm -f $(TARGET) *.txt
test: $(TARGET)
./icpc_v4df.out > icpc_v4df.txt
./icpc_auto.out > icpc_auto.txt
./gcc_v4df.out > gcc_v4df.txt
./gcc_auto.out > gcc_auto.txt
diff gcc_v4df.txt gcc_auto.txt
diff icpc_auto.txt gcc_auto.txt
diff icpc_v4df.txt gcc_auto.txt