Created
October 14, 2018 21:48
-
-
Save Laurae2/6f7e2c43eed2ba0b94e822a05374a59c to your computer and use it in GitHub Desktop.
Stream in R (brute force & slow version)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Rcpp supplies cppFunction(), which compiles the inline C++ benchmark below.
library(Rcpp)

# OpenMP has to be enabled for BOTH the compile step and the link step:
# omp_get_wtime() and the parallel runtime live in the OpenMP library, so the
# flag is passed through PKG_LIBS as well as PKG_CXXFLAGS.
Sys.setenv(PKG_CXXFLAGS = "-fopenmp -std=c++11")
Sys.setenv(PKG_LIBS = "-fopenmp")
# streamer(): STREAM-style memory benchmark compiled through Rcpp with OpenMP.
#
# Arguments (as seen from R):
#   threads        numeric vector; only its first element is used, as the
#                  OpenMP thread count of every parallel loop. Must be >= 1.
#   triad_size     number of doubles in each of the three work vectors. The
#                  default (2147483647) needs about 48 GiB for the three
#                  vectors (3 * 8 bytes * 2147483647).
#   repeats        number of timed runs per kernel; the timings are averaged.
#                  Must be >= 1.
#   repeats_alloc  number of timed allocations of one work vector; a value
#                  below 1 skips the allocation measurement.
#   simd           TRUE  -> loops use "#pragma omp parallel for simd"
#                  FALSE -> loops use "#pragma omp parallel for"
#
# Side effects: prints the average time (ms) and derived speed of every stage
# with Rprintf.
#
# Returns: numeric vector of length 4 holding the COPY, SCALE, SUM and TRIAD
# speeds, in units of 1e9 element operations per second (TRIAD is counted as
# two operations per element).
#
# Editing note: the C++ source is an R single-quoted string, so it must not
# contain an apostrophe and every C++ backslash has to be doubled.
cppFunction(code = '#include <Rcpp.h>
#include <omp.h>
Rcpp::NumericVector streamer(Rcpp::NumericVector threads, int triad_size = 2147483647, int repeats = 10, int repeats_alloc = 3, bool simd = false) {

  // Reject inputs that would otherwise end in a division by zero (repeats),
  // an invalid num_threads clause (threads) or a failing allocation
  // (triad_size). The negated comparison also rejects NA/NaN thread counts.
  if (threads.size() < 1 || !(threads[0] >= 1.0)) {
    Rcpp::stop("threads must hold a thread count >= 1 in its first element");
  }
  if (triad_size < 1) {
    Rcpp::stop("triad_size must be >= 1");
  }
  if (repeats < 1) {
    Rcpp::stop("repeats must be >= 1");
  }

  const int nthreads = static_cast<int>(threads[0]);

  double start_time = 0.0;
  double end_time = 0.0;
  double output_in = 0.0; // accumulated milliseconds, later reused for the derived speed
  Rcpp::NumericVector output_out(4);

  // ---- Sequential allocation of one zero-initialised work vector ----------
  // The inner scope destroys the vector before the next repetition, and the
  // deallocation happens after end_time is taken, so it is not timed.
  // NOTE: a per-thread (parallel) allocation variant existed only as
  // commented-out code in the original and is intentionally not implemented.
  if (repeats_alloc > 0) {
    for (int j = 0; j < repeats_alloc; j++) {
      {
        start_time = omp_get_wtime();
        std::vector<double> scratch(triad_size);
        end_time = omp_get_wtime();
      }
      output_in = output_in + 1000 * (end_time - start_time);
    }

    output_in = output_in / repeats_alloc;
    Rprintf("Sequential memory preallocation timing: %f ms\\n", output_in);
    // elements per second * 8 bytes per double, expressed in GiB per second
    output_in = ((triad_size / (output_in / 1000)) * 8 / (1024 * 1024 * 1024));
    Rprintf("Sequential memory preallocation speed: %f GBps\\n", output_in);
  }

  output_in = 0.0;

  std::vector<double> a(triad_size);
  std::vector<double> b(triad_size);
  const double c = 3.0;
  std::vector<double> d(triad_size);

  // ---- Fill: write constants into a and b ----------------------------------
  for (int j = 0; j < repeats; j++) {
    start_time = omp_get_wtime();
    if (simd) {
      #pragma omp parallel for simd shared(a, b) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        a[i] = 1.0;
        b[i] = 2.0;
      }
    } else {
      #pragma omp parallel for shared(a, b) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        a[i] = 1.0;
        b[i] = 2.0;
      }
    }
    end_time = omp_get_wtime();
    output_in = output_in + 1000 * (end_time - start_time);
  }

  output_in = output_in / repeats;
  Rprintf("Memory filling timing: %f ms\\n", output_in);
  // factor 2: two vectors are written per pass
  output_in = 2 * ((triad_size / (output_in / 1000)) * 8 / (1024 * 1024 * 1024));
  Rprintf("Memory filling speed: %f GBps\\n", output_in);

  output_in = 0.0;

  // ---- COPY: d[i] = a[i] ---------------------------------------------------
  // In this and the following kernels the simd branch now really carries the
  // simd clause; the original had the two branches swapped.
  for (int j = 0; j < repeats; j++) {
    start_time = omp_get_wtime();
    if (simd) {
      #pragma omp parallel for simd shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i];
      }
    } else {
      #pragma omp parallel for shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i];
      }
    }
    end_time = omp_get_wtime();
    output_in = output_in + 1000 * (end_time - start_time);
  }

  output_in = output_in / repeats;
  Rprintf("Stream COPY timing: %f ms\\n", output_in);
  // a copy does no arithmetic, hence the "fake" label in the message
  output_in = triad_size / (output_in / 1000) / 1000000000;
  Rprintf("Stream COPY operation speed: %f (fake) GFLOPS\\n", output_in);
  output_out[0] = output_in;

  output_in = 0.0;

  // ---- SCALE: d[i] = c * a[i] ----------------------------------------------
  for (int j = 0; j < repeats; j++) {
    start_time = omp_get_wtime();
    if (simd) {
      #pragma omp parallel for simd shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = c * a[i];
      }
    } else {
      #pragma omp parallel for shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = c * a[i];
      }
    }
    end_time = omp_get_wtime();
    output_in = output_in + 1000 * (end_time - start_time);
  }

  output_in = output_in / repeats;
  Rprintf("Stream SCALE timing: %f ms\\n", output_in);
  output_in = triad_size / (output_in / 1000) / 1000000000;
  Rprintf("Stream SCALE operation speed: %f GFLOPS\\n", output_in);
  output_out[1] = output_in;

  output_in = 0.0;

  // ---- SUM: d[i] = a[i] + b[i] ---------------------------------------------
  for (int j = 0; j < repeats; j++) {
    start_time = omp_get_wtime();
    if (simd) {
      #pragma omp parallel for simd shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i] + b[i];
      }
    } else {
      #pragma omp parallel for shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i] + b[i];
      }
    }
    end_time = omp_get_wtime();
    output_in = output_in + 1000 * (end_time - start_time);
  }

  output_in = output_in / repeats;
  Rprintf("Stream SUM timing: %f ms\\n", output_in);
  output_in = triad_size / (output_in / 1000) / 1000000000;
  Rprintf("Stream SUM operation speed: %f GFLOPS\\n", output_in);
  output_out[2] = output_in;

  output_in = 0.0;

  // ---- TRIAD: d[i] = a[i] + c * b[i] ---------------------------------------
  for (int j = 0; j < repeats; j++) {
    start_time = omp_get_wtime();
    if (simd) {
      #pragma omp parallel for simd shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i] + c * b[i];
      }
    } else {
      #pragma omp parallel for shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i] + c * b[i];
      }
    }
    end_time = omp_get_wtime();
    output_in = output_in + 1000 * (end_time - start_time);
  }

  output_in = output_in / repeats;
  Rprintf("Stream TRIAD timing: %f ms\\n", output_in);
  // factor 2: one multiply and one add per element
  output_in = 2 * (triad_size / (output_in / 1000) / 1000000000);
  Rprintf("Stream TRIAD operation speed: %f GFLOPS\\n", output_in);
  output_out[3] = output_in;

  return output_out;

}', verbose = TRUE, showOutput = TRUE, rebuild = TRUE)
# Benchmark runs. Argument order: threads, triad_size, repeats, repeats_alloc,
# simd. Each size is run with 1, 32 and 64 threads, without and with SIMD;
# adjust the thread counts to the machine under test.

# 1e6 doubles per vector (8 MB each).
streamer(1, 1000000, 1000, 1000, FALSE)
streamer(1, 1000000, 1000, 1000, TRUE)
streamer(32, 1000000, 1000, 1000, FALSE)
streamer(32, 1000000, 1000, 1000, TRUE)
streamer(64, 1000000, 1000, 1000, FALSE)
streamer(64, 1000000, 1000, 1000, TRUE)

# 1e7 doubles per vector (80 MB each). repeats_alloc is 10 for every run of
# this size (one run originally used 100, unlike its five siblings).
streamer(1, 10000000, 100, 10, FALSE)
streamer(1, 10000000, 100, 10, TRUE)
streamer(32, 10000000, 100, 10, FALSE)
streamer(32, 10000000, 100, 10, TRUE)
streamer(64, 10000000, 100, 10, FALSE)
streamer(64, 10000000, 100, 10, TRUE)

# 1e9 doubles per vector (8 GB each, three vectors live at once).
streamer(1, 1000000000, 10, 3, FALSE)
streamer(1, 1000000000, 10, 3, TRUE)
streamer(32, 1000000000, 10, 3, FALSE)
streamer(32, 1000000000, 10, 3, TRUE)
streamer(64, 1000000000, 10, 3, FALSE)
streamer(64, 1000000000, 10, 3, TRUE)

# 2147483647 doubles per vector (largest value an int triad_size can hold;
# about 16 GiB per vector, roughly 48 GiB in total).
streamer(1, 2147483647, 10, 3, FALSE)
streamer(1, 2147483647, 10, 3, TRUE)
streamer(32, 2147483647, 10, 3, FALSE)
streamer(32, 2147483647, 10, 3, TRUE)
streamer(64, 2147483647, 10, 3, FALSE)
streamer(64, 2147483647, 10, 3, TRUE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment