Skip to content

Instantly share code, notes, and snippets.

@Laurae2
Created October 14, 2018 21:48
Show Gist options
  • Save Laurae2/6f7e2c43eed2ba0b94e822a05374a59c to your computer and use it in GitHub Desktop.
Save Laurae2/6f7e2c43eed2ba0b94e822a05374a59c to your computer and use it in GitHub Desktop.
Stream in R (brute force & slow version)
# Rcpp supplies cppFunction(), which compiles the C++ benchmark defined below.
library(Rcpp)
# OpenMP has to be enabled for BOTH steps of the build:
#   PKG_CXXFLAGS -> compiler honours the "#pragma omp" directives (and uses C++11)
#   PKG_LIBS     -> linker pulls in the OpenMP runtime that provides omp_get_wtime()
# Setting only the compile flag can leave the shared object with unresolved OpenMP symbols.
Sys.setenv(PKG_CXXFLAGS = "-fopenmp -std=c++11", PKG_LIBS = "-fopenmp")
# Compile streamer(): an OpenMP STREAM-style benchmark (allocation, fill, COPY, SCALE, SUM, TRIAD).
# The C++ source is an R single-quoted string: it must not contain single quotes,
# and "\\n" below reaches the C++ compiler as "\n".
cppFunction(code = '#include <Rcpp.h>
#include <omp.h>
Rcpp::NumericVector streamer(Rcpp::NumericVector threads, int triad_size = 2147483647, int repeats = 10, int repeats_alloc = 3, bool simd = false) {
  // Times memory allocation, memory filling and the four STREAM kernels, printing
  // each averaged timing and derived rate with Rprintf.
  //
  // threads       : only threads[0] is read; it is the OpenMP thread count (num_threads).
  // triad_size    : number of doubles in each of the three work vectors (a, b, d).
  // repeats       : timed repetitions of the fill loop and of every kernel.
  // repeats_alloc : timed repetitions of the allocation test.
  // simd          : true  -> loops use "omp parallel for simd"
  //                 false -> loops use "omp parallel for"
  //
  // Returns a length-4 vector of rates in 1e9 elements per second:
  //   [0] COPY, [1] SCALE, [2] SUM, [3] TRIAD (TRIAD is doubled: two operations per element).
  // Raises an R error (Rcpp::stop) on an empty threads vector or non-positive sizes/counts.

  // Reject inputs that would otherwise index out of bounds, divide by zero,
  // or hand a negative size to std::vector.
  if (threads.size() < 1) {
    Rcpp::stop("threads must contain at least one value");
  }
  const double threads_requested = threads[0];
  if (!(threads_requested >= 1.0 && threads_requested <= 2147483647.0)) {
    Rcpp::stop("threads[0] must be a number between 1 and 2147483647");
  }
  if (triad_size < 1 || repeats < 1 || repeats_alloc < 1) {
    Rcpp::stop("triad_size, repeats and repeats_alloc must all be >= 1");
  }

  const int nthreads = static_cast<int>(threads_requested);
  const double bytes_per_gib = 1024.0 * 1024.0 * 1024.0;
  double start_time = 0.0;
  double end_time = 0.0;
  double elapsed_ms = 0.0;  // accumulated, then averaged, wall time of the current section
  double rate = 0.0;        // derived throughput of the current section
  Rcpp::NumericVector output_out(4);

  // ---- Sequential allocation -------------------------------------------------
  // The inner scope ends after end_time is taken, so releasing the vector is not timed.
  // Only sequential allocation is measured; no parallel allocation benchmark is run.
  for (int j = 0; j < repeats_alloc; j++) {
    {
      start_time = omp_get_wtime();
      std::vector<double> scratch(triad_size);
      end_time = omp_get_wtime();
    }
    elapsed_ms += 1000 * (end_time - start_time);
  }
  elapsed_ms = elapsed_ms / repeats_alloc;
  Rprintf("Sequential memory preallocation timing: %f ms\\n", elapsed_ms);
  rate = ((triad_size / (elapsed_ms / 1000)) * 8 / bytes_per_gib);
  Rprintf("Sequential memory preallocation speed: %f GBps\\n", rate);

  // Work arrays shared by every kernel below.
  std::vector<double> a(triad_size);
  std::vector<double> b(triad_size);
  double c = 3.0;
  std::vector<double> d(triad_size);

  // ---- Fill: a[i] = 1, b[i] = 2 ----------------------------------------------
  elapsed_ms = 0.0;
  for (int j = 0; j < repeats; j++) {
    if (simd) {
      start_time = omp_get_wtime();
      #pragma omp parallel for simd shared(a, b) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        a[i] = 1.0;
        b[i] = 2.0;
      }
      end_time = omp_get_wtime();
    } else {
      start_time = omp_get_wtime();
      #pragma omp parallel for shared(a, b) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        a[i] = 1.0;
        b[i] = 2.0;
      }
      end_time = omp_get_wtime();
    }
    elapsed_ms += 1000 * (end_time - start_time);
  }
  elapsed_ms = elapsed_ms / repeats;
  Rprintf("Memory filling timing: %f ms\\n", elapsed_ms);
  // Factor 2: two arrays of 8-byte doubles are written per iteration.
  rate = 2 * ((triad_size / (elapsed_ms / 1000)) * 8 / bytes_per_gib);
  Rprintf("Memory filling speed: %f GBps\\n", rate);

  // ---- COPY: d[i] = a[i] -----------------------------------------------------
  elapsed_ms = 0.0;
  for (int j = 0; j < repeats; j++) {
    if (simd) {
      start_time = omp_get_wtime();
      #pragma omp parallel for simd shared(a, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i];
      }
      end_time = omp_get_wtime();
    } else {
      start_time = omp_get_wtime();
      #pragma omp parallel for shared(a, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i];
      }
      end_time = omp_get_wtime();
    }
    elapsed_ms += 1000 * (end_time - start_time);
  }
  elapsed_ms = elapsed_ms / repeats;
  Rprintf("Stream COPY timing: %f ms\\n", elapsed_ms);
  rate = triad_size / (elapsed_ms / 1000) / 1000000000;
  Rprintf("Stream COPY operation speed: %f (fake) GFLOPS\\n", rate);
  output_out[0] = rate;

  // ---- SCALE: d[i] = c * a[i] ------------------------------------------------
  elapsed_ms = 0.0;
  for (int j = 0; j < repeats; j++) {
    if (simd) {
      start_time = omp_get_wtime();
      #pragma omp parallel for simd shared(a, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = c * a[i];
      }
      end_time = omp_get_wtime();
    } else {
      start_time = omp_get_wtime();
      #pragma omp parallel for shared(a, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = c * a[i];
      }
      end_time = omp_get_wtime();
    }
    elapsed_ms += 1000 * (end_time - start_time);
  }
  elapsed_ms = elapsed_ms / repeats;
  Rprintf("Stream SCALE timing: %f ms\\n", elapsed_ms);
  rate = triad_size / (elapsed_ms / 1000) / 1000000000;
  Rprintf("Stream SCALE operation speed: %f GFLOPS\\n", rate);
  output_out[1] = rate;

  // ---- SUM: d[i] = a[i] + b[i] -----------------------------------------------
  elapsed_ms = 0.0;
  for (int j = 0; j < repeats; j++) {
    if (simd) {
      start_time = omp_get_wtime();
      #pragma omp parallel for simd shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i] + b[i];
      }
      end_time = omp_get_wtime();
    } else {
      start_time = omp_get_wtime();
      #pragma omp parallel for shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i] + b[i];
      }
      end_time = omp_get_wtime();
    }
    elapsed_ms += 1000 * (end_time - start_time);
  }
  elapsed_ms = elapsed_ms / repeats;
  Rprintf("Stream SUM timing: %f ms\\n", elapsed_ms);
  rate = triad_size / (elapsed_ms / 1000) / 1000000000;
  Rprintf("Stream SUM operation speed: %f GFLOPS\\n", rate);
  output_out[2] = rate;

  // ---- TRIAD: d[i] = a[i] + c * b[i] -----------------------------------------
  elapsed_ms = 0.0;
  for (int j = 0; j < repeats; j++) {
    if (simd) {
      start_time = omp_get_wtime();
      #pragma omp parallel for simd shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i] + c * b[i];
      }
      end_time = omp_get_wtime();
    } else {
      start_time = omp_get_wtime();
      #pragma omp parallel for shared(a, b, d) num_threads(nthreads)
      for (int i = 0; i < triad_size; i++) {
        d[i] = a[i] + c * b[i];
      }
      end_time = omp_get_wtime();
    }
    elapsed_ms += 1000 * (end_time - start_time);
  }
  elapsed_ms = elapsed_ms / repeats;
  Rprintf("Stream TRIAD timing: %f ms\\n", elapsed_ms);
  // Factor 2: one multiply and one add per element.
  rate = 2 * (triad_size / (elapsed_ms / 1000) / 1000000000);
  Rprintf("Stream TRIAD operation speed: %f GFLOPS\\n", rate);
  output_out[3] = rate;

  return output_out;
}', verbose = TRUE, showOutput = TRUE, rebuild = TRUE)
# Benchmark sweep: for each problem size, run 1, 32 and 64 threads, each without and with SIMD.
# streamer() keeps three double vectors of triad_size elements alive (24 bytes per element).

# 1e6 elements per vector (about 24 MB in total)
streamer(threads = 1, triad_size = 1000000, repeats = 1000, repeats_alloc = 1000, simd = FALSE)
streamer(threads = 1, triad_size = 1000000, repeats = 1000, repeats_alloc = 1000, simd = TRUE)
streamer(threads = 32, triad_size = 1000000, repeats = 1000, repeats_alloc = 1000, simd = FALSE)
streamer(threads = 32, triad_size = 1000000, repeats = 1000, repeats_alloc = 1000, simd = TRUE)
streamer(threads = 64, triad_size = 1000000, repeats = 1000, repeats_alloc = 1000, simd = FALSE)
streamer(threads = 64, triad_size = 1000000, repeats = 1000, repeats_alloc = 1000, simd = TRUE)

# 1e7 elements per vector (about 240 MB in total)
streamer(threads = 1, triad_size = 10000000, repeats = 100, repeats_alloc = 10, simd = FALSE)
streamer(threads = 1, triad_size = 10000000, repeats = 100, repeats_alloc = 10, simd = TRUE)
# NOTE(review): repeats_alloc is 100 on the next run while its siblings use 10 - confirm this is intended.
streamer(threads = 32, triad_size = 10000000, repeats = 100, repeats_alloc = 100, simd = FALSE)
streamer(threads = 32, triad_size = 10000000, repeats = 100, repeats_alloc = 10, simd = TRUE)
streamer(threads = 64, triad_size = 10000000, repeats = 100, repeats_alloc = 10, simd = FALSE)
streamer(threads = 64, triad_size = 10000000, repeats = 100, repeats_alloc = 10, simd = TRUE)

# 1e9 elements per vector (about 24 GB in total)
streamer(threads = 1, triad_size = 1000000000, repeats = 10, repeats_alloc = 3, simd = FALSE)
streamer(threads = 1, triad_size = 1000000000, repeats = 10, repeats_alloc = 3, simd = TRUE)
streamer(threads = 32, triad_size = 1000000000, repeats = 10, repeats_alloc = 3, simd = FALSE)
streamer(threads = 32, triad_size = 1000000000, repeats = 10, repeats_alloc = 3, simd = TRUE)
streamer(threads = 64, triad_size = 1000000000, repeats = 10, repeats_alloc = 3, simd = FALSE)
streamer(threads = 64, triad_size = 1000000000, repeats = 10, repeats_alloc = 3, simd = TRUE)

# 2147483647 elements per vector, the largest value of a 32-bit int (about 51.5 GB in total)
streamer(threads = 1, triad_size = 2147483647, repeats = 10, repeats_alloc = 3, simd = FALSE)
streamer(threads = 1, triad_size = 2147483647, repeats = 10, repeats_alloc = 3, simd = TRUE)
streamer(threads = 32, triad_size = 2147483647, repeats = 10, repeats_alloc = 3, simd = FALSE)
streamer(threads = 32, triad_size = 2147483647, repeats = 10, repeats_alloc = 3, simd = TRUE)
streamer(threads = 64, triad_size = 2147483647, repeats = 10, repeats_alloc = 3, simd = FALSE)
streamer(threads = 64, triad_size = 2147483647, repeats = 10, repeats_alloc = 3, simd = TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment