@sandeepkumar-skb
Last active October 10, 2020 00:30
Multi-Streaming Experiments
// pointwise_multi_thread_multi_stream.cu
// Each host thread allocates its own buffer and launches one kernel.
// Without --default-stream per-thread all threads share the legacy default
// stream; with the flag each thread gets its own default stream.
#include <stdio.h>
#include <thread>
#include <array>
#include <chrono>
#include <iostream>

const int N = 1 << 20;

__global__ void kernel(float *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        x[i] = sqrt(pow(3.14159, i));
    }
}

void launch_kernel()
{
    float *data;
    cudaMalloc(&data, N * sizeof(float));
    kernel<<<1, 64>>>(data, N);
    // 0 = default stream (the per-thread default stream when compiled
    // with --default-stream per-thread)
    cudaStreamSynchronize(0);
}

int main()
{
    const int num_threads = 8;
    std::array<std::thread, num_threads> workers;
    std::chrono::high_resolution_clock::time_point gpu_start;
    std::chrono::high_resolution_clock::time_point gpu_end;
    std::chrono::duration<double> gpu_span;
    int count = 100;
    while (count > 0) {
        gpu_start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < num_threads; i++) {
            workers[i] = std::thread(launch_kernel);
        }
        for (int i = 0; i < num_threads; i++) {
            workers[i].join();
        }
        cudaDeviceReset(); // releases the per-iteration allocations
        gpu_end = std::chrono::high_resolution_clock::now();
        gpu_span = std::chrono::duration_cast<std::chrono::duration<double>>(gpu_end - gpu_start);
        std::cout << "gpu time: " << gpu_span.count() * 1000 << "ms" << std::endl;
        --count;
    }
    return 0;
}
// pointwise_single_thread_single_stream.cu
// One host thread; all kernels are launched back-to-back on the default stream.
#include <chrono>
#include <iostream>

const int N = 1 << 20;

__global__ void kernel(float *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        x[i] = sqrt(pow(3.14159, i));
    }
}

int main()
{
    const int num_streams = 8;
    cudaStream_t streams[num_streams]; // unused here; kept for parity with the multi-stream version
    float *data[num_streams];
    std::chrono::high_resolution_clock::time_point gpu_start;
    std::chrono::high_resolution_clock::time_point gpu_end;
    std::chrono::duration<double> gpu_span;
    int count = 25;
    while (count > 0) {
        gpu_start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < num_streams; i++) {
            //cudaStreamCreate(&streams[i]);
            cudaMalloc(&data[i], N * sizeof(float));
            // launch one worker kernel per stream
            //kernel<<<1, 64, 0, streams[i]>>>(data[i], N);
            kernel<<<1, 64>>>(data[i], N);
            // launch a dummy kernel on the default stream
            //kernel<<<1, 1>>>(0, 0);
        }
        cudaDeviceReset(); // tears down the context, freeing the per-iteration allocations
        gpu_end = std::chrono::high_resolution_clock::now();
        gpu_span = std::chrono::duration_cast<std::chrono::duration<double>>(gpu_end - gpu_start);
        std::cout << "gpu time: " << gpu_span.count() * 1000 << "ms" << std::endl;
        --count;
    }
    return 0;
}
// pointwise_single_thread_multi_stream.cu
// One host thread; each kernel is launched into its own explicitly created stream.
#include <chrono>
#include <iostream>

const int N = 1 << 20;

__global__ void kernel(float *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        x[i] = sqrt(pow(3.14159, i));
    }
}

int main()
{
    const int num_streams = 8;
    cudaStream_t streams[num_streams];
    float *data[num_streams];
    std::chrono::high_resolution_clock::time_point gpu_start;
    std::chrono::high_resolution_clock::time_point gpu_end;
    std::chrono::duration<double> gpu_span;
    int count = 25;
    while (count > 0) {
        gpu_start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < num_streams; i++) {
            cudaStreamCreate(&streams[i]);
            cudaMalloc(&data[i], N * sizeof(float));
            // launch one worker kernel per stream
            kernel<<<1, 64, 0, streams[i]>>>(data[i], N);
            //kernel<<<1, 64>>>(data[i], N);
            // launch a dummy kernel on the default stream
            //kernel<<<1, 1>>>(0, 0);
        }
        cudaDeviceReset(); // destroys the streams and frees the per-iteration allocations
        gpu_end = std::chrono::high_resolution_clock::now();
        gpu_span = std::chrono::duration_cast<std::chrono::duration<double>>(gpu_end - gpu_start);
        std::cout << "gpu time: " << gpu_span.count() * 1000 << "ms" << std::endl;
        --count;
    }
    return 0;
}
@sandeepkumar-skb (Author)
Single Thread and Default Stream

  • compile: nvcc -std=c++14 pointwise_single_thread_single_stream.cu -o pointwise_single_thread_single_stream
  • results: 424.387ms

Single Thread and Multiple Streams

  • compile: nvcc -std=c++14 pointwise_single_thread_multi_stream.cu -o pointwise_single_thread_multi_stream
  • results: 122.329ms

Multi Thread and Single Stream

  • compile: nvcc -std=c++14 pointwise_multi_thread_multi_stream.cu -o pointwise_multi_thread_multi_stream
  • results: 346.042ms

Multi Thread and Multi Stream

  • compile: nvcc -std=c++14 --default-stream per-thread pointwise_multi_thread_multi_stream.cu -o pointwise_multi_thread_multi_stream
  • results: 46.4255ms
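
For comparison, a similar effect can likely be had without the --default-stream per-thread compile flag by giving each worker thread its own explicitly created non-blocking stream. The sketch below is not part of the gist: it reuses the kernel and N definitions from the multi-thread file above, and launch_kernel_explicit_stream is a hypothetical replacement for launch_kernel.

// Hypothetical variant of launch_kernel: each host thread creates its own
// non-blocking stream instead of relying on --default-stream per-thread.
void launch_kernel_explicit_stream()
{
    float *data;
    cudaStream_t stream;
    // cudaStreamNonBlocking avoids implicit synchronization with the legacy default stream
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaMalloc(&data, N * sizeof(float));
    kernel<<<1, 64, 0, stream>>>(data, N);  // launch on this thread's private stream
    cudaStreamSynchronize(stream);          // wait only for work in this stream
    cudaFree(data);
    cudaStreamDestroy(stream);
}

Built with a plain nvcc -std=c++14, this variant should avoid serializing the kernels on the shared legacy default stream, much like the --default-stream per-thread build.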
