Skip to content

Instantly share code, notes, and snippets.

@sylvchev
Created April 29, 2016 06:41
Show Gist options
  • Save sylvchev/756959f972bfc53e914f1a4f85ad16d1 to your computer and use it in GitHub Desktop.
Save sylvchev/756959f972bfc53e914f1a4f85ad16d1 to your computer and use it in GitHub Desktop.
Simple CUDA file for benchmarking cuFFT plan with different signal size, inspired by https://github.com/drufat/cuda-examples
// Simple cuFFT benchmark with CUDA, building on https://github.com/drufat/cuda-examples
// compiles with nvcc -DSIGNAL_SIZE=2048 -m64 -gencode arch=compute_30,code=sm_30 -O2 -use_fast_math -I commoninclude -l cufft cudafft.cu -o cudaff
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <cuda_runtime.h>
#include "helper_functions.h"
#include "helper_cuda.h"
#include <cufft.h>
// Forward declaration; the benchmark driver is defined further down in this file.
void runTest(int argc, char** argv);

// Number of complex samples in the test signal.
// Can be overridden at build time, e.g. -DSIGNAL_SIZE=2048.
#if !defined(SIGNAL_SIZE)
#define SIGNAL_SIZE 512
#endif
// Program entry point: forwards the command-line arguments to runTest.
int main(int argc, char** argv)
{
    runTest(argc, argv);
    return 0;
}
// Abort with a diagnostic when a cuFFT call does not return CUFFT_SUCCESS.
// NOTE(review): a local check is used instead of checkCudaErrors because
// helper_cuda.h is included before cufft.h in this file, so its cuFFT
// support may not be enabled -- confirm before switching to the macro.
static void checkCufftStatus(cufftResult status, const char* what)
{
    if (status != CUFFT_SUCCESS) {
        fprintf(stderr, "cuFFT error %d in %s\n", (int)status, what);
        exit(EXIT_FAILURE);
    }
}

//! Run and time a simple test for CUDA
//!
//! Fills a host buffer of SIGNAL_SIZE complex samples with random real
//! values, copies it to the device, then executes an in-place forward
//! C2C cuFFT transform `iterations` times (plus one untimed warm-up).
//! The elapsed time is reported twice: once from the SDK stopwatch and
//! once from gettimeofday().
//!
//! @param argc  unused
//! @param argv  unused
void runTest(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);
    struct timeval start, stop;
    double interval = 0.;
    const int iterations = 1000;

    // Size of the signal in bytes; size_t avoids int overflow for large SIGNAL_SIZE
    const size_t mem_size = sizeof(float2) * SIGNAL_SIZE;

    // Allocate host memory for the signal
    float2* h_signal = (float2*)malloc(mem_size);
    if (h_signal == NULL) {
        fprintf(stderr, "Failed to allocate %lu bytes of host memory\n",
                (unsigned long)mem_size);
        exit(EXIT_FAILURE);
    }

    // Initialize the memory for the signal: random real part, zero imaginary part
    for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
        h_signal[i].x = rand() / (float)RAND_MAX;
        h_signal[i].y = 0;
    }

    // Allocate device memory for signal
    float2* d_signal = NULL;
    checkCudaErrors(cudaMalloc((void**)&d_signal, mem_size));

    // Copy host memory to device
    checkCudaErrors(cudaMemcpy(d_signal, h_signal, mem_size,
                               cudaMemcpyHostToDevice));

    // CUFFT plan. The second argument is the transform length in ELEMENTS,
    // not bytes: passing the byte count would make every execution read and
    // write past the end of d_signal.
    cufftHandle plan;
    checkCufftStatus(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1), "cufftPlan1d");

    for (int i = -1; i < iterations; i++) {
        // i == -1 -- warmup iteration, excluded from the measurement
        if (i == 0) {
            // Make sure the warm-up has finished before the clocks start
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&hTimer);
            sdkStartTimer(&hTimer);
            gettimeofday(&start, NULL);
        }
        // In-place forward transform; the output is never read back,
        // only the execution time matters here.
        checkCufftStatus(cufftExecC2C(plan, (cufftComplex *)d_signal,
                                      (cufftComplex *)d_signal, CUFFT_FORWARD),
                         "cufftExecC2C");
    }

    // Wait for all queued transforms to complete before stopping the clocks
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    gettimeofday(&stop, NULL);

    // SDK stopwatch reports milliseconds; convert to seconds
    double gpuTime = 0.001 * sdkGetTimerValue(&hTimer);
    printf("Time elapsed for %d iterations : %.5f s (%g ms per iteration)\n", iterations, gpuTime, 1000.0*gpuTime/(double)iterations);

    // Wall-clock measurement in milliseconds
    interval = (stop.tv_sec - start.tv_sec)*1000.0;
    interval += (stop.tv_usec - start.tv_usec)/1000.0;
    printf("Time elapsed for %d iterations: %g ms (%g ms per iterations)\n", iterations, interval, interval/(double)iterations);

    // Destroy CUFFT context
    checkCufftStatus(cufftDestroy(plan), "cufftDestroy");

    // cleanup memory and the stopwatch
    free(h_signal);
    checkCudaErrors(cudaFree(d_signal));
    sdkDeleteTimer(&hTimer);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment