CTC Torch test
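A minimal GPU smoke test for warp-ctc: it pushes a small hand-written batch of activations through compute_ctc_loss on a CUDA stream and prints the resulting per-sequence costs and gradients. The API used here (ctcOptions, get_workspace_size, compute_ctc_loss) is the C interface declared in warp-ctc's ctc.h.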
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <cuda_runtime.h>
#include "ctc.h"
#define CTC_CHECK(expr) abort_on_error((expr), __LINE__, __func__)
#define CUDA_CHECK(expr) cuda_abort_on_error((expr), __LINE__, __func__)
typedef struct ctcOptions ctcOptions;
static inline void abort_on_error(ctcStatus_t status, int line, const char * function_name)
{
    if (status != CTC_STATUS_SUCCESS)
    {
        const char * error_msg = ctcGetStatusString(status);
        fprintf(stderr, "[%s:%d] CTC error: %s\n", function_name, line, error_msg);
        exit(-1);
    }
}
static inline void cuda_abort_on_error(cudaError_t status, int line, const char * function_name)
{
    if (cudaSuccess != status)
    {
        const char * error_msg = cudaGetErrorString(status);
        fprintf(stderr, "[%s:%d] CUDA error: %s\n", function_name, line, error_msg);
        exit(-1);
    }
}
int main()
{
    const int alphabet_size = 5;
    const int minibatch_size = 3;
    const int num_activations = 3; // timesteps per sequence
    // 3 timesteps x 3 minibatch entries x 5-dimensional vectors, linearized
    const size_t activations_size = alphabet_size * minibatch_size * num_activations * sizeof(float);
    float activations_base[45] = {   0,   0,   0,   0,   0,
                                     1,   2,   3,   4,   5,
                                    -5,  -4,  -3,  -2,  -1,
                                     0,   0,   0,   0,   0,
                                     6,   7,   8,   9,  10,
                                   -10,  -9,  -8,  -7,  -6,
                                     0,   0,   0,   0,   0,
                                    11,  12,  13,  14,  15,
                                   -15, -14, -13, -12, -11 };
    float * activations = &(activations_base[0]);
    const int labels[5] = { 1, 3, 3, 2, 3 };
    const int label_lengths[3] = { 1, 2, 2 };
    int lengths[3] = { 1, 3, 3 }; // input length (timesteps) per minibatch entry
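    // labels is the concatenation of every sequence's label string, split
    // according to label_lengths: sequence 0 -> {1}, sequence 1 -> {3, 3},
    // sequence 2 -> {2, 3}. Label 0 is reserved for the CTC blank, so it
    // never appears here.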
    // Set up a CUDA stream for the computation
    cudaStream_t stream;
    CUDA_CHECK( cudaStreamCreate(&stream) );
    // Allocate GPU memory
    float * activations_gpu = NULL,
          * costs_gpu = NULL,
          * grads_gpu = NULL;
    CUDA_CHECK( cudaMalloc( &activations_gpu, activations_size ) );
    CUDA_CHECK( cudaMalloc( &costs_gpu, minibatch_size * sizeof(float) ) );
    CUDA_CHECK( cudaMalloc( &grads_gpu, activations_size ) );
    // Move activations data to the GPU
    CUDA_CHECK( cudaMemcpyAsync( activations_gpu, activations, activations_size,
                                 cudaMemcpyHostToDevice, stream ) );
    // Initialize warp-ctc options
    ctcOptions ctc_options;
    memset( &ctc_options, 0, sizeof(ctcOptions) );
    ctc_options.loc = CTC_GPU;
    ctc_options.stream = stream;
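    // The memset above zero-initializes the remaining fields, so blank_label
    // stays 0, warp-ctc's default blank symbol. A CPU run would instead set
    // ctc_options.loc = CTC_CPU and ctc_options.num_threads.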
    // Compute the CTC workspace size
    size_t gpu_alloc_bytes = 0;
    CTC_CHECK( get_workspace_size( &(label_lengths[0]), &(lengths[0]),
        alphabet_size, minibatch_size, ctc_options, &gpu_alloc_bytes ) );
    // Allocate the CTC workspace
    void * ctc_gpu_workspace = NULL;
    CUDA_CHECK( cudaMalloc( &ctc_gpu_workspace, gpu_alloc_bytes ) );
    // Compute the CTC loss
    CTC_CHECK( compute_ctc_loss( activations_gpu, grads_gpu, &(labels[0]),
        &(label_lengths[0]), &(lengths[0]), alphabet_size, minibatch_size, costs_gpu,
        ctc_gpu_workspace, ctc_options ) );
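    // compute_ctc_loss writes one negative log-likelihood per sequence into
    // costs_gpu, and the gradient with respect to the (pre-softmax)
    // activations, with the same layout as activations_gpu, into grads_gpu.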
    // Move results from the GPU to the host
    float * grads_host = NULL,
          * costs_host = NULL;
    CUDA_CHECK( cudaMallocHost( &grads_host, activations_size ) );
    CUDA_CHECK( cudaMallocHost( &costs_host, minibatch_size * sizeof(float) ) );
    CUDA_CHECK( cudaMemcpyAsync( grads_host, grads_gpu, activations_size,
                                 cudaMemcpyDeviceToHost, stream ) );
    CUDA_CHECK( cudaMemcpyAsync( costs_host, costs_gpu, minibatch_size * sizeof(float),
                                 cudaMemcpyDeviceToHost, stream ) );
    // Synchronize the stream to ensure the copies have finished
    CUDA_CHECK( cudaStreamSynchronize(stream) );
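    // grads_host mirrors the activations layout, time-major:
    // [num_activations][minibatch_size][alphabet_size].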
fprintf( stdout, "Costs: [ " );
for( size_t i = 0; i < minibatch_size; ++i )
{
fprintf( stdout, "%4.10g ", costs_host[i] );
}
fprintf(stdout, "]\n" );
fprintf( stdout, "Gradients: [\n" );
for( size_t k = 0; k < num_activations; ++k)
{
fprintf( stdout, "[" );
for( size_t j = 0; j < minibatch_size; ++j)
{
fprintf( stdout, "[ " );
for( size_t i = 0; i < alphabet_size; ++i )
{
fprintf( stdout, "%4.10g\t", grads_host[i + j*alphabet_size + k*minibatch_size*alphabet_size] );
}
fprintf(stdout, "]\n" );
}
fprintf( stdout, "]\n" );
}
fprintf(stdout, "]\n" );
    // Deallocate host memory
    CUDA_CHECK( cudaFreeHost( grads_host ) );
    CUDA_CHECK( cudaFreeHost( costs_host ) );
    // Deallocate GPU memory and destroy the stream
    CUDA_CHECK( cudaFree( activations_gpu ) );
    CUDA_CHECK( cudaFree( costs_gpu ) );
    CUDA_CHECK( cudaFree( grads_gpu ) );
    CUDA_CHECK( cudaFree( ctc_gpu_workspace ) );
    CUDA_CHECK( cudaStreamDestroy(stream) );
    return 0;
}