CUDA
#include <math.h>
#include <stdio.h>

// NOTE: despite the name, WARP_SIZE here is the block tile edge (16x16
// threads per block), not the hardware warp size (which is 32).
#define WARP_SIZE 16
#define DEBUG false
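/* The helpers below (GlobalDim, getGlobalDim, _copyHostDevice, _copyDeviceHost,
 * getGridBasedOnBlockSize, drawMatrix) are used throughout this file but not
 * defined in it; they presumably live in a companion file. These are minimal
 * sketches reconstructed from how each one is called here, not the author's
 * originals. (_sleep(), used in the DEBUG path of update_layer, is the Windows
 * CRT sleep; substitute sleep() from <unistd.h> on POSIX systems.) */
typedef struct {
    int x;
    int y;
} GlobalDim;

// Grid-wide x/y coordinates of the calling thread
__device__ GlobalDim getGlobalDim(dim3 bd, dim3 bi, dim3 ti) {
    GlobalDim gd;
    gd.x = bi.x * bd.x + ti.x;
    gd.y = bi.y * bd.y + ti.y;
    return gd;
}

// Allocate device memory for n floats and copy host data into it
float* _copyHostDevice(float *host, int n) {
    float *device;
    cudaMalloc((void**)&device, sizeof(float) * n);
    cudaMemcpy(device, host, sizeof(float) * n, cudaMemcpyHostToDevice);
    return device;
}

// Copy n floats from device memory back into host and return the host pointer
float* _copyDeviceHost(float *device, int n, float *host) {
    cudaMemcpy(host, device, sizeof(float) * n, cudaMemcpyDeviceToHost);
    return host;
}

// One block per blockSize x blockSize tile, rounded up to cover the problem
dim3 getGridBasedOnBlockSize(int width, int height, int blockSize) {
    dim3 grid((width + blockSize - 1) / blockSize,
              (height + blockSize - 1) / blockSize);
    return grid;
}

// Print a width x height row-major matrix (debugging aid)
void drawMatrix(float *m, int width, int height) {
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            printf("%8.4f ", m[y * width + x]);
        }
        printf("\n");
    }
}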
/* ---------------- [[CUDA KERNELS]] ---------------- */
__global__ void updateWeightsCUDA(float *weights, float *changes, float *delta_outputs, float *inputs, int n_inputs, int n_outputs) {
    int width = n_outputs;
    int height = n_inputs;
    GlobalDim gd = getGlobalDim(blockDim, blockIdx, threadIdx);
    if ((gd.x < width) && (gd.y < height)) {
        // One thread per weight: apply the delta for this input/output pair,
        // blending in the previous change as a momentum term
        int idx = width * gd.y + gd.x;
        float change = delta_outputs[gd.x] * inputs[gd.y];
        weights[idx] += 0.5 * change + 0.5 * changes[idx];
        changes[idx] = change;
    }
}
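/* Host-side reference of the same update rule, with the hard-coded constants
 * made explicit: a learning rate of 0.5 on the fresh change and a momentum of
 * 0.5 on the previous one. Hypothetical and not called anywhere in this file;
 * useful for checking the kernel's output on small inputs. */
void updateWeightsHost(float *weights, float *changes, float *delta_outputs,
                       float *inputs, int n_inputs, int n_outputs) {
    for (int y = 0; y < n_inputs; y++) {
        for (int x = 0; x < n_outputs; x++) {
            int idx = n_outputs * y + x;
            float change = delta_outputs[x] * inputs[y];
            weights[idx] += 0.5f * change + 0.5f * changes[idx];
            changes[idx] = change;
        }
    }
}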
__global__ void mapStepCUDA(float *inputs, float *matrix, float *buffer, int width, int height) {
    GlobalDim gd = getGlobalDim(blockDim, blockIdx, threadIdx);
    if ((gd.x < width) && (gd.y < height)) {
        // buffer[y][x] = inputs[y] * matrix[y][x]; the reduce step then sums
        // each column to finish the vector-matrix product
        int idx = width * gd.y + gd.x;
        buffer[idx] = inputs[gd.y] * matrix[idx];
    }
}
__global__ void reduceStepCUDA(float *input, float *output, int width, int height) {
    __shared__ float sharedMemory[WARP_SIZE * WARP_SIZE];
    GlobalDim gd = getGlobalDim(blockDim, blockIdx, threadIdx);
    bool inBounds = (gd.x < width) && (gd.y < height);
    int blockId = threadIdx.y * blockDim.x + threadIdx.x;
    // STEP 1: move this block's tile into shared memory. Out-of-bounds
    // threads store 0 instead of returning early, so that every thread in
    // the block reaches the __syncthreads() barriers below.
    sharedMemory[blockId] = inBounds ? input[gd.y * width + gd.x] : 0.0f;
    __syncthreads();
    // STEP 2: tree-reduce the tile along y, halving the active rows each pass
    int n = (int)ceil((float)blockDim.y / 2);
    while (n >= 1) {
        // Rows past height hold the zero padding, so skipping them is safe
        if (threadIdx.y < n && (gd.y + n) < height) {
            int firstIndex = blockId;
            int secondIndex = blockDim.x * (threadIdx.y + n) + threadIdx.x;
            sharedMemory[firstIndex] += sharedMemory[secondIndex];
        }
        __syncthreads();
        if (n == 1) {
            break;
        } else {
            n = (int)ceil((float)n / 2);
        }
    }
    // STEP 3: row 0 now holds this tile's column sums; write one output row
    // per block row
    if (inBounds && threadIdx.y == 0) {
        output[blockIdx.y * width + gd.x] = sharedMemory[threadIdx.x];
    }
}
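/* Sequential reference for reduceStepCUDA: summing every column of a
 * width x height buffer collapses it to a single row, which is what repeated
 * launches of the kernel converge to. Hypothetical and not called anywhere in
 * this file; useful for validating the reduction on small inputs. */
void reduceColumnsHost(float *input, float *output, int width, int height) {
    for (int x = 0; x < width; x++) {
        float sum = 0.0f;
        for (int y = 0; y < height; y++) {
            sum += input[y * width + x];
        }
        output[x] = sum;
    }
}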
/* ---------------- [[LAUNCH FUNCTIONS]] ---------------- */
void setWeightsForLayers(float *weights, float *changes, float *delta_outputs, float *inputs, int n_inputs, int n_outputs) {
    // Copy to device memory
    int grid_size = n_inputs * n_outputs;
    float *weights_d = _copyHostDevice(weights, grid_size);
    float *changes_d = _copyHostDevice(changes, grid_size);
    float *delta_outputs_d = _copyHostDevice(delta_outputs, n_outputs);
    float *inputs_d = _copyHostDevice(inputs, n_inputs);
    // Define block structure
    dim3 block(WARP_SIZE, WARP_SIZE);
    dim3 grid = getGridBasedOnBlockSize(n_outputs, n_inputs, WARP_SIZE);
    // RUN RUN RUN!
    updateWeightsCUDA<<<grid, block>>>(weights_d, changes_d, delta_outputs_d, inputs_d, n_inputs, n_outputs);
    // Copy back weights and momentum, then release device memory
    weights = _copyDeviceHost(weights_d, grid_size, weights);
    changes = _copyDeviceHost(changes_d, grid_size, changes);
    cudaFree(weights_d);
    cudaFree(changes_d);
    cudaFree(delta_outputs_d);
    cudaFree(inputs_d);
}
void update_layer(float *src_layer, float *dst_layer, int src_n, int dst_n, float *weights) {
    dim3 block(WARP_SIZE, WARP_SIZE);
    float *src_layer_d, *weights_d, *buffer_d;
    int total = src_n * dst_n;
    // Allocate input in global memory
    src_layer_d = _copyHostDevice(src_layer, src_n);
    weights_d = _copyHostDevice(weights, total);
    cudaMalloc((void**)&buffer_d, sizeof(float) * total);
    // Create block dimensions and run parallel update layer
    int gridX = (int)ceil((float)dst_n / WARP_SIZE);
    int gridY = (int)ceil((float)src_n / WARP_SIZE);
    dim3 grid(gridX, gridY);
    // RUN RUN RUN!
    if (DEBUG) {
        printf("\n***** Updating layer *****\n");
        printf("\nFrom\n");
        drawMatrix(src_layer, src_n, 1);
        printf("\nTo\n");
        drawMatrix(weights, dst_n, src_n);
    }
    mapStepCUDA<<<grid, block>>>(src_layer_d, weights_d, buffer_d, dst_n, src_n);
    // Reduce the src_n x dst_n product buffer one pass at a time until a
    // single row of dst_n column sums remains
    float *currentTarget = buffer_d;
    int currentHeight = src_n;
    while (currentHeight > 1) {
        // Calculate grid size
        int gridX = (int)ceil((float)dst_n / WARP_SIZE);
        int gridY = (int)ceil((float)currentHeight / WARP_SIZE);
        dim3 grid(gridX, gridY);
        // Allocate a fresh buffer: one partial row per block row
        float *next_buffer_d;
        cudaMalloc((void**)&next_buffer_d, sizeof(float) * (dst_n * gridY));
        // RUN RUN RUN!
        reduceStepCUDA<<<grid, block>>>(currentTarget, next_buffer_d, dst_n, currentHeight);
        // Free old memory and keep track of the new one
        cudaFree(currentTarget);
        currentHeight = grid.y;
        currentTarget = next_buffer_d;
    }
    // Copy the reduced row back and apply the tanh activation on the host
    dst_layer = _copyDeviceHost(currentTarget, dst_n, dst_layer);
    for (int i = 0; i < dst_n; i++) {
        dst_layer[i] = tanh(dst_layer[i]);
    }
    // Release the remaining device memory
    cudaFree(currentTarget);
    cudaFree(src_layer_d);
    cudaFree(weights_d);
    if (DEBUG) {
        printf("\nResult is\n");
        drawMatrix(dst_layer, dst_n, 1);
        printf("\n***** ENDED UPDATING LAYER *****\n");
        _sleep(1);
    }
}
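/* Minimal usage sketch, not part of the original gist: feed a made-up 4-unit
 * layer forward into a 3-unit layer. update_layer computes
 * dst = tanh(src . W) on the GPU; setWeightsForLayers follows the same
 * copy/launch/copy-back pattern. All sizes and values here are invented for
 * illustration. */
int main() {
    const int src_n = 4, dst_n = 3;
    float src_layer[src_n] = {0.5f, -0.25f, 0.75f, 1.0f};
    float dst_layer[dst_n];
    float weights[src_n * dst_n];
    for (int i = 0; i < src_n * dst_n; i++) {
        weights[i] = 0.1f * (i % 5);  // arbitrary deterministic weights
    }
    update_layer(src_layer, dst_layer, src_n, dst_n, weights);
    printf("Activated destination layer:\n");
    drawMatrix(dst_layer, dst_n, 1);
    return 0;
}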