NEURAL NETS NEARLY IN PARALLEL
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
typedef struct {
    int n_inputs;
    int n_hidden;
    int n_outputs;
    float *out_input;
    float *out_hidden;
    float *out_output;
    float **changes_input_hidden;
    float **changes_hidden_output;
    float **w_input_hidden;
    float **w_hidden_output;
} NeuralNet;

typedef struct {
    int *result;
    int *data;
} Pattern;
void buildLayer(float *arr, int n, float initial) {
    int i = 0;
    while (i < n) {
        *arr = initial;
        arr++;
        i++;
    }
}

float** buildWeightsLayer(int outer_n, int inner_n, float seed) {
    int i;
    int total = outer_n * inner_n;
    float *data = (float *) malloc(sizeof(float) * total);
    for (i = 0; i < total; i++) {
        if (seed == -1) {
            data[i] = (float) rand() / (float) RAND_MAX;
        } else {
            data[i] = seed;
        }
    }
    float **ptr_arr = (float **) malloc(sizeof(float *) * outer_n);
    for (i = 0; i < outer_n; i++) {
        ptr_arr[i] = data + (i * inner_n);
    }
    return ptr_arr;
}
__global__ void calculateOutputForLayer(float *inputs, float *weight_matrix, int n_inputs, int n_outputs, float *output_matrix) {
    output_matrix[n_outputs * threadIdx.y + threadIdx.x] = inputs[threadIdx.y] * weight_matrix[n_outputs * threadIdx.y + threadIdx.x];
}

__global__ void doParallelReduction(float *d_matrix, int n_inputs, int n_outputs) {
    // Repeatedly fold the lower half of the rows onto the upper half until
    // row 0 holds the column sums.
    int n = n_inputs;
    while (n > 1) {
        int half = (n + 1) / 2;
        int first_element = n_outputs * threadIdx.y + threadIdx.x;
        int second_element = n_outputs * (threadIdx.y + half) + threadIdx.x;
        // Only threads whose partner row still lies inside the matrix do work,
        // which keeps odd row counts from reading past the end.
        if (threadIdx.y + half < n) {
            d_matrix[first_element] += d_matrix[second_element];
            printf("Summing %i with %i. res is: %f\n", first_element, second_element, d_matrix[first_element]);
        }
        __syncthreads();
        n = half;
        printf("Done batch for %i\n", n);
    }
}
int mainxx() {
    // PARALLEL REDUCTION test driver
    int n_inputs = 3;
    int n_outputs = 3;
    // Build the matrix and copy it to the device
    float matrix[9] = {1, 2, 3, 2, 3, 3, 3, 1, 2};
    float *d_matrix;
    cudaMalloc((void **)&d_matrix, sizeof(float) * 9);
    cudaMemcpy(d_matrix, matrix, sizeof(float) * 9, cudaMemcpyHostToDevice);
    dim3 block(n_outputs, n_inputs / 2);
    doParallelReduction<<<1, block>>>(d_matrix, n_inputs, n_outputs);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    float *res = (float *) malloc(sizeof(float) * 9);
    cudaMemcpy(res, d_matrix, sizeof(float) * 9, cudaMemcpyDeviceToHost);
    // Row 0 now holds the column sums
    for (int i = 0; i < 3; i++) {
        printf("%f\n", res[i]);
    }
    return 0;
}
int main4() {
    // Build inputs and weights
    int n_inputs = 4;
    int n_outputs = 2;
    float inputs[4] = {1, 2, 3, 4};
    float weights[8] = {2, 4, 2, 2, 2, 7, 2, 2};
    // Build output matrix
    float *d_output;
    cudaMalloc((void **)&d_output, sizeof(float) * n_inputs * n_outputs);
    // Copy layers
    float *d_inputs;
    cudaMalloc((void **)&d_inputs, sizeof(float) * 4);
    cudaMemcpy(d_inputs, inputs, sizeof(float) * 4, cudaMemcpyHostToDevice);
    float *d_weights;
    cudaMalloc((void **)&d_weights, sizeof(float) * 8);
    cudaMemcpy(d_weights, weights, sizeof(float) * 8, cudaMemcpyHostToDevice);
    // Run compute kernel
    dim3 block(n_outputs, n_inputs);
    calculateOutputForLayer<<<1, block>>>(d_inputs, d_weights, n_inputs, n_outputs, d_output);
    // Copy back output
    float *output = (float *) malloc(sizeof(float) * n_inputs * n_outputs);
    cudaMemcpy(output, d_output, sizeof(float) * n_inputs * n_outputs, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n_inputs * n_outputs; i++) {
        // printf("%f\n", output[i]);
    }
    cudaDeviceReset();
    return 0;
}
NeuralNet buildNeuralNet(int n_inputs, int n_outputs, int n_hidden) {
    float *out_input = (float *) malloc(sizeof(float) * (n_inputs + 1));
    float *out_hidden = (float *) malloc(sizeof(float) * n_hidden);
    float *out_output = (float *) malloc(sizeof(float) * n_outputs);
    buildLayer(out_input, n_inputs + 1, 1.0f);
    buildLayer(out_hidden, n_hidden, 1.0f);
    buildLayer(out_output, n_outputs, 1.0f);
    // Build changes layer
    float **changes_input_hidden = buildWeightsLayer(n_inputs + 1, n_hidden, 0.0f);
    float **changes_hidden_output = buildWeightsLayer(n_hidden, n_outputs, 0.0f);
    // Build weight matrix
    float **w_input_hidden = buildWeightsLayer(n_inputs + 1, n_hidden, -1.0f);
    float **w_hidden_output = buildWeightsLayer(n_hidden, n_outputs, -1.0f);
    NeuralNet nn;
    nn.n_inputs = n_inputs + 1;
    nn.n_outputs = n_outputs;
    nn.n_hidden = n_hidden;
    nn.out_input = out_input;
    nn.out_hidden = out_hidden;
    nn.out_output = out_output;
    nn.changes_input_hidden = changes_input_hidden;
    nn.changes_hidden_output = changes_hidden_output;
    nn.w_input_hidden = w_input_hidden;
    nn.w_hidden_output = w_hidden_output;
    return nn;
}
float *update_pattern(Pattern pattern, NeuralNet nn) {
    // Write inputs
    int i, j;
    float weighted_sum;
    for (i = 0; i < nn.n_inputs - 1; i++) {
        nn.out_input[i] = pattern.data[i];
    }
    // Write hidden
    for (i = 0; i < nn.n_hidden; i++) {
        weighted_sum = 0.0f;
        for (j = 0; j < nn.n_inputs; j++) {
            weighted_sum += nn.out_input[j] * nn.w_input_hidden[j][i];
        }
        nn.out_hidden[i] = tanh(weighted_sum);
        //printf("Hidden is : %f\n", nn.out_hidden[i]);
    }
    // Write output
    for (i = 0; i < nn.n_outputs; i++) {
        weighted_sum = 0.0f;
        for (j = 0; j < nn.n_hidden; j++) {
            weighted_sum += nn.out_hidden[j] * nn.w_hidden_output[j][i];
        }
        nn.out_output[i] = tanh(weighted_sum);
        //printf("Output is : %f\n", nn.out_output[i]);
    }
    return nn.out_output;
}
float dsigmoid(float y) {
    return 1.0 - pow(y, 2.0f);
}

float back_propagate_network(Pattern p, NeuralNet n) {
    // Calculate deltas
    int i, j;
    float *output_delta = (float *) malloc(sizeof(float) * n.n_outputs);
    float *hidden_delta = (float *) malloc(sizeof(float) * n.n_hidden);
    // Calculate output delta
    for (i = 0; i < n.n_outputs; i++) {
        float error = p.result[i] - n.out_output[i];
        output_delta[i] = dsigmoid(n.out_output[i]) * error;
    }
    // Calculate hidden delta
    for (i = 0; i < n.n_hidden; i++) {
        float error = 0.0f;
        for (j = 0; j < n.n_outputs; j++) {
            error += output_delta[j] * n.w_hidden_output[i][j];
        }
        hidden_delta[i] = dsigmoid(n.out_hidden[i]) * error;
    }
    // Set hidden-output weights
    for (i = 0; i < n.n_hidden; i++) {
        for (j = 0; j < n.n_outputs; j++) {
            float change = output_delta[j] * n.out_hidden[i];
            n.w_hidden_output[i][j] += 0.5 * change + 0.5 * n.changes_hidden_output[i][j];
            n.changes_hidden_output[i][j] = change;
        }
    }
    // Set input-hidden weights
    for (i = 0; i < n.n_inputs; i++) {
        for (j = 0; j < n.n_hidden; j++) {
            float change = hidden_delta[j] * n.out_input[i];
            n.w_input_hidden[i][j] += 0.5 * change + 0.5 * n.changes_input_hidden[i][j];
            n.changes_input_hidden[i][j] = change;
        }
    }
    // Calculate error
    float error = 0.0f;
    for (i = 0; i < n.n_outputs; i++) {
        error = error + 0.5f * pow(p.result[i] - n.out_output[i], 2);
    }
    // Release the scratch delta buffers
    free(output_delta);
    free(hidden_delta);
    return error;
}
void train_network(Pattern *patterns, int n_patterns, int n_iterations, NeuralNet nn) {
    int i, j;
    for (i = 0; i < n_iterations; i++) {
        float error = 0;
        for (j = 0; j < n_patterns; j++) {
            update_pattern(patterns[j], nn);
            error += back_propagate_network(patterns[j], nn);
        }
        if (i % 100 == 0) {
            printf("Error is: %-.5f\n", error);
        }
    }
}

Pattern makePatternSingleOutput(int *data, int result) {
    Pattern p;
    p.data = data;
    p.result = (int *) malloc(sizeof(int));
    p.result[0] = result;
    return p;
}
int main() {
    srand((unsigned) time(NULL));
    int n_inputs = 2;
    int n_hidden = 4;
    int n_outputs = 1;
    // Build the network
    NeuralNet nn = buildNeuralNet(n_inputs, n_outputs, n_hidden);
    // Build training samples
    int _p1[] = {0, 0};
    Pattern p1 = makePatternSingleOutput(_p1, 0);
    int _p2[] = {0, 1};
    Pattern p2 = makePatternSingleOutput(_p2, 1);
    int _p3[] = {1, 1};
    Pattern p3 = makePatternSingleOutput(_p3, 1);
    int _p4[] = {1, 0};
    Pattern p4 = makePatternSingleOutput(_p4, 1);
    Pattern patterns[] = {p1, p2, p3, p4};
    // Train the network
    train_network(patterns, 4, 100, nn);
    printf("\n\nTesting the network\n");
    update_pattern(p1, nn);
    for (int i = 0; i < nn.n_outputs; i++) {
        printf("Output: %f, expected: %i\n", nn.out_output[i], p1.result[i]);
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

typedef struct {
    int n_inputs;
    int n_hidden;
    int n_outputs;
    float *out_input;
    float *out_hidden;
    float *out_output;
    float **changes_input_hidden;
    float **changes_hidden_output;
    float **w_input_hidden;
    float **w_hidden_output;
} NeuralNet;

typedef struct {
    int *result;
    int *data;
} Pattern;
void buildLayer(float *arr, int n, float initial) {
    int i = 0;
    while (i < n) {
        *arr = initial;
        arr++;
        i++;
    }
}

float** buildWeightsLayer(int outer_n, int inner_n, float seed) {
    int i;
    int total = outer_n * inner_n;
    float *data = (float *) malloc(sizeof(float) * total);
    for (i = 0; i < total; i++) {
        if (seed == -1) {
            data[i] = (float) rand() / (float) RAND_MAX;
        } else {
            data[i] = seed;
        }
    }
    float **ptr_arr = malloc(sizeof(float *) * outer_n);
    for (i = 0; i < outer_n; i++) {
        ptr_arr[i] = data + (i * inner_n);
    }
    return ptr_arr;
}
NeuralNet buildNeuralNet(int n_inputs, int n_outputs, int n_hidden) {
    float *out_input = malloc(sizeof(float) * (n_inputs + 1));
    float *out_hidden = malloc(sizeof(float) * n_hidden);
    float *out_output = malloc(sizeof(float) * n_outputs);
    // The +1 here is for the bias node
    buildLayer(out_input, n_inputs + 1, 1.0f);
    buildLayer(out_hidden, n_hidden, 1.0f);
    buildLayer(out_output, n_outputs, 1.0f);
    /*
     * NOTE: Currently, our change matrix is an array of arrays. In order to make
     * the best use of the CUDA programming model, I will convert this to a
     * one-dimensional array. That way, we can apply tiling algorithms to
     * parallelize the matrix multiplication in a scalable way (a sketch of this
     * layout follows this function).
     */
    // Build changes layer (momentum) and weight matrix
    float **changes_input_hidden = buildWeightsLayer(n_inputs + 1, n_hidden, 0.0f);
    float **changes_hidden_output = buildWeightsLayer(n_hidden, n_outputs, 0.0f);
    float **w_input_hidden = buildWeightsLayer(n_inputs + 1, n_hidden, -1.0f);
    float **w_hidden_output = buildWeightsLayer(n_hidden, n_outputs, -1.0f);
    NeuralNet nn;
    nn.n_inputs = n_inputs + 1;
    nn.n_outputs = n_outputs;
    nn.n_hidden = n_hidden;
    nn.out_input = out_input;
    nn.out_hidden = out_hidden;
    nn.out_output = out_output;
    nn.changes_input_hidden = changes_input_hidden;
    nn.changes_hidden_output = changes_hidden_output;
    nn.w_input_hidden = w_input_hidden;
    nn.w_hidden_output = w_hidden_output;
    return nn;
}
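
/*
 * NOTE: Illustrative sketch of the flattened one-dimensional layout mentioned
 * above. buildWeightsFlat and IDX are hypothetical names, and row-major storage
 * is an assumption; a single contiguous buffer like this can be copied to the
 * GPU with one cudaMemcpy and indexed by tiled kernels without the extra
 * pointer indirection of a float**.
 */
#define IDX(row, col, n_cols) ((row) * (n_cols) + (col))

float *buildWeightsFlat(int outer_n, int inner_n, float seed) {
    float *data = malloc(sizeof(float) * outer_n * inner_n);
    int i;
    for (i = 0; i < outer_n * inner_n; i++) {
        data[i] = (seed == -1) ? (float) rand() / (float) RAND_MAX : seed;
    }
    return data;
}
/* Example access: flat[IDX(j, i, n_hidden)] plays the role of w_input_hidden[j][i]. */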
float *update_pattern(Pattern pattern, NeuralNet nn) {
    /*
     * NOTE: This is where CUDA will do the most work. As the MLP activation
     * is tanh(x * w) and each node in a layer can be updated independently,
     * we can treat x * w as a vector * matrix multiplication, where x is a
     * vector of all the inputs and w is a matrix containing all the weights for
     * all the nodes (a kernel sketch follows this function).
     */
    // Write inputs
    int i, j;
    float weighted_sum;
    for (i = 0; i < nn.n_inputs - 1; i++) {
        nn.out_input[i] = pattern.data[i];
    }
    // Write hidden
    for (i = 0; i < nn.n_hidden; i++) {
        weighted_sum = 0.0f;
        for (j = 0; j < nn.n_inputs; j++) {
            weighted_sum += nn.out_input[j] * nn.w_input_hidden[j][i];
        }
        nn.out_hidden[i] = tanh(weighted_sum);
    }
    // Write output
    for (i = 0; i < nn.n_outputs; i++) {
        weighted_sum = 0.0f;
        for (j = 0; j < nn.n_hidden; j++) {
            weighted_sum += nn.out_hidden[j] * nn.w_hidden_output[j][i];
        }
        nn.out_output[i] = tanh(weighted_sum);
    }
    return nn.out_output;
}
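
/*
 * NOTE: A possible CUDA kernel for the layer update above (sketch only, intended
 * for the .cu version of this code; feedForwardLayer and the one-thread-per-node
 * launch shape are illustrative assumptions). Each thread computes tanh of the
 * dot product between the input vector and its own column of the weight matrix,
 * assuming the flattened row-major layout sketched earlier.
 */
__global__ void feedForwardLayer(const float *in, const float *w,
                                 int n_in, int n_out, float *out) {
    int node = blockIdx.x * blockDim.x + threadIdx.x;
    if (node < n_out) {
        float sum = 0.0f;
        for (int j = 0; j < n_in; j++) {
            sum += in[j] * w[j * n_out + node];   // w[j][node] in the flat layout
        }
        out[node] = tanhf(sum);
    }
}
// Example launch: feedForwardLayer<<<1, nn.n_hidden>>>(d_input, d_w_input_hidden,
//                                                      nn.n_inputs, nn.n_hidden, d_hidden);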
float dsigmoid(float y) {
    return 1.0 - pow(y, 2.0);
}

float back_propagate_network(Pattern p, NeuralNet n) {
    /*
     * This is the backpropagation step, where the error is calculated and
     * propagated back through the network in order to adjust the weights
     * between neurons.
     * NOTE: This section will also be parallelised. Unfortunately, the hidden delta
     * needs to be calculated after the output delta, so we can only parallelize
     * part of the process (this is what I think currently, I might be wrong!).
     */
    int i, j;
    float output_delta[n.n_outputs];
    float hidden_delta[n.n_hidden];
    // Calculate output delta
    for (i = 0; i < n.n_outputs; i++) {
        float error = p.result[i] - n.out_output[i];
        output_delta[i] = dsigmoid(n.out_output[i]) * error;
    }
    // Calculate hidden delta
    for (i = 0; i < n.n_hidden; i++) {
        float error = 0.0f;
        for (j = 0; j < n.n_outputs; j++) {
            error += output_delta[j] * n.w_hidden_output[i][j];
        }
        hidden_delta[i] = dsigmoid(n.out_hidden[i]) * error;
    }
    /*
     * NOTE: Once the deltas have been calculated, we can update ALL the weights
     * at once. This section fits perfectly with the CUDA programming model of
     * grids and blocks (a kernel sketch follows this function).
     */
    // Set hidden-output weights
    for (i = 0; i < n.n_hidden; i++) {
        for (j = 0; j < n.n_outputs; j++) {
            float change = output_delta[j] * n.out_hidden[i];
            n.w_hidden_output[i][j] += 0.5 * change + 0.5 * n.changes_hidden_output[i][j];
            n.changes_hidden_output[i][j] = change;
        }
    }
    // Set input-hidden weights
    for (i = 0; i < n.n_inputs; i++) {
        for (j = 0; j < n.n_hidden; j++) {
            float change = hidden_delta[j] * n.out_input[i];
            n.w_input_hidden[i][j] += 0.5 * change + 0.5 * n.changes_input_hidden[i][j];
            n.changes_input_hidden[i][j] = change;
        }
    }
    // Calculate error
    float error = 0.0f;
    for (i = 0; i < n.n_outputs; i++) {
        error = error + 0.5f * pow(p.result[i] - n.out_output[i], 2);
    }
    return error;
}
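
/*
 * NOTE: A possible CUDA kernel for the "update all the weights at once" step
 * above (sketch only, intended for the .cu version of this code; updateWeightsLayer
 * and the one-thread-per-weight mapping are illustrative assumptions). The 0.5
 * learning rate and 0.5 momentum follow the constants used in the loops above,
 * and the flat row-major layout matches the earlier sketch.
 */
__global__ void updateWeightsLayer(const float *delta, const float *out,
                                   float *w, float *changes,
                                   int n_from, int n_to) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;   // source node
    int j = blockIdx.x * blockDim.x + threadIdx.x;   // destination node
    if (i < n_from && j < n_to) {
        int idx = i * n_to + j;                      // w[i][j] in the flat layout
        float change = delta[j] * out[i];
        w[idx] += 0.5f * change + 0.5f * changes[idx];
        changes[idx] = change;
    }
}
// Example launch: dim3 block(n.n_outputs, n.n_hidden);
//                 updateWeightsLayer<<<1, block>>>(d_output_delta, d_out_hidden,
//                                                  d_w_hidden_output, d_changes_hidden_output,
//                                                  n.n_hidden, n.n_outputs);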
void train_network(Pattern *patterns, int n_patterns, int n_iterations, NeuralNet nn) {
    int i, j;
    for (i = 0; i < n_iterations; i++) {
        float error = 0;
        for (j = 0; j < n_patterns; j++) {
            update_pattern(patterns[j], nn);
            error += back_propagate_network(patterns[j], nn);
        }
        if (i % 100 == 0) {
            printf("Error is: %-.5f\n", error);
        }
    }
}

Pattern makePatternSingleOutput(int *data, int result) {
    /* NOTE: This is a domain-specific function. The application
     * here is used to train the network to recognize XOR.
     * That said, the network scales perfectly well to different
     * problems, with different outputs and inputs.
     */
    Pattern p;
    p.data = data;
    p.result = malloc(sizeof(int));
    p.result[0] = result;
    return p;
}
int main() {
    srand((unsigned) time(NULL));
    int n_inputs = 2;
    int n_hidden = 4;
    int n_outputs = 1;
    // Build the network
    NeuralNet nn = buildNeuralNet(n_inputs, n_outputs, n_hidden);
    // Build training samples
    int _p1[] = {0, 0};
    Pattern p1 = makePatternSingleOutput(_p1, 0);
    int _p2[] = {0, 1};
    Pattern p2 = makePatternSingleOutput(_p2, 1);
    int _p3[] = {1, 1};
    Pattern p3 = makePatternSingleOutput(_p3, 1);
    int _p4[] = {1, 0};
    Pattern p4 = makePatternSingleOutput(_p4, 1);
    Pattern patterns[] = {p1, p2, p3, p4};
    // Train the network
    train_network(patterns, 4, 10000, nn);
    printf("\n\nTesting the network\n");
    update_pattern(p1, nn);
    int i;
    for (i = 0; i < nn.n_outputs; i++) {
        printf("Output: %f, expected: %i\n", nn.out_output[i], p1.result[i]);
    }
    return 0;
}