NEURAL NETS NEARLY IN PARALLEL
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
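/*
 * A small multilayer perceptron (input -> hidden -> output, tanh activations)
 * trained with backpropagation on a set of two-input patterns. main() runs the
 * training entirely on the host; mainxx() and main4() are stand-alone CUDA
 * experiments for the forward pass (per-element products of inputs and weights,
 * followed by a column-wise parallel reduction).
 */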
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
typedef struct {
    int n_inputs;
    int n_hidden;
    int n_outputs;
    float *out_input;
    float *out_hidden;
    float *out_output;
    float **changes_input_hidden;
    float **changes_hidden_output;
    float **w_input_hidden;
    float **w_hidden_output;
} NeuralNet;

typedef struct {
    int *result;
    int *data;
} Pattern;
void buildLayer(float *arr, int n, float initial) {
    int i = 0;
    while (i < n) {
        *arr = initial;
        arr++;
        i++;
    }
}

float** buildWeightsLayer(int outer_n, int inner_n, float seed) {
    int i;
    int total = outer_n * inner_n;
    float *data = (float*) malloc(sizeof(float) * total);
    for (i = 0; i < total; i++) {
        if (seed == -1) {
            // A seed of -1 means "initialise randomly in [0, 1]"
            data[i] = (float)rand() / (float)RAND_MAX;
        } else {
            data[i] = seed;
        }
    }
    // One contiguous allocation plus an array of row pointers into it
    float **ptr_arr = (float **)malloc(sizeof(float*) * outer_n);
    for (i = 0; i < outer_n; i++) {
        ptr_arr[i] = data + (i * inner_n);
    }
    return ptr_arr;
}
// One thread per (input, output) pair: thread (x, y) stores inputs[y] * weights[y][x]
// into an n_inputs x n_outputs product matrix, which is reduced column-wise later.
__global__ void calculateOutputForLayer(float *inputs, float *weight_matrix, int n_inputs, int n_outputs, float *output_matrix) {
    output_matrix[n_outputs * threadIdx.y + threadIdx.x] = inputs[threadIdx.y] * weight_matrix[n_outputs * threadIdx.y + threadIdx.x];
}
// Column-wise tree reduction: when the kernel finishes, row 0 of d_matrix holds
// the sum of every column. Launched with an n_outputs x (n_inputs / 2) block.
__global__ void doParallelReduction(float *d_matrix, int n_inputs, int n_outputs) {
    // Number of rows still holding partial sums
    int count = n_inputs;
    while (count > 1) {
        int half = (count + 1) / 2;   // ceil(count / 2)
        int first_element = n_outputs * threadIdx.y + threadIdx.x;
        int second_element = n_outputs * (threadIdx.y + half) + threadIdx.x;
        // Bounds check: avoid reading past the last active row when count is odd
        if (threadIdx.y + half < count) {
            d_matrix[first_element] += d_matrix[second_element];
            printf("Summing %i with %i. res is: %f\n", first_element, second_element, d_matrix[first_element]);
        }
        __syncthreads();
        printf("Done batch for %d\n", half);
        count = half;
    }
}
int mainxx() {
    // PARALLEL REDUCTION: sum the columns of a 3x3 matrix on the GPU
    int n_inputs = 3;
    int n_outputs = 3;
    // Build output matrix and copy
    float matrix[9] = {1, 2, 3, 2, 3, 3, 3, 1, 2};
    float *d_matrix;
    cudaMalloc((void**)&d_matrix, sizeof(float) * 9);
    cudaMemcpy(d_matrix, matrix, sizeof(float) * 9, cudaMemcpyHostToDevice);
    dim3 block(n_outputs, n_inputs / 2);
    doParallelReduction<<<1, block>>>(d_matrix, n_inputs, n_outputs);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    float *res = (float *)malloc(sizeof(float) * 9);
    cudaMemcpy(res, d_matrix, sizeof(float) * 9, cudaMemcpyDeviceToHost);
    // Row 0 now holds the column sums
    for (int i = 0; i < 3; i++) {
        printf("%f\n", res[i]);
    }
    return 0;
}
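/*
 * With the 3x3 matrix above, row 0 of the reduced result holds the column sums,
 * so the three printed values should be 6, 6 and 8.
 */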
int main4() {
    // Build inputs and weights
    int n_inputs = 4;
    int n_outputs = 2;
    float inputs[4] = {1, 2, 3, 4};
    float weights[8] = {2, 4, 2, 2, 2, 7, 2, 2};
    // Build output matrix
    float *d_output;
    cudaMalloc((void**)&d_output, sizeof(float) * n_inputs * n_outputs);
    // Copy layers
    float *d_inputs;
    cudaMalloc((void**)&d_inputs, sizeof(float) * 4);
    cudaMemcpy(d_inputs, inputs, sizeof(float) * 4, cudaMemcpyHostToDevice);
    float *d_weights;
    cudaMalloc((void**)&d_weights, sizeof(float) * 8);
    cudaMemcpy(d_weights, weights, sizeof(float) * 8, cudaMemcpyHostToDevice);
    // Run compute kernel
    dim3 block(n_outputs, n_inputs);
    calculateOutputForLayer<<<1, block>>>(d_inputs, d_weights, n_inputs, n_outputs, d_output);
    // Copy back output
    float *output = (float *)malloc(sizeof(float) * n_inputs * n_outputs);
    cudaMemcpy(output, d_output, sizeof(float) * n_inputs * n_outputs, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n_inputs * n_outputs; i++) {
        // printf("%f\n", output[i]);
    }
    cudaDeviceReset();
    return 0;
}
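/*
 * A minimal sketch, not wired into the training code below, of how the two
 * kernels above could compose into a single GPU forward-layer step: multiply
 * element-wise, reduce column-wise, then apply tanh on the host. It assumes the
 * layer is small enough for one thread block, as in mainxx() and main4().
 */
void forwardLayerGPU(float *h_inputs, float *h_weights, int n_inputs, int n_outputs, float *h_out) {
    float *d_inputs, *d_weights, *d_products;
    cudaMalloc((void**)&d_inputs, sizeof(float) * n_inputs);
    cudaMalloc((void**)&d_weights, sizeof(float) * n_inputs * n_outputs);
    cudaMalloc((void**)&d_products, sizeof(float) * n_inputs * n_outputs);
    cudaMemcpy(d_inputs, h_inputs, sizeof(float) * n_inputs, cudaMemcpyHostToDevice);
    cudaMemcpy(d_weights, h_weights, sizeof(float) * n_inputs * n_outputs, cudaMemcpyHostToDevice);
    // Step 1: one thread per (input, output) pair computes inputs[y] * weights[y][x]
    dim3 mul_block(n_outputs, n_inputs);
    calculateOutputForLayer<<<1, mul_block>>>(d_inputs, d_weights, n_inputs, n_outputs, d_products);
    // Step 2: reduce the n_inputs rows of products into row 0, one column per x thread
    dim3 red_block(n_outputs, n_inputs / 2 > 0 ? n_inputs / 2 : 1);
    doParallelReduction<<<1, red_block>>>(d_products, n_inputs, n_outputs);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    // Row 0 now holds the weighted sum for each output node; finish with tanh on the host
    cudaMemcpy(h_out, d_products, sizeof(float) * n_outputs, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n_outputs; i++) {
        h_out[i] = tanh(h_out[i]);
    }
    cudaFree(d_inputs);
    cudaFree(d_weights);
    cudaFree(d_products);
}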
NeuralNet buildNeuralNet(int n_inputs, int n_outputs, int n_hidden) {
    float *out_input = (float *)malloc(sizeof(float) * (n_inputs + 1));
    float *out_hidden = (float *)malloc(sizeof(float) * n_hidden);
    float *out_output = (float *)malloc(sizeof(float) * n_outputs);
    // The +1 accounts for the bias node
    buildLayer(out_input, n_inputs + 1, 1.0f);
    buildLayer(out_hidden, n_hidden, 1.0f);
    buildLayer(out_output, n_outputs, 1.0f);
    // Build changes (momentum) layer
    float **changes_input_hidden = buildWeightsLayer(n_inputs + 1, n_hidden, 0.0f);
    float **changes_hidden_output = buildWeightsLayer(n_hidden, n_outputs, 0.0f);
    // Build weight matrix
    float **w_input_hidden = buildWeightsLayer(n_inputs + 1, n_hidden, -1.0f);
    float **w_hidden_output = buildWeightsLayer(n_hidden, n_outputs, -1.0f);
    NeuralNet nn;
    nn.n_inputs = n_inputs + 1;
    nn.n_outputs = n_outputs;
    nn.n_hidden = n_hidden;
    nn.out_input = out_input;
    nn.out_hidden = out_hidden;
    nn.out_output = out_output;
    nn.changes_input_hidden = changes_input_hidden;
    nn.changes_hidden_output = changes_hidden_output;
    nn.w_input_hidden = w_input_hidden;
    nn.w_hidden_output = w_hidden_output;
    return nn;
}
float *update_pattern(Pattern pattern, NeuralNet nn) {
    // Write inputs (the last input slot is the bias node and stays at 1.0)
    int i, j;
    float weighted_sum;
    for (i = 0; i < nn.n_inputs - 1; i++) {
        nn.out_input[i] = pattern.data[i];
    }
    // Write hidden
    for (i = 0; i < nn.n_hidden; i++) {
        weighted_sum = 0.0f;
        for (j = 0; j < nn.n_inputs; j++) {
            weighted_sum += nn.out_input[j] * nn.w_input_hidden[j][i];
        }
        nn.out_hidden[i] = tanh(weighted_sum);
        //printf("Hidden is : %f\n", nn.out_hidden[i]);
    }
    // Write output
    for (i = 0; i < nn.n_outputs; i++) {
        weighted_sum = 0.0f;
        for (j = 0; j < nn.n_hidden; j++) {
            weighted_sum += nn.out_hidden[j] * nn.w_hidden_output[j][i];
        }
        nn.out_output[i] = tanh(weighted_sum);
        //printf("Output is : %f\n", nn.out_output[i]);
    }
    return nn.out_output;
}
// Derivative of tanh expressed in terms of the node's output y = tanh(x):
// d/dx tanh(x) = 1 - y^2
float dsigmoid(float y) {
    return 1.0 - pow(y, 2.0f);
}
float back_propagate_network(Pattern p, NeuralNet n) {
    // Calculate deltas
    int i, j;
    float *output_delta = (float *) malloc(sizeof(float) * n.n_outputs);
    float *hidden_delta = (float *) malloc(sizeof(float) * n.n_hidden);
    // Calculate output delta
    for (i = 0; i < n.n_outputs; i++) {
        float error = p.result[i] - n.out_output[i];
        output_delta[i] = dsigmoid(n.out_output[i]) * error;
    }
    // Calculate hidden delta
    for (i = 0; i < n.n_hidden; i++) {
        float error = 0.0f;
        for (j = 0; j < n.n_outputs; j++) {
            error += output_delta[j] * n.w_hidden_output[i][j];
        }
        hidden_delta[i] = dsigmoid(n.out_hidden[i]) * error;
    }
    // Set hidden-output weights (learning rate 0.5, momentum 0.5 on the previous change)
    for (i = 0; i < n.n_hidden; i++) {
        for (j = 0; j < n.n_outputs; j++) {
            float change = output_delta[j] * n.out_hidden[i];
            n.w_hidden_output[i][j] += 0.5 * change + 0.5 * n.changes_hidden_output[i][j];
            n.changes_hidden_output[i][j] = change;
        }
    }
    // Set input-hidden weights
    for (i = 0; i < n.n_inputs; i++) {
        for (j = 0; j < n.n_hidden; j++) {
            float change = hidden_delta[j] * n.out_input[i];
            n.w_input_hidden[i][j] += 0.5 * change + 0.5 * n.changes_input_hidden[i][j];
            n.changes_input_hidden[i][j] = change;
        }
    }
    // Calculate error
    float error = 0.0f;
    for (i = 0; i < n.n_outputs; i++) {
        error = error + 0.5f * pow(p.result[i] - n.out_output[i], 2);
    }
    free(output_delta);
    free(hidden_delta);
    return error;
}
void train_network(Pattern *patterns, int n_patterns, int n_iterations, NeuralNet nn) {
    int i, j;
    for (i = 0; i < n_iterations; i++) {
        float error = 0;
        for (j = 0; j < n_patterns; j++) {
            update_pattern(patterns[j], nn);
            error += back_propagate_network(patterns[j], nn);
        }
        if (i % 100 == 0) {
            printf("Error is: %-.5f\n", error);
        }
    }
}
Pattern makePatternSingleOutput(int *data, int result) {
    Pattern p;
    p.data = data;
    p.result = (int *)malloc(sizeof(int));
    p.result[0] = result;
    return p;
}
int main() {
    srand((unsigned)time(NULL));
    int n_inputs = 2;
    int n_hidden = 4;
    int n_outputs = 1;
    // Build the network
    NeuralNet nn = buildNeuralNet(n_inputs, n_outputs, n_hidden);
    // Build training samples
    int _p1[] = {0, 0};
    Pattern p1 = makePatternSingleOutput(_p1, 0);
    int _p2[] = {0, 1};
    Pattern p2 = makePatternSingleOutput(_p2, 1);
    int _p3[] = {1, 1};
    Pattern p3 = makePatternSingleOutput(_p3, 1);
    int _p4[] = {1, 0};
    Pattern p4 = makePatternSingleOutput(_p4, 1);
    Pattern patterns[] = {p1, p2, p3, p4};
    // Train the network
    train_network(patterns, 4, 100, nn);
    printf("\n\nTesting the network\n");
    update_pattern(p1, nn);
    for (int i = 0; i < nn.n_outputs; i++) {
        printf("Output: %f, expected: %i\n", nn.out_output[i], p1.result[i]);
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
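/*
 * Host-only version of the same network: a multilayer perceptron with tanh
 * activations trained by backpropagation. The NOTE comments below mark the
 * parts that are intended to move onto the GPU.
 */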
typedef struct {
    int n_inputs;
    int n_hidden;
    int n_outputs;
    float *out_input;
    float *out_hidden;
    float *out_output;
    float **changes_input_hidden;
    float **changes_hidden_output;
    float **w_input_hidden;
    float **w_hidden_output;
} NeuralNet;

typedef struct {
    int *result;
    int *data;
} Pattern;
void buildLayer(float *arr, int n, float initial) {
    int i = 0;
    while (i < n) {
        *arr = initial;
        arr++;
        i++;
    }
}

float** buildWeightsLayer(int outer_n, int inner_n, float seed) {
    int i;
    int total = outer_n * inner_n;
    float *data = malloc(sizeof(float) * total);
    for (i = 0; i < total; i++) {
        if (seed == -1) {
            data[i] = (float)rand() / (float)RAND_MAX;
        } else {
            data[i] = seed;
        }
    }
    float **ptr_arr = malloc(sizeof(float*) * outer_n);
    for (i = 0; i < outer_n; i++) {
        ptr_arr[i] = data + (i * inner_n);
    }
    return ptr_arr;
}
NeuralNet buildNeuralNet(int n_inputs, int n_outputs, int n_hidden) {
    float *out_input = malloc(sizeof(float) * (n_inputs + 1));
    float *out_hidden = malloc(sizeof(float) * n_hidden);
    float *out_output = malloc(sizeof(float) * n_outputs);
    // The +1 here is related to the bias node
    buildLayer(out_input, n_inputs + 1, 1.0f);
    buildLayer(out_hidden, n_hidden, 1.0f);
    buildLayer(out_output, n_outputs, 1.0f);
    /*
     * NOTE: Currently, our change matrix is an array of arrays. In order to make
     * the best use of the CUDA programming model, I will convert this into a
     * one-dimensional array. That way, we can apply tiling algorithms to
     * parallelize the matrix multiplication in a scalable way.
     */
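    /*
     * For illustration: with a flattened layout, element (i, j) of an
     * outer_n x inner_n matrix is addressed as w[i * inner_n + j].
     * buildWeightsLayer already keeps its values in one contiguous block and
     * only adds an array of row pointers on top, so the conversion would
     * mostly amount to dropping that pointer array.
     */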
    // Build changes layer (momentum) and weight matrix
    float **changes_input_hidden = buildWeightsLayer(n_inputs + 1, n_hidden, 0.0f);
    float **changes_hidden_output = buildWeightsLayer(n_hidden, n_outputs, 0.0f);
    float **w_input_hidden = buildWeightsLayer(n_inputs + 1, n_hidden, -1.0f);
    float **w_hidden_output = buildWeightsLayer(n_hidden, n_outputs, -1.0f);
    NeuralNet nn;
    nn.n_inputs = n_inputs + 1;
    nn.n_outputs = n_outputs;
    nn.n_hidden = n_hidden;
    nn.out_input = out_input;
    nn.out_hidden = out_hidden;
    nn.out_output = out_output;
    nn.changes_input_hidden = changes_input_hidden;
    nn.changes_hidden_output = changes_hidden_output;
    nn.w_input_hidden = w_input_hidden;
    nn.w_hidden_output = w_hidden_output;
    return nn;
}
float *update_pattern(Pattern pattern, NeuralNet nn) {
    /*
     * NOTE: This is where CUDA will do the most work. As the MLP activation
     * is tanh(x * w) and each node in a layer can be updated independently,
     * we can treat x * w as a vector-matrix multiplication, where x is a
     * vector of all the inputs and w is a matrix containing the weights of
     * all the nodes.
     */
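    /*
     * Concretely, for the hidden layer the loop below computes
     *     out_hidden[i] = tanh( sum_j out_input[j] * w_input_hidden[j][i] )
     * i.e. the input row vector times the input-hidden weight matrix,
     * followed by an element-wise tanh.
     */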
    // Write inputs
    int i, j;
    float weighted_sum;
    for (i = 0; i < nn.n_inputs - 1; i++) {
        nn.out_input[i] = pattern.data[i];
    }
    // Write hidden
    for (i = 0; i < nn.n_hidden; i++) {
        weighted_sum = 0.0f;
        for (j = 0; j < nn.n_inputs; j++) {
            weighted_sum += nn.out_input[j] * nn.w_input_hidden[j][i];
        }
        nn.out_hidden[i] = tanh(weighted_sum);
    }
    // Write output
    for (i = 0; i < nn.n_outputs; i++) {
        weighted_sum = 0.0f;
        for (j = 0; j < nn.n_hidden; j++) {
            weighted_sum += nn.out_hidden[j] * nn.w_hidden_output[j][i];
        }
        nn.out_output[i] = tanh(weighted_sum);
    }
    return nn.out_output;
}
float dsigmoid(float y) {
    return 1.0 - pow(y, 2.0);
}
float back_propagate_network(Pattern p, NeuralNet n) {
    /*
     * This is the backpropagation step, where the error is calculated and
     * propagated back through the network in order to adjust the weights
     * between neurons.
     * NOTE: This section will also be parallelised. Unfortunately, the hidden
     * delta needs to be calculated after the output delta, so we can only
     * parallelize part of the process (this is what I think currently, I
     * might be wrong!).
     */
    int i, j;
    float output_delta[n.n_outputs];
    float hidden_delta[n.n_hidden];
    // Calculate output delta
    for (i = 0; i < n.n_outputs; i++) {
        float error = p.result[i] - n.out_output[i];
        output_delta[i] = dsigmoid(n.out_output[i]) * error;
    }
    // Calculate hidden delta
    for (i = 0; i < n.n_hidden; i++) {
        float error = 0.0f;
        for (j = 0; j < n.n_outputs; j++) {
            error += output_delta[j] * n.w_hidden_output[i][j];
        }
        hidden_delta[i] = dsigmoid(n.out_hidden[i]) * error;
    }
    /*
     * NOTE: Once the deltas have been calculated, we can update ALL the weights
     * at once. This section fits perfectly with the CUDA programming model of
     * grids and blocks.
     */
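    /*
     * One possible mapping (a sketch, not implemented here): launch one thread
     * per weight, so that thread (i, j) of a 2-D block performs
     *     change = delta[j] * out[i];
     *     w[i][j] += 0.5f * change + 0.5f * prev_change[i][j];
     *     prev_change[i][j] = change;
     * Every (i, j) update only reads the already-computed deltas, so the
     * threads are independent of each other.
     */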
    // Set hidden-output weights
    for (i = 0; i < n.n_hidden; i++) {
        for (j = 0; j < n.n_outputs; j++) {
            float change = output_delta[j] * n.out_hidden[i];
            n.w_hidden_output[i][j] += 0.5 * change + 0.5 * n.changes_hidden_output[i][j];
            n.changes_hidden_output[i][j] = change;
        }
    }
    // Set input-hidden weights
    for (i = 0; i < n.n_inputs; i++) {
        for (j = 0; j < n.n_hidden; j++) {
            float change = hidden_delta[j] * n.out_input[i];
            n.w_input_hidden[i][j] += 0.5 * change + 0.5 * n.changes_input_hidden[i][j];
            n.changes_input_hidden[i][j] = change;
        }
    }
    // Calculate error
    float error = 0.0f;
    for (i = 0; i < n.n_outputs; i++) {
        error = error + 0.5f * pow(p.result[i] - n.out_output[i], 2);
    }
    return error;
}
void train_network(Pattern *patterns, int n_patterns, int n_iterations, NeuralNet nn) {
    int i, j;
    for (i = 0; i < n_iterations; i++) {
        float error = 0;
        for (j = 0; j < n_patterns; j++) {
            update_pattern(patterns[j], nn);
            error += back_propagate_network(patterns[j], nn);
        }
        if (i % 100 == 0) {
            printf("Error is: %-.5f\n", error);
        }
    }
}
Pattern makePatternSingleOutput(int *data, int result) {
    /* NOTE: This is a domain-specific helper; here it is used to build the
     * patterns for training the network on XOR. That said, the network itself
     * generalizes to different problems, with different numbers of inputs
     * and outputs.
     */
    Pattern p;
    p.data = data;
    p.result = malloc(sizeof(int));
    p.result[0] = result;
    return p;
}
int main() {
    srand((unsigned)time(NULL));
    int n_inputs = 2;
    int n_hidden = 4;
    int n_outputs = 1;
    // Build the network
    NeuralNet nn = buildNeuralNet(n_inputs, n_outputs, n_hidden);
    // Build training samples
    int _p1[] = {0, 0};
    Pattern p1 = makePatternSingleOutput(_p1, 0);
    int _p2[] = {0, 1};
    Pattern p2 = makePatternSingleOutput(_p2, 1);
    int _p3[] = {1, 1};
    Pattern p3 = makePatternSingleOutput(_p3, 1);
    int _p4[] = {1, 0};
    Pattern p4 = makePatternSingleOutput(_p4, 1);
    Pattern patterns[] = {p1, p2, p3, p4};
    // Train the network
    train_network(patterns, 4, 10000, nn);
    printf("\n\nTesting the network\n");
    update_pattern(p1, nn);
    int i;
    for (i = 0; i < nn.n_outputs; i++) {
        printf("Output: %f, expected: %i\n", nn.out_output[i], p1.result[i]);
    }
    return 0;
}