Created
October 3, 2012 03:49
-
-
Save jiewmeng/3824894 to your computer and use it in GitHub Desktop.
Matrix Multiplication Optimization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <time.h> | |
#include <sys/time.h> | |
#include <assert.h> | |
#include <algorithm> | |
typedef struct | |
{ | |
int size; | |
float ** element; | |
} matrix; | |
/** | |
* Returns the number of nanoseconds since the beginning of the program | |
* Requires linking with librt (-lrt flag on GCC) | |
**/ | |
long long wall_clock_time() | |
{ | |
#ifdef __linux__ | |
struct timespec tp; | |
clock_gettime(CLOCK_REALTIME, &tp); | |
return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll); | |
#else | |
#warning "Your timer resoultion might be too low. Compile on Linux and link with librt" | |
struct timeval tv; | |
gettimeofday(&tv, NULL); | |
return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll); | |
#endif | |
} | |
/** | |
* Allocates memory for a matrix of size SIZE | |
* The memory is allocated row-major order, i.e. | |
* elements from the same row are allocated at contiguous | |
* memory addresses. | |
**/ | |
void allocate_matrix(matrix* m, int size) { | |
int i, j; | |
m->size = size; | |
// allocate array for all the rows | |
m->element = (float**)malloc(sizeof(float*) * size); | |
if (m->element == NULL) { | |
fprintf(stderr, "Out of memory\n"); | |
exit(1); | |
} | |
// allocate an array for each row of the matrix | |
for (i = 0; i < size; i++) { | |
m->element[i] = (float*)malloc(sizeof(float) * size); | |
if (m->element[i] == NULL) | |
{ | |
fprintf(stderr, "Out of memory\n"); | |
exit(1); | |
} | |
} | |
} | |
/** | |
* Initializes the elements of the matrix with | |
* random values between 0 and 9 | |
**/ | |
void init_matrix(matrix m) | |
{ | |
int i, j; | |
int size = m.size; | |
for (i = 0; i < size; i++) | |
for (j = 0; j < size; j++) | |
{ | |
m.element[i][j] = rand() % 10; | |
} | |
} | |
/** | |
* Initializes the elements of the matrix with | |
* element 0. | |
**/ | |
void init_matrix_zero(matrix m) | |
{ | |
int i, j; | |
int size = m.size; | |
for (i = 0; i < size; i++) | |
for (j = 0; j < size; j++) | |
{ | |
m.element[i][j] = 0.0; | |
} | |
} | |
/** | |
* Frees the memory of matrix @m | |
**/ | |
void free_matrix(matrix m) | |
{ | |
int i; | |
int size = m.size; | |
for (i = 0; i < size; i++) | |
free(m.element[i]); | |
free(m.element); | |
} | |
void transpose(int size, matrix m) { | |
int i, j; | |
for (i = 0; i < size; i++) | |
for (j = 0; j < size; j++) | |
std::swap(m.element[i][j], m.element[j][i]); | |
} | |
/** | |
* Multiplies matrix @a with matrix @b storing | |
* the result in matrix @result | |
* | |
* The multiplication algorithm is the O(n^3) | |
* algorithm | |
*/ | |
void mm(matrix a, matrix b, matrix result) { | |
int i, j, k; | |
int size = a.size; | |
long long before, after; | |
before = wall_clock_time(); | |
// Do the multiplication | |
transpose(size, b); // transpose the matrix to reduce cache miss | |
for (i = 0; i < size; i++) | |
for (j = 0; j < size; j++) { | |
int tmp = 0; // save memory writes | |
for(k = 0; k < size; k++) | |
tmp += a.element[i][k] * b.element[j][k]; | |
result.element[i][j] = tmp; | |
} | |
//transpose(size, b); | |
after = wall_clock_time(); | |
fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(after - before))/1000000000); | |
} | |
void print_matrix(matrix m) | |
{ | |
int i, j; | |
int size = m.size; | |
for (i = 0; i < size; i++) | |
{ | |
printf("row =%4d: ", i); | |
for (j = 0; j < size; j++) | |
printf("%6.2f ", m.element[i][j]); | |
printf("\n"); | |
} | |
} | |
void work(int size) | |
{ | |
matrix a, b, result; | |
// Allocate memory for matrices | |
allocate_matrix(&a, size); | |
allocate_matrix(&b, size); | |
allocate_matrix(&result, size); | |
// Initialize matrix elements | |
init_matrix(a); | |
init_matrix(b); | |
init_matrix_zero(result); | |
// Perform sequential matrix multiplication | |
mm(a, b, result); | |
// Print the result matrix | |
//print_matrix(result); | |
free_matrix(a); | |
free_matrix(b); | |
free_matrix(result); | |
} | |
int main(int argc, char ** argv) | |
{ | |
int size; | |
srand(0); | |
if (argc >= 2) | |
size = atoi(argv[1]); | |
else | |
size = 1024; | |
fprintf(stderr,"Sequential matrix multiplication of size %d\n", size); | |
// Multiply the matrices | |
work(size); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define SIZE 1024 | |
#include <iostream> | |
#include <string> | |
#include <cstdio> | |
#include <stdlib.h> | |
using namespace std; | |
long long wall_clock_time(); | |
void zero_matrix(float m[SIZE][SIZE]); | |
void transpose_matrix(float m[SIZE][SIZE]); | |
void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]); | |
void print_matrix(float m[SIZE][SIZE]); | |
void random_matrix(float m[SIZE][SIZE]); | |
void zero_matrix(float m[SIZE][SIZE]) { | |
for (int i = 0; i < SIZE; i++) | |
for (int j = 0; j < SIZE; j++) | |
m[i][j] = 0; | |
} | |
void transpose_matrix(float m[SIZE][SIZE]) { | |
for (int i = 0; i < SIZE; i++) | |
for (int j = i + 1; j < SIZE; j++) | |
swap(m[i][j], m[j][i]); | |
} | |
void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]) { | |
long long start, end; | |
start = wall_clock_time(); | |
transpose_matrix(m2); | |
for (int i = 0; i < SIZE; i++) { | |
for (int j = 0; j < SIZE; j++) { | |
float tmp = 0; | |
for (int k = 0; k < SIZE; k++) | |
tmp += m1[i][k] * m2[j][k]; | |
result[i][j] = tmp; | |
} | |
} | |
end = wall_clock_time(); | |
fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(end - start))/1000000000); | |
} | |
void print_matrix(float m[SIZE][SIZE]) { | |
for (int i = 0; i < SIZE; i++) { | |
for (int j = 0; j < SIZE; j++) { | |
cout << m[i][j] << " "; | |
} | |
cout << endl; | |
} | |
} | |
void random_matrix(float m[SIZE][SIZE]) { | |
for (int i = 0; i < SIZE; i++) | |
for (int j = 0; j < SIZE; j++) | |
m[i][j] = rand() % 10; | |
} | |
int main() { | |
// init matrixes | |
static float m1[SIZE][SIZE]; | |
static float m2[SIZE][SIZE]; | |
static float result[SIZE][SIZE]; | |
random_matrix(m1); | |
random_matrix(m2); | |
zero_matrix(result); | |
// do matrix multiply | |
matrix_multiply(m1, m2, result); | |
// print result | |
print_matrix(result); | |
// cleanup | |
return 0; | |
} | |
/******************************************** | |
* Helpers | |
*******************************************/ | |
long long wall_clock_time() { | |
#ifdef __linux__ | |
struct timespec tp; | |
clock_gettime(CLOCK_REALTIME, &tp); | |
return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll); | |
#else | |
#warning "Your timer resoultion might be too low. Compile on Linux and link with librt" | |
struct timeval tv; | |
gettimeofday(&tv, NULL); | |
return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll); | |
#endif | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment