Created
June 23, 2013 23:33
-
-
Save usamadar/5846931 to your computer and use it in GitHub Desktop.
This code can demonstrate 1. Pinning the affinity 2. having NUMA local memory alloc (numa_alloc_local) 3. Also some other NUMA functions like bitmask If you want to compile it, you need the libnuma gcc numatest1.c -lnuma -lpthread -lrt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _GNU_SOURCE | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include <errno.h> | |
#include <asm/ioctl.h> | |
#include <sys/mman.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include <numaif.h> | |
#include <numa.h> | |
#include <sched.h> | |
#include <time.h> | |
#include <pthread.h> | |
/* Used as argument to thread_start() */ | |
typedef struct thread_info_s | |
{ | |
void **x; | |
size_t core; | |
size_t N; | |
size_t M; | |
size_t B; | |
size_t T; | |
} thread_info; | |
#define READ_BLOCK(block, size)\ | |
do{\ | |
int var, idx;\ | |
var = idx = 0;\ | |
while((idx + sizeof(int)) <= size)\ | |
{\ | |
var = (*((int *)(block + idx)))++;\ | |
idx += sizeof(int);\ | |
}\ | |
}while(0); | |
#define WRITE_BLOCK(block, size)\ | |
do{\ | |
int var, idx;\ | |
idx = 0;\ | |
var = 0xABABABAB;\ | |
while((idx + sizeof(int)) <= size)\ | |
{\ | |
*((int *)(block + idx)) = var;\ | |
idx += sizeof(int);\ | |
}\ | |
}while(0); | |
/* Utility functions */ | |
void pin_to_core(size_t core) | |
{ | |
cpu_set_t cpuset; | |
CPU_ZERO(&cpuset); | |
CPU_SET(core, &cpuset); | |
//pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); | |
sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); | |
} | |
struct timespec timespec_diff(struct timespec start, struct timespec end) | |
{ | |
struct timespec temp; | |
if ((end.tv_nsec-start.tv_nsec)<0) { | |
temp.tv_sec = end.tv_sec-start.tv_sec-1; | |
temp.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec; | |
} else { | |
temp.tv_sec = end.tv_sec-start.tv_sec; | |
temp.tv_nsec = end.tv_nsec-start.tv_nsec; | |
} | |
return temp; | |
} | |
int64_t convert_timespec_to_ns(struct timespec *a ) | |
{ | |
const int64_t a_ns = (a->tv_sec * (int64_t)1000000000) + a->tv_nsec; | |
return a_ns; | |
} | |
/*Print Functions */ | |
void print_numa_bitmask(struct bitmask *bm) | |
{ | |
for(size_t i=0;i<bm->size;++i) | |
{ | |
printf("%d ", numa_bitmask_isbitset(bm, i)); | |
} | |
} | |
void print_timespec(struct timespec *t) | |
{ | |
/*Print s:ms:us.ns*/ | |
printf("%lds:%ldms:%ldus.%ldns",t->tv_sec, t->tv_nsec/(1000*1000), | |
(t->tv_nsec%(1000*1000))/1000, t->tv_nsec%1000); | |
} | |
/*This thread allocates a piece of memory and pins it to a given core*/ | |
void* thread1(void *arg) | |
{ | |
thread_info *t = (thread_info *) arg; | |
struct timespec c1, c2, diff; | |
size_t N = t->N, M = t->M, core = t->core, T = t->T, B = t->B; | |
pin_to_core(core); | |
void* y = numa_alloc_local(N); | |
//Reduce N by block size to ensure we do not cross the mem bdry | |
N = N - (B-1); | |
clock_gettime(CLOCK_REALTIME, &c1); | |
char c; | |
for (size_t i = 0;i<M;++i) | |
//for(size_t j = 0;j<N;++j) | |
for(size_t j = 0;j<T;++j) | |
{ | |
#if 1 | |
#ifndef SEQ | |
//*(((int*)y) + ((j * 1009) % N)) += 1; | |
*(int *)(y + ((j * 1009) % N)) += 1; | |
#else | |
*(((char*)y) + j) += 1; | |
#endif | |
#else | |
READ_BLOCK((((char*)y) + ((j * 1009) % N)), B); | |
#endif | |
} | |
clock_gettime(CLOCK_REALTIME, &c2); | |
diff = timespec_diff(c1, c2); | |
printf("Elapsed read/write by same thread that allocated on core %ld : ", | |
core); | |
print_timespec(&diff); | |
printf("\n"); | |
*(t->x) = y; | |
} | |
/*This thread accesses a memory already allocated by one core (and pinned). | |
The access is simple a "+=", which is a single instruction read and a single | |
instruction write */ | |
void* thread2(void *arg) | |
{ | |
thread_info *t = (thread_info *) arg; | |
struct timespec c1, c2, diff; | |
size_t N = t->N, M = t->M, core = t->core, T = t->T, B = t->B; | |
void *x = *(t->x); | |
double access_time; | |
N = N - (B-1); | |
pin_to_core(core); | |
clock_gettime(CLOCK_REALTIME, &c1); | |
char c; | |
/*Memory access loop. 1 byte of memory is read & written to. | |
Try the access "N" times and iterate it "M" times | |
*/ | |
for (size_t i = 0;i<M;++i) | |
//for(size_t j = 0;j<N/10;++j) | |
for(size_t j = 0;j<T;++j) | |
{ | |
#if 1 | |
#ifndef SEQ | |
//*(int *)(x + ((j * 1009) % N)) += 1; | |
//READ_BLOCK((x + ((j * 1009) % N)), B); | |
READ_BLOCK((x + ((i+1)* rand()) % N), B); | |
#else | |
*(((char*)x) + j) += 1; | |
#endif | |
#else | |
READ_BLOCK((((char*)x) + ((j * 1009) % N)), B); | |
#endif | |
} | |
clock_gettime(CLOCK_REALTIME, &c2); | |
diff = timespec_diff(c1, c2); | |
/*printf("Elapsed read/write by thread on core %ld : ", core); | |
print_timespec(&diff); | |
printf("\n");*/ | |
//access_time = convert_timespec_to_ns(&diff) / (T*M); | |
access_time = convert_timespec_to_ns(&diff) / (M)/ 1000 /1000; | |
printf("Access Performance by thread on core %ld : %fms\n", core,access_time); | |
} | |
int main(int argc, const char **argv) | |
{ | |
void* x; | |
pthread_attr_t attr; | |
pthread_t t1, t2; | |
//Allocate N units of memory, iterate M tumes and each time read T blocks | |
size_t N = 10000000, M = 5, T = 10000, B; | |
thread_info *tinfo; | |
int num_threads = 1; | |
N = atoi(argv[1]); | |
B = atoi(argv[2]); | |
T = atoi(argv[3]); | |
printf("%d\n", N); | |
//int numcpus = numa_num_task_cpus(); | |
int numcpus = numa_num_configured_cpus(); | |
printf("numa_available() %d\n", numa_available()); | |
numa_set_localalloc(); | |
struct bitmask* bm = numa_bitmask_alloc(numcpus); | |
for (int i=0;i<=numa_max_node();++i) | |
{ | |
numa_node_to_cpus(i, bm); | |
printf("numa node %d ",i); | |
print_numa_bitmask(bm); | |
printf("%ld\n",numa_node_size(i, 0)); | |
} | |
numa_bitmask_free(bm); | |
pthread_attr_init(&attr); | |
tinfo = malloc(num_threads * sizeof(thread_info)); | |
tinfo[0].x = &x; | |
tinfo[0].core = 0; | |
tinfo[0].N = N; | |
tinfo[0].M = M; | |
tinfo[0].B = B; | |
tinfo[0].T = T; | |
pthread_create(&t1, &attr, thread1, &tinfo[0]); | |
pthread_join(t1, NULL); | |
for (size_t i = 0;i<numcpus;++i) | |
{ | |
tinfo[0].core = i; | |
pthread_create(&t2, &attr, thread2, &tinfo[0]); | |
pthread_join(t2, NULL); | |
} | |
numa_free(x, N); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment