Created
January 24, 2013 16:04
-
-
Save 5kg/4623909 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
操作系统编程中,程序的性能至关重要。现代计算机体系结构中,CPU缓存对性能起着至关重要的影响。本系列文章通过实验,说明一些合理利用CPU缓存的建议。 | |
== 保持数据的局部性 | |
这一次,依然在第一篇的基础上修改, `foo` 完全不变,而仅仅是把 `bar()` ,改为交错的读取内存。代码如下: | |
[source, c] | |
---- | |
#include <time.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <pthread.h> | |
#include <string.h> | |
#include <assert.h> | |
#define tic() do { struct timespec ts_start, ts_end; \ | |
clock_gettime(CLOCK_MONOTONIC, &ts_start) | |
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \ | |
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + \ | |
(double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \ | |
while (0) | |
#define test(func) memset(s, 0, sizeof(s)); \ | |
ans = 0; \ | |
tic(); \ | |
for (int i = 0; i < NUM_THREADS; ++i) \ | |
pthread_create(threads + i, NULL, func, s + i); \ | |
for (int i = 0; i < NUM_THREADS; ++i) { \ | |
pthread_join(threads[i], NULL); \ | |
ans += s[i]; } \ | |
toc(); \ | |
assert(ans == ANS); | |
#define NUM_THREADS 4 | |
#define N 536870912 | |
void* foo(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
for (int i = (r - s); i < N; i += NUM_THREADS) | |
*r += num[i]; | |
return NULL; | |
} | |
void* bar(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
int idx = r - s; | |
int block = N/NUM_THREADS; | |
int start = idx * block, end = start + block; | |
for (int j = 0; j < NUM_THREADS; ++j) | |
for (int i = start+j; i < end; i+=NUM_THREADS) | |
*r += num[i]; | |
return NULL; | |
} | |
int* num; | |
int s[NUM_THREADS]; | |
int main() | |
{ | |
pthread_t threads[NUM_THREADS]; | |
int ans, ANS = 0; | |
num = (int*) malloc(sizeof(int) * N); | |
srand(42); | |
for (int i = 0; i < N; ++i) { | |
num[i] = rand() % 100; | |
ANS += num[i]; | |
} | |
test(foo); | |
test(bar); | |
return 0; | |
} | |
---- | |
我们应当预期这时的 `foo()` `bar()` 速度大致相同,因为他们均为交错读取,缓存未命中率应该大致相同。 | |
而测试结果如下: | |
---- | |
$ gcc ping-pong.c -std=gnu99 -lpthread -O2 ; ./a.out | |
1.171061s | |
0.406542s | |
$ gcc ping-pong1.c -std=gnu99 -lpthread -O2 ; ./a.out | |
1.107319s | |
1.496597s | |
---- | |
奇怪的是 `bar()` 这次大大慢过了 `foo()` 。而两者的缓存未命中数是一样的: | |
---- | |
$ valgrind --tool=cachegrind --cachegrind-out-file=profile ./a.out | |
.... | |
$ cg_annotate profile --auto=yes --show=D1mr --context=1 | |
.... | |
-- line 51 ---------------------------------------- | |
. void* foo(void* ptr) | |
0 { | |
0 int* r = (int*) ptr; | |
. | |
0 for (int i = (r - s); i < N; i += NUM_THREADS) | |
16,388 *r += num[i]; | |
0 return NULL; | |
0 } | |
. | |
-- line 59 ---------------------------------------- | |
-- line 60 ---------------------------------------- | |
. void* bar(void* ptr) | |
0 { | |
0 int* r = (int*) ptr; | |
0 int idx = r - s; | |
0 int block = N/NUM_THREADS; | |
0 int start = idx * block, end = start + block; | |
. | |
0 for (int j = 0; j < NUM_THREADS; ++j) | |
0 for (int i = start+j; i < end; i+=NUM_THREADS) | |
16,399 *r += num[i]; | |
0 return NULL; | |
0 } | |
---- | |
这又作何解释呢?我认为区别在于多个核心共享的L3缓存的命中率(由于没有合适的profile工具,(Intel的profiler应该可以看L3的命中率),以下仅为猜想)。 | |
image:cpu.jpg[] | |
`foo()` 由于交错的遍历整块内存,在每个线程进度大体一致的前提下,各个线程在任意时刻应该访问相近的内存区域。由于现在CPU的第三级缓存是由多个核共享的,这样便能提高L3缓存的命中率。`bar()` 则将数组分块,各个线程会互相把各自的数据踢出L3缓存,这便导致了性能的损失。 | |
这样由于共享缓存的存在,编程中应该把进程间的数据局部性也考虑进来。Linux内核开发者,也做了一些相关的尝试,如展示时所提到引发PostgresSQL Bug的,将线程尽可能在同一CPU的不同核心调度等思路。 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <time.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <pthread.h> | |
#include <string.h> | |
#include <assert.h> | |
#define tic() do { struct timespec ts_start, ts_end; clock_gettime(CLOCK_MONOTONIC, &ts_start) | |
#define toc() clock_gettime(CLOCK_MONOTONIC, &ts_end); \ | |
printf("%lfs\n", (ts_end.tv_sec - ts_start.tv_sec) + (double)(ts_end.tv_nsec - ts_start.tv_nsec)/1e9); } \ | |
while (0) | |
#define test(func) memset(s, 0, sizeof(s)); \ | |
ans = 0; \ | |
tic(); \ | |
for (int i = 0; i < NUM_THREADS; ++i) \ | |
pthread_create(threads + i, NULL, func, s + i); \ | |
for (int i = 0; i < NUM_THREADS; ++i) { \ | |
pthread_join(threads[i], NULL); \ | |
ans += s[i]; } \ | |
toc(); \ | |
assert(ans == ANS); | |
#define NUM_THREADS 4 | |
#define N 65536 | |
void* foo(void *); | |
void* bar(void *); | |
int* num; | |
int s[NUM_THREADS]; | |
int main() | |
{ | |
pthread_t threads[NUM_THREADS]; | |
int ans, ANS = 0; | |
num = (int*) malloc(sizeof(int) * N); | |
srand(42); | |
for (int i = 0; i < N; ++i) { | |
num[i] = rand() % 100; | |
ANS += num[i]; | |
} | |
test(foo); | |
test(bar); | |
return 0; | |
} | |
void* foo(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
for (int i = (r - s); i < N; i += NUM_THREADS) | |
*r += num[i]; | |
return NULL; | |
} | |
void* bar(void* ptr) | |
{ | |
int* r = (int*) ptr; | |
int idx = r - s; | |
int block = N/NUM_THREADS; | |
int start = idx * block, end = start + block; | |
for (int j = 0; j < NUM_THREADS; ++j) | |
for (int i = start+j; i < end; i+=NUM_THREADS) | |
*r += num[i]; | |
return NULL; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment