Last active
August 13, 2018 10:06
-
-
Save mikmart/fb191894f29af03f69d3a0705ef1ee48 to your computer and use it in GitHub Desktop.
Efficiency of rowSums() vs colSums() in the R C API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
matsums.o: file format pe-x86-64 | |
Disassembly of section .text: | |
0000000000000000 <C_rowSums>: | |
0: 41 54 push r12 | |
2: 55 push rbp | |
3: 57 push rdi | |
4: 56 push rsi | |
5: 53 push rbx | |
6: 48 83 ec 20 sub rsp,0x20 | |
a: 48 8b 05 00 00 00 00 mov rax,QWORD PTR [rip+0x0] # 11 <C_rowSums+0x11> | |
11: 49 89 cc mov r12,rcx | |
14: 48 8b 10 mov rdx,QWORD PTR [rax] | |
17: e8 00 00 00 00 call 1c <C_rowSums+0x1c> | |
1c: 48 89 c1 mov rcx,rax | |
1f: e8 00 00 00 00 call 24 <C_rowSums+0x24> | |
24: 48 89 c1 mov rcx,rax | |
27: 48 89 c3 mov rbx,rax | |
2a: e8 00 00 00 00 call 2f <C_rowSums+0x2f> | |
2f: 48 63 30 movsxd rsi,DWORD PTR [rax] | |
32: 48 89 d9 mov rcx,rbx | |
35: e8 00 00 00 00 call 3a <C_rowSums+0x3a> | |
3a: b9 01 00 00 00 mov ecx,0x1 | |
3f: 48 63 78 04 movsxd rdi,DWORD PTR [rax+0x4] | |
43: e8 00 00 00 00 call 48 <C_rowSums+0x48> | |
48: 48 89 f2 mov rdx,rsi | |
4b: b9 0e 00 00 00 mov ecx,0xe | |
50: e8 00 00 00 00 call 55 <C_rowSums+0x55> | |
55: 48 89 c1 mov rcx,rax | |
58: e8 00 00 00 00 call 5d <C_rowSums+0x5d> | |
5d: 48 89 c1 mov rcx,rax | |
60: 48 89 c5 mov rbp,rax | |
63: e8 00 00 00 00 call 68 <C_rowSums+0x68> | |
68: 4c 89 e1 mov rcx,r12 | |
6b: 48 89 c3 mov rbx,rax | |
6e: e8 00 00 00 00 call 73 <C_rowSums+0x73> | |
73: 45 31 c0 xor r8d,r8d | |
76: 48 85 f6 test rsi,rsi | |
79: 48 89 c2 mov rdx,rax | |
7c: 7e 13 jle 91 <C_rowSums+0x91> | |
7e: 66 90 xchg ax,ax | |
80: 4a c7 04 c3 00 00 00 mov QWORD PTR [rbx+r8*8],0x0 | |
87: 00 | |
88: 49 83 c0 01 add r8,0x1 | |
8c: 4c 39 c6 cmp rsi,r8 | |
8f: 75 ef jne 80 <C_rowSums+0x80> | |
91: 45 31 c9 xor r9d,r9d | |
94: 48 85 ff test rdi,rdi | |
97: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] | |
9e: 00 | |
9f: 7e 36 jle d7 <C_rowSums+0xd7> | |
a1: 45 31 c0 xor r8d,r8d | |
a4: 48 85 f6 test rsi,rsi | |
a7: 7e 25 jle ce <C_rowSums+0xce> | |
a9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] | |
b0: f2 42 0f 10 04 c3 movsd xmm0,QWORD PTR [rbx+r8*8] | |
b6: f2 42 0f 58 04 c2 addsd xmm0,QWORD PTR [rdx+r8*8] | |
bc: f2 42 0f 11 04 c3 movsd QWORD PTR [rbx+r8*8],xmm0 | |
c2: 49 83 c0 01 add r8,0x1 | |
c6: 4c 39 c6 cmp rsi,r8 | |
c9: 75 e5 jne b0 <C_rowSums+0xb0> | |
cb: 48 01 c2 add rdx,rax | |
ce: 49 83 c1 01 add r9,0x1 | |
d2: 4c 39 cf cmp rdi,r9 | |
d5: 75 ca jne a1 <C_rowSums+0xa1> | |
d7: b9 01 00 00 00 mov ecx,0x1 | |
dc: e8 00 00 00 00 call e1 <C_rowSums+0xe1> | |
e1: 48 89 e8 mov rax,rbp | |
e4: 48 83 c4 20 add rsp,0x20 | |
e8: 5b pop rbx | |
e9: 5e pop rsi | |
ea: 5f pop rdi | |
eb: 5d pop rbp | |
ec: 41 5c pop r12 | |
ee: c3 ret | |
ef: 90 nop | |
00000000000000f0 <C_colSums>: | |
f0: 41 54 push r12 | |
f2: 55 push rbp | |
f3: 57 push rdi | |
f4: 56 push rsi | |
f5: 53 push rbx | |
f6: 48 83 ec 20 sub rsp,0x20 | |
fa: 48 8b 05 00 00 00 00 mov rax,QWORD PTR [rip+0x0] # 101 <C_colSums+0x11> | |
101: 49 89 cc mov r12,rcx | |
104: 48 8b 10 mov rdx,QWORD PTR [rax] | |
107: e8 00 00 00 00 call 10c <C_colSums+0x1c> | |
10c: 48 89 c1 mov rcx,rax | |
10f: e8 00 00 00 00 call 114 <C_colSums+0x24> | |
114: 48 89 c6 mov rsi,rax | |
117: 48 89 c1 mov rcx,rax | |
11a: e8 00 00 00 00 call 11f <C_colSums+0x2f> | |
11f: 48 89 f1 mov rcx,rsi | |
122: 48 63 18 movsxd rbx,DWORD PTR [rax] | |
125: e8 00 00 00 00 call 12a <C_colSums+0x3a> | |
12a: 48 63 70 04 movsxd rsi,DWORD PTR [rax+0x4] | |
12e: b9 01 00 00 00 mov ecx,0x1 | |
133: e8 00 00 00 00 call 138 <C_colSums+0x48> | |
138: b9 0e 00 00 00 mov ecx,0xe | |
13d: 48 89 f2 mov rdx,rsi | |
140: e8 00 00 00 00 call 145 <C_colSums+0x55> | |
145: 48 89 c1 mov rcx,rax | |
148: e8 00 00 00 00 call 14d <C_colSums+0x5d> | |
14d: 48 89 c1 mov rcx,rax | |
150: 48 89 c5 mov rbp,rax | |
153: e8 00 00 00 00 call 158 <C_colSums+0x68> | |
158: 4c 89 e1 mov rcx,r12 | |
15b: 48 89 c7 mov rdi,rax | |
15e: e8 00 00 00 00 call 163 <C_colSums+0x73> | |
163: 48 85 f6 test rsi,rsi | |
166: 49 89 c1 mov r9,rax | |
169: 7e 56 jle 1c1 <C_colSums+0xd1> | |
16b: 45 31 c0 xor r8d,r8d | |
16e: 66 90 xchg ax,ax | |
170: 4a c7 04 c7 00 00 00 mov QWORD PTR [rdi+r8*8],0x0 | |
177: 00 | |
178: 49 83 c0 01 add r8,0x1 | |
17c: 4c 39 c6 cmp rsi,r8 | |
17f: 75 ef jne 170 <C_colSums+0x80> | |
181: 66 0f ef c9 pxor xmm1,xmm1 | |
185: 48 8d 04 dd 00 00 00 lea rax,[rbx*8+0x0] | |
18c: 00 | |
18d: 45 31 d2 xor r10d,r10d | |
190: 45 31 c0 xor r8d,r8d | |
193: 48 85 db test rbx,rbx | |
196: 66 0f 28 c1 movapd xmm0,xmm1 | |
19a: 7e 16 jle 1b2 <C_colSums+0xc2> | |
19c: 0f 1f 40 00 nop DWORD PTR [rax+0x0] | |
1a0: f2 43 0f 58 04 c1 addsd xmm0,QWORD PTR [r9+r8*8] | |
1a6: 49 83 c0 01 add r8,0x1 | |
1aa: 4c 39 c3 cmp rbx,r8 | |
1ad: 75 f1 jne 1a0 <C_colSums+0xb0> | |
1af: 49 01 c1 add r9,rax | |
1b2: f2 42 0f 11 04 d7 movsd QWORD PTR [rdi+r10*8],xmm0 | |
1b8: 49 83 c2 01 add r10,0x1 | |
1bc: 4c 39 d6 cmp rsi,r10 | |
1bf: 75 cf jne 190 <C_colSums+0xa0> | |
1c1: b9 01 00 00 00 mov ecx,0x1 | |
1c6: e8 00 00 00 00 call 1cb <C_colSums+0xdb> | |
1cb: 48 89 e8 mov rax,rbp | |
1ce: 48 83 c4 20 add rsp,0x20 | |
1d2: 5b pop rbx | |
1d3: 5e pop rsi | |
1d4: 5f pop rdi | |
1d5: 5d pop rbp | |
1d6: 41 5c pop r12 | |
1d8: c3 ret | |
1d9: 90 nop | |
1da: 90 nop | |
1db: 90 nop | |
1dc: 90 nop | |
1dd: 90 nop | |
1de: 90 nop | |
1df: 90 nop |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
R CMD SHLIB matsums.c | |
objdump -d -M intel -S matsums.o > matsums.asm |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <Rinternals.h> | |
SEXP C_rowSums(SEXP x) { | |
SEXP dim = PROTECT(Rf_getAttrib(x, R_DimSymbol)); | |
R_xlen_t n = INTEGER(dim)[0]; | |
R_xlen_t p = INTEGER(dim)[1]; | |
UNPROTECT(1); | |
SEXP ans = PROTECT(Rf_allocVector(REALSXP, n)); | |
double *ap = REAL(ans), *xp = REAL(x); | |
for (R_xlen_t i = 0; i < n; i++) | |
ap[i] = 0.0; | |
for (R_xlen_t j = 0; j < p; j++) { | |
for (R_xlen_t i = 0; i < n; i++) { | |
ap[i] += *xp++; | |
} | |
} | |
UNPROTECT(1); | |
return ans; | |
} | |
SEXP C_colSums(SEXP x) { | |
SEXP dim = PROTECT(Rf_getAttrib(x, R_DimSymbol)); | |
R_xlen_t n = INTEGER(dim)[0]; | |
R_xlen_t p = INTEGER(dim)[1]; | |
UNPROTECT(1); | |
SEXP ans = PROTECT(Rf_allocVector(REALSXP, p)); | |
double *ap = REAL(ans), *xp = REAL(x); | |
for (R_xlen_t j = 0; j < p; j++) | |
ap[j] = 0.0; // just for similarity with rowSums | |
for (R_xlen_t j = 0; j < p; j++) { | |
double sum = 0.0; // R impl. uses long double | |
for (R_xlen_t i = 0; i < n; i++) { | |
sum += *xp++; | |
} | |
ap[j] = sum; | |
} | |
UNPROTECT(1); | |
return ans; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(microbenchmark) | |
C_colSums <- function(x) .Call("C_colSums", x) | |
C_rowSums <- function(x) .Call("C_rowSums", x) | |
set.seed(1234) | |
x <- tcrossprod(rnorm(5000)) | |
dyn.load("matsums.dll") | |
microbenchmark( | |
colSums(x), | |
C_colSums(x), | |
rowSums(x), | |
C_rowSums(x) | |
) | |
#> Unit: milliseconds | |
#> expr min lq mean median uq max neval | |
#> colSums(x) 23.70049 24.81208 25.41287 25.15936 25.82541 31.06992 100 | |
#> C_colSums(x) 31.44613 32.65331 33.33408 33.02726 33.73659 36.64243 100 | |
#> rowSums(x) 49.14745 52.80385 54.28015 54.14990 55.54067 62.06231 100 | |
#> C_rowSums(x) 18.78563 19.49311 20.13990 19.88757 20.44019 25.71116 100 | |
dyn.unload("matsums.dll") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment