September 9, 2015 21:40 · August 3, 2015 21:53 · August 3, 2015 21:51 · February 1, 2015 10:54 · December 22, 2014 23:30 · November 29, 2014 14:10
 ###  compare times for sample.int() vs internal function sample2()
 compareSampleTimes = function(popSizeList=c(1e5, 1e6, 1e7, 1e8, 1e9),
    sampleSizeList=c(10, 100, 1000, 10000),
    numReplications=1000) {
    for (sampleSize in sampleSizeList) {
        for (popSize in popSizeList)  {
            elapsed1 = system.time(replicate(numReplications, sample.int(popSize, sampleSize)))[["elapsed"]]
            elapsed2 = system.time(replicate(numReplications, .Internal(sample2(popSize, sampleSize))))[["elapsed"]]
            cat(sprintf("Sample %d from %.0e: %f vs %f seconds\n", sampleSize, popSize, elapsed1, elapsed2))
        }
 diff --git a/DESCRIPTION b/DESCRIPTION
 index cc23502..4d6e10c 100644
 --- a/DESCRIPTION
 +++ b/DESCRIPTION
 @@ -1,6 +1,6 @@
 Package: data.table
 -Version: 1.9.5
 -Title: Extension of Data.frame
 +Version: 1.9.5.1
 +Title: Extension of Data.frame (+KeepSource -ByteCompile)
 --- R-3.2.1/src/main/memory.c.orig      2015-07-31 23:15:07.017151621 -0700
 +++ R-3.2.1/src/main/memory.c   2015-07-31 23:17:10.185150073 -0700
 @@ -3724,11 +3724,21 @@
 static FILE *R_MemReportingOutfile;
 static R_size_t R_MemReportingThreshold;

 +static void printLineNum(FILE *file, SEXP srcref) {
 +  if (srcref && !isNull(srcref)) {
 +    int line = asInteger(srcref);
 +    fprintf(file, "#%d ", line);
 // gcc -march=native -g -std=gnu99 -Wall -Wextra -O3 symmetric.c -o symmetric -DUSE_ALG
 //   (where USE_ALG is one of USE_NATE, USE_KARIM, USE_BASIC, or USE_CONDITIONAL)

 // Or if using https://code.google.com/p/likwid/ with -m markers:
 // gcc -march=native -g -std=gnu99 -Wall -Wextra -O3 symmetric.c -o symmetric -DLIKWID -llikwid -lpthread -lm -DUSE_ALG

 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 // C implementation for Pathfinding Benchmark by [email protected]
 // See https://github.com/logicchains/LPATHBench for details

 // Summary of benchmarks (see bottom for full numbers)
 // 8981 LANGUAGE C 623
 // 8981 LANGUAGE C++/clang 734
 // 8981 LANGUAGE C++/gcc 755
 // Best results compiling with GCC 4.7 or 4.8 -O2
 // clang, icc and GCC 4.9 slightly worse with -O1, -O2, -O3, -Ofast
 // -O3 and -Ofast much worse for all GCC.  -O1 mixed but worse.
 // Calculate cycles spent on overhead of function calls
 // See http://cs.coloradocollege.edu/~bylvisaker/CallReturn/
 // gcc -g -std=gnu99 -O3 -Wall -Wextra call-return.c -o call-return

 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>

 #define DEFAULT_LOOP_COUNT (1000 * 1000)
 // cc  -fno-inline -g -march=native -std=gnu99 -O3 -Wall -Wextra broadcast.c -o broadcast
 // works with 'gcc 4.8.2' and 'icc 14.03', but crashes with 'clang 3.4' because of alignment
 // usage: broadcast [-r repeat] [-s size]

 #ifdef LIKWID
 #include <likwid.h>
 #else
 #define likwid_markerInit()
 #define likwid_markerThreadInit()
 #define likwid_markerStartRegion(name)
 ; Minimal example, see also http://stackoverflow.com/q/26266953/3766665
 ; To build (Linux):
 ;   nasm -felf64 func.asm
 ;   ld func.o
 ; Then run:
 ;   perf stat -r10 ./a.out
 ; On Haswell and Sandy Bridge, observed runtime varies 
 ; ~15% depending on whether sub or sbb is used in the loop
 section .text
 global _start
 // gcc -std=gnu99 -O3 -Wall -Wextra same-function.c -o same-function
 // Identical loops that execute in different but consistent times

 #if COPY_AND_RUN_TO_TEST
 for n in 0 1 2 3 4 5 6 7 8 9;
 do echo same-function ${n}:;
 /usr/bin/time -f "%e seconds" same-function ${n};
 /usr/bin/time -f "%e seconds" same-function ${n};
 /usr/bin/time -f "%e seconds" same-function ${n};
 done
 GCC 4.8.1 
  4019f0:       66 0f 6f 00             movdqa (%rax),%xmm0
  4019f4:       48 83 c0 10             add    $0x10,%rax
  4019f8:       48 39 c5                cmp    %rax,%rbp
  4019fb:       66 0f 6f c8             movdqa %xmm0,%xmm1
  4019ff:       66 0f 6f e0             movdqa %xmm0,%xmm4
  401a03:       66 0f 62 c8             punpckldq %xmm0,%xmm1
  401a07:       66 0f 6a e0             punpckhdq %xmm0,%xmm4
  401a0b:       66 0f f4 cb             pmuludq %xmm3,%xmm1
  401a0f:       66 0f f4 e3             pmuludq %xmm3,%xmm4
	### compare times for sample.int() vs internal function sample2()
	compareSampleTimes = function(popSizeList=c(1e5, 1e6, 1e7, 1e8, 1e9),
	sampleSizeList=c(10, 100, 1000, 10000),
	numReplications=1000) {
	for (sampleSize in sampleSizeList) {
	for (popSize in popSizeList) {
	elapsed1 = system.time(replicate(numReplications, sample.int(popSize, sampleSize)))[["elapsed"]]
	elapsed2 = system.time(replicate(numReplications, .Internal(sample2(popSize, sampleSize))))[["elapsed"]]
	cat(sprintf("Sample %d from %.0e: %f vs %f seconds\n", sampleSize, popSize, elapsed1, elapsed2))
	}
	diff --git a/DESCRIPTION b/DESCRIPTION
	index cc23502..4d6e10c 100644
	--- a/DESCRIPTION
	+++ b/DESCRIPTION
	@@ -1,6 +1,6 @@
	Package: data.table
	-Version: 1.9.5
	-Title: Extension of Data.frame
	+Version: 1.9.5.1
	+Title: Extension of Data.frame (+KeepSource -ByteCompile)
	--- R-3.2.1/src/main/memory.c.orig 2015-07-31 23:15:07.017151621 -0700
	+++ R-3.2.1/src/main/memory.c 2015-07-31 23:17:10.185150073 -0700
	@@ -3724,11 +3724,21 @@
	static FILE *R_MemReportingOutfile;
	static R_size_t R_MemReportingThreshold;

	+static void printLineNum(FILE *file, SEXP srcref) {
	+ if (srcref && !isNull(srcref)) {
	+ int line = asInteger(srcref);
	+ fprintf(file, "#%d ", line);
	// gcc -march=native -g -std=gnu99 -Wall -Wextra -O3 symmetric.c -o symmetric -DUSE_ALG
	// (where USE_ALG is one of USE_NATE, USE_KARIM, USE_BASIC, or USE_CONDITIONAL)

	// Or if using https://code.google.com/p/likwid/ with -m markers:
	// gcc -march=native -g -std=gnu99 -Wall -Wextra -O3 symmetric.c -o symmetric -DLIKWID -llikwid -lpthread -lm -DUSE_ALG

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	// C implementation for Pathfinding Benchmark by [email protected]
	// See https://github.com/logicchains/LPATHBench for details

	// Summary of benchmarks (see bottom for full numbers)
	// 8981 LANGUAGE C 623
	// 8981 LANGUAGE C++/clang 734
	// 8981 LANGUAGE C++/gcc 755
	// Best results compiling with GCC 4.7 or 4.8 -O2
	// clang, icc and GCC 4.9 slightly worse with -O1, -O2, -O3, -Ofast
	// -O3 and -Ofast much worse for all GCC. -O1 mixed but worse.
	// Calculate cycles spent on overhead of function calls
	// See http://cs.coloradocollege.edu/~bylvisaker/CallReturn/
	// gcc -g -std=gnu99 -O3 -Wall -Wextra call-return.c -o call-return

	#include <stdio.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define DEFAULT_LOOP_COUNT (1000 * 1000)
	// cc -fno-inline -g -march=native -std=gnu99 -O3 -Wall -Wextra broadcast.c -o broadcast
	// works with 'gcc 4.8.2' and 'icc 14.03', but crashes with 'clang 3.4' because of alignment
	// usage: broadcast [-r repeat] [-s size]

	#ifdef LIKWID
	#include <likwid.h>
	#else
	#define likwid_markerInit()
	#define likwid_markerThreadInit()
	#define likwid_markerStartRegion(name)
	; Minimal example, see also http://stackoverflow.com/q/26266953/3766665
	; To build (Linux):
	; nasm -felf64 func.asm
	; ld func.o
	; Then run:
	; perf stat -r10 ./a.out
	; On Haswell and Sandy Bridge, observed runtime varies
	; ~15% depending on whether sub or sbb is used in the loop
	section .text
	global _start
	// gcc -std=gnu99 -O3 -Wall -Wextra same-function.c -o same-function
	// Identical loops that execute in different but consistent times

	#if COPY_AND_RUN_TO_TEST
	for n in 0 1 2 3 4 5 6 7 8 9;
	do echo same-function ${n}:;
	/usr/bin/time -f "%e seconds" same-function ${n};
	/usr/bin/time -f "%e seconds" same-function ${n};
	/usr/bin/time -f "%e seconds" same-function ${n};
	done
	GCC 4.8.1
	4019f0: 66 0f 6f 00 movdqa (%rax),%xmm0
	4019f4: 48 83 c0 10 add $0x10,%rax
	4019f8: 48 39 c5 cmp %rax,%rbp
	4019fb: 66 0f 6f c8 movdqa %xmm0,%xmm1
	4019ff: 66 0f 6f e0 movdqa %xmm0,%xmm4
	401a03: 66 0f 62 c8 punpckldq %xmm0,%xmm1
	401a07: 66 0f 6a e0 punpckhdq %xmm0,%xmm4
	401a0b: 66 0f f4 cb pmuludq %xmm3,%xmm1
	401a0f: 66 0f f4 e3 pmuludq %xmm3,%xmm4