Skip to content

Instantly share code, notes, and snippets.

diff --git a/DESCRIPTION b/DESCRIPTION
index cc23502..4d6e10c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: data.table
-Version: 1.9.5
-Title: Extension of Data.frame
+Version: 1.9.5.1
+Title: Extension of Data.frame (+KeepSource -ByteCompile)
### compare times for sample.int() vs internal function sample2()
compareSampleTimes = function(popSizeList=c(1e5, 1e6, 1e7, 1e8, 1e9),
sampleSizeList=c(10, 100, 1000, 10000),
numReplications=1000) {
for (sampleSize in sampleSizeList) {
for (popSize in popSizeList) {
elapsed1 = system.time(replicate(numReplications, sample.int(popSize, sampleSize)))[["elapsed"]]
elapsed2 = system.time(replicate(numReplications, .Internal(sample2(popSize, sampleSize))))[["elapsed"]]
cat(sprintf("Sample %d from %.0e: %f vs %f seconds\n", sampleSize, popSize, elapsed1, elapsed2))
}
bestAllocation = function(treatedList=c(0,1,8,39,152), # treated in each category
totalsList=rep(200, 5), # treated + untreated in each
numToAdd=100) { # number new treated available
addedList = rep(0, length(treatedList)) # start with nothing added
while (numToAdd > 0) {
ratio = (treatedList + addedList) / (totalsList + addedList)
lowest = which.min(ratio)
addedList[[lowest]] = addedList[[lowest]] + 1
numToAdd = numToAdd - 1
}
@nkurz
nkurz / l1d.c
Created December 26, 2015 23:32
Are sustained loads of 64B per cycle possible on Haswell and Skylake?
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native l1d.c -o l1d
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>
@nkurz
nkurz / avx.c
Created December 27, 2015 23:41
Alignment strongly affects vector load bandwidth
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native avx.c -o avx
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include <math.h>
#include <malloc.h>
/* function trace of one iteration from http://nicst.de/bench-user-irq-detect.html */
/* lxdetectirq_thread_capture_start(struct lxdetectirq_capture const * const c) */
/* ioctl(c->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) */
timer-5372 [003] .... 382853.609575: syscall_trace_enter_phase1 <-tracesys
timer-5372 [003] .... 382853.609575: context_tracking_user_exit <-syscall_trace_enter_phase1
timer-5372 [003] .... 382853.609575: context_tracking_exit <-context_tracking_user_exit
timer-5372 [003] d... 382853.609576: context_tracking_recursion_enter <-context_tracking_exit
timer-5372 [003] d... 382853.609576: rcu_user_exit <-context_tracking_exit
timer-5372 [003] d... 382853.609576: vtime_account_user <-context_tracking_exit
/* function_graph trace of one iteration from http://nicst.de/bench-user-irq-detect.html */
/* lxdetectirq_thread_capture_start(struct lxdetectirq_capture const * const c) */
/* ioctl(c->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) */
381887.380150 | 3) | syscall_trace_enter_phase1() {
381887.380150 | 3) | context_tracking_user_exit() {
381887.380151 | 3) | context_tracking_exit() {
381887.380151 | 3) 0.025 us | context_tracking_recursion_enter();
381887.380151 | 3) 0.026 us | rcu_user_exit();
381887.380151 | 3) | vtime_account_user() {
@nkurz
nkurz / Results Haswell
Created July 12, 2016 03:37
Differences in macro- and micro-fusion performance Skylake vs Haswell
nate@haswell:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
-------------------------------------------------------------
-------------------------------------------------------------
CPU type: Intel Core Haswell processor
CPU clock: 3.39 GHz
-------------------------------------------------------------
fusion
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999
gcc-4.8 -O0 fft-test-portable
Self-test passed
Size Time per FFT (ns)
4 min=84 mean=84 sd=0.01%
16 min=540 mean=541 sd=0.03%
64 min=3035 mean=3037 sd=0.13%
256 min=15759 mean=15763 sd=0.02%
1024 min=77969 mean=77984 sd=0.01%
4096 min=375086 mean=375292 sd=0.04%
16384 min=1765013 mean=1765401 sd=0.02%
gcc-4.8 -O0 fft-test-model
Self-test passed
Size Time per FFT (ns)
4 min=47 mean=47 sd=0.01%
16 min=411 mean=412 sd=0.07%
64 min=2524 mean=2525 sd=0.03%
256 min=13656 mean=13661 sd=0.02%
1024 min=69013 mean=69024 sd=0.01%
4096 min=337457 mean=337591 sd=0.02%
16384 min=1584876 mean=1585286 sd=0.02%