Last active
December 26, 2015 11:49
-
-
Save romainfrancois/7146128 to your computer and use it in GitHub Desktop.
Benchmark sorted vs. unsorted grouped summarise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <dplyr.h> | |
// [[Rcpp::depends(BH,dplyr)]] | |
using namespace Rcpp ; | |
using namespace dplyr ; | |
class IndexHelper { | |
public: | |
IndexHelper( int pos_, int n_) : pos(pos_), n(n_){} | |
inline int size() const { return n ; } | |
inline int operator[](int i) const { return pos + i ; } | |
private: | |
int pos, n ; | |
} ; | |
// [[Rcpp::export]] | |
SEXP summarise_not_sorted( GroupedDataFrame gdf ){ | |
DataFrame df = gdf.data() ; | |
int n = gdf.ngroups() ; | |
NumericVector out(n); | |
Language call( "mean_", Rf_install("AB") ); | |
Language::Proxy proxy(call,1) ; | |
IntegerVector AB = df["AB"] ; | |
Armor<SEXP> res ; | |
for( int i=0; i<n; i++){ | |
proxy = wrap_subset<INTSXP>(AB, gdf.group(i) ) ; | |
res = call.fast_eval() ; | |
out[i] = REAL(res)[0]; | |
} | |
return out ; | |
} | |
// [[Rcpp::export]] | |
SEXP summarise_not_sorted_less_allocations( GroupedDataFrame gdf, int biggest ){ | |
IntegerVector chunk( biggest ); | |
int* chunk_start = chunk.begin() ; | |
DataFrame df = gdf.data() ; | |
int n = gdf.ngroups() ; | |
NumericVector out(n); | |
Language call( "mean_", chunk ); | |
IntegerVector AB = df["AB"] ; | |
int* ptr = AB.begin() ; | |
Armor<SEXP> res; | |
for( int i=0; i<n; i++){ | |
Index_0_based index = gdf.group(i) ; | |
int chunk_size = index.size() ; | |
SETLENGTH(chunk, chunk_size ) ; | |
for( int j=0; j<chunk_size; j++) | |
chunk_start[j] = ptr[index[j] ] ; | |
res = call.fast_eval() ; | |
out[i] = REAL(res)[0]; | |
} | |
SETLENGTH(chunk, biggest) ; | |
return out ; | |
} | |
// [[Rcpp::export]] | |
SEXP summarise_not_sorted_internal( GroupedDataFrame gdf ){ | |
DataFrame df = gdf.data() ; | |
int n = gdf.ngroups() ; | |
NumericVector out(n); | |
IntegerVector AB = df["AB"] ; int* ptr = AB.begin() ; | |
for( int i=0; i<n; i++){ | |
out[i] = dplyr::internal::Mean_internal<INTSXP,false,Index_0_based>::process( ptr, gdf.group(i) ) ; | |
} | |
return out ; | |
} | |
// [[Rcpp::export]] | |
List helper_group_sizes( IntegerVector groups, int ngroups){ | |
IntegerVector out(ngroups) ; | |
int n = groups.size() ; | |
int biggest = 0 ; | |
for( int i=0, j=0; i<ngroups; i++){ | |
int current = groups[j] ; | |
int count = 0 ; | |
while( groups[j] == current){ | |
j++; | |
count++ ; | |
} | |
out[i] = count ; | |
biggest = std::max(biggest, count) ; | |
} | |
return List::create( out, biggest ) ; | |
} | |
// [[Rcpp::export]] | |
SEXP summarise_sorted( DataFrame df, IntegerVector sizes, int biggest ){ | |
IntegerVector chunk( biggest ); | |
int* chunk_start = chunk.begin() ; | |
int n = sizes.size() ; | |
NumericVector out(n); | |
IntegerVector AB = df["AB"] ; | |
int* ptr = AB.begin() ; | |
Language call( "mean_", chunk ); | |
int pos = 0 ; | |
Armor<SEXP> res ; | |
for( int i=0; i<n; i++){ | |
SETLENGTH( chunk, sizes[i] ) ; | |
memcpy( chunk_start, ptr + pos, sizes[i] * sizeof(int) ) ; | |
res = call.fast_eval() ; | |
out[i] = REAL(res)[0] ; | |
pos += sizes[i] ; | |
} | |
SETLENGTH(chunk, biggest) ; | |
return out ; | |
} | |
// [[Rcpp::export]] | |
SEXP summarise_sorted_internal( DataFrame df, IntegerVector sizes, int biggest ){ | |
int n = sizes.size() ; | |
NumericVector out(n); | |
IntegerVector AB = df["AB"] ; | |
int* ptr = AB.begin() ; | |
int pos = 0 ; | |
for( int i=0; i<n; i++){ | |
out[i] = dplyr::internal::Mean_internal<INTSXP,false,IndexHelper>::process( ptr, IndexHelper(pos, sizes[i] ) ) ; | |
pos += sizes[i] ; | |
} | |
return out ; | |
} | |
/*** R | |
library(Rcpp) | |
library(dplyr) | |
library(microbenchmark) | |
library(data.table) | |
library(Lahman) | |
batting_cpp <- tbl_cpp(Batting) | |
players_cpp <- group_by(batting_cpp, playerID) | |
players_cpp_sorted <- arrange( batting_cpp, playerID) | |
mean_ <- function(.) .Internal(mean(.)) | |
groups <- match(players_cpp_sorted$playerID, players_cpp_sorted$playerID) | |
ngroups <- length( attr( players_cpp, "index" ) ) | |
helps <- helper_group_sizes( groups, ngroups ) | |
group_sizes <- helps[[1]] | |
biggest <- helps[[2]] | |
res1 <- sort( summarise_not_sorted( players_cpp ) ) | |
res2 <- sort( summarise_sorted( players_cpp_sorted, group_sizes, biggest ) ) | |
identical(res1, res2) | |
options( width = 150, digits= 4 ) | |
microbenchmark( | |
arrange = arrange( batting_cpp, playerID), | |
unsorted = summarise_not_sorted( players_cpp ), | |
unsorted_less_alloc = summarise_not_sorted_less_allocations( players_cpp, biggest ), | |
sorted = summarise_sorted( players_cpp_sorted, group_sizes, biggest ), | |
unsorted_internal = summarise_not_sorted_internal( players_cpp ), | |
sorted_internal = summarise_sorted_internal( players_cpp_sorted, group_sizes, biggest ) | |
) | |
*/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> microbenchmark(arrange = arrange(batting_cpp, playerID), | |
+ unsorted = summarise_not_sorted(players_cpp), unsorted_less_alloc = summarise_not_so .... [TRUNCATED] | |
Unit: microseconds | |
expr min lq median uq max neval | |
arrange 10593.2 11060.1 11527.4 13168.2 50620.5 100 | |
unsorted 12194.0 13007.9 13932.9 14681.5 50063.7 100 | |
unsorted_less_alloc 10379.6 11053.8 11384.9 12434.8 49670.9 100 | |
sorted 8908.2 9283.6 9426.6 10703.8 49739.7 100 | |
unsorted_internal 877.9 1051.2 1200.2 1363.1 1459.1 100 | |
sorted_internal 612.2 631.5 672.7 695.2 800.9 100 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment