kevinushey · August 2, 2016 03:13
diff --git a/RcppMelt.md b/RcppMelt.md
diff --git a/RcppMelt.rmd b/RcppMelt.rmd
 Melting with Rcpp
 =================

 A common data manipulation task is that of making 'wide' data 'long':
 for example, using the `reshape2` library,

 ```{r}
 library(reshape2)
 wide_df <- data.frame( stringsAsFactors=FALSE,
  x=c('a', 'b'),
  y=c('A', 'B'),
  za=c(1, 2),
  zb=c(3, 4),
  zc=c(5, 6)
 )
 print(wide_df)
 print(melt(wide_df, id.vars=c('x', 'y')))
 ```

 We will try to re-implement this `melt` function using Rcpp.

 Let's define some variables:

 1. `id_ind` is the indices corresponding to our `id` variables (1 and 2 in this case),
 2. `value_ind` is the indices corresponding to our `value` variables (3, 4, 5 in this case),
 3. `n_id` is the number of `id` variables (2),
 4. `n_value` is the number of `value` variables (3).

 There are, essentially, three different operations constituting a melt:

 1. We repeat each `id` vector `n_value` times,
 2. We repeat each of the names of the `value` vectors `n_id` times,
 3. We concatenate the `value` vectors, coercing the type of each vector 
  to the 'highest' type if necessary.
  
 We store the output in a `data.frame` with our repeated `id` variables,
 a `variable` column built up of repeated `value` names, and a `value`
 column made up of the concatenated value vectors.

 `reshape2::melt` does this all in R code, but this forces some unnecessary
 copies of the `data.frame` and is hence very slow for large data sets. Let's
 try implementing an Rcpp version of `melt` for `data.frame`s.

 We'll start by simply implementing functions for each of our 3 cases above.

 ```{r, engine='Rcpp'}
 #include <Rcpp.h>
 using namespace Rcpp;

 // a define used later for error handling
 #define TYPE2STR(x) (CHAR(Rf_type2str(TYPEOF(x))))

 // 1 is easy since we already have the sugar 'rep' function; we'll see its use
 // later in the code

 // 2. We repeat the names of the `value` vectors `n_id` times,
 template <int RTYPE>
 Vector<RTYPE> rep_each( const Vector<RTYPE>& x, int each ) {
  
  int nn = x.size(); // length of x
  int n = nn * each; // length of our output vector
  Vector<RTYPE> output = no_init(n); // allocate the vector w/out initialization
  
  // now, loop through x, and assign elements to output as needed
  int counter = 0;
  for (int i=0; i < nn; ++i) {
    for (int j=0; j < each; ++j) {
      output[counter] = x[i];
      ++counter;
    }
  }
  return output;
 }

 // 3. We concatenate the `value` vectors, coercing the type of each vector 
 // to the 'highest' type if necessary.

 // note the signature: we pass a DataFrame by reference, as well as the indices
 // for the vectors we care about. This way we avoid passing in copies of the
 // vectors; instead, we pass in references from which we can build our output.

 // n_value and n_row let us calculate the size of our output vector
 template <int RTYPE>
 Vector<RTYPE> concatenate( const DataFrame& x, 
               const IntegerVector& value_ind, 
               int n_value, 
               int n_row ) {
  
  int n = n_value * n_row;
  Vector<RTYPE> output = no_init(n);
  
  int counter = 0;
  for (int i=0; i < n_value; ++i) {
    // generate a reference to each vector within x, as tmp
    Vector<RTYPE> tmp = as< Vector<RTYPE> >(x[ value_ind[i] ]);
    for (int j=0; j < n_row; ++j) {
      output[counter] = tmp[j];
      ++counter;
    }
  }
  return output;
 }

 // now, we put everything together into a melt function
 // [[Rcpp::export]]
 List meltRcpp( DataFrame x,
               IntegerVector id_ind, 
               IntegerVector value_ind, 
               String variable_name, 
               String value_name ) {
  
  int n_id = id_ind.length();
  int n_value = value_ind.length();
  int n_row = x.nrows();
  
  // Allocate the output data.frame
  // size is equal to the number of id.vars + 2 (variable, value columns)
  List output(n_id+2);
  
  // id.vars
  // We repeat each 'id' vector n_value times, and assign them immediately
  // after generation
  
  // A define to handle the different possible types
 #define REP(RTYPE) \
  case RTYPE: { \
    output[ id_ind[i] ] = rep( as< Vector<RTYPE> >(x[ id_ind[i] ]), n_value); \
    break; \
  } \
  
  for (int i=0; i < n_id; ++i) {
    switch( TYPEOF(x[id_ind[i]]) ) {
    REP(LGLSXP);
    REP(INTSXP);
    REP(REALSXP);
    REP(STRSXP);
    default: {
      Rf_error("Unsupported RTYPE '%s'", TYPE2STR( x[id_ind[i]] ));
    }
    }
  }
  
  // names
  // We repeat each 'name' for the value variables n_row times
  
  // first, we do a bit of work to pull out the names we want
  CharacterVector names = as<CharacterVector>(x.attr("names"));
  CharacterVector val_names = no_init(n_value);
  for (int i=0; i < n_value; ++i) {
    val_names[i] = names[ value_ind[i] ];
  }
  
  // now, we can repeat each of these names
  output[n_id] = rep_each(val_names, n_row);
  
  // value.vars
  // We concatenate all of the value variables
  // we assume all the value variables are of the same type;
  // an exercise is to coerce the value vectors
  
 #define CONCATENATE(RTYPE) \
  case RTYPE: { \
    output[n_id+1] = concatenate<RTYPE>(x, value_ind, n_value, n_row); \
    break; \
  } \
  
  switch( TYPEOF(x[value_ind[0]]) ) {
    CONCATENATE(LGLSXP);
    CONCATENATE(INTSXP);
    CONCATENATE(REALSXP);
    CONCATENATE(STRSXP);
    default: {
      Rf_error("Unsupported vector type '%s'", TYPE2STR( x[value_ind[0]] ));
    }
  }
  
  // set some attributes
  CharacterVector out_names = no_init(n_id + 2);
  for (int i=0; i < n_id; ++i) {
    out_names[i] = names[ id_ind[i] ];
  }
  out_names[n_id] = variable_name;
  out_names[n_id+1] = value_name;
  output.attr("names") = out_names;
  
  int out_nrow = n_row * n_value;
  IntegerVector row_names = no_init(out_nrow);
  for (int i=0; i < out_nrow; ++i) {
    row_names[i] = i+1;
  }
  output.attr("row.names") = row_names;
  output.attr("class") = "data.frame";
  
  // finally, we return
  return output;
 }
 ```

 Note one major deficiency is that we do not check or coerce our value variables;
 this is left as an exercise.

 Now, let's test and benchmark our `meltRcpp` implementation. Note that, since
 arrays in C++ are zero indexed, we have to subtract one from the
 regular indexes we pass. Ideally, we would wrap this into an R function
 that provides a similar `melt` interface, but that's outside of Rcpp.

 ```{r}
 all.equal(
  melt(wide_df, id.vars=c('x', 'y')),
  meltRcpp(wide_df, (0:1), (2:4), "variable", "value")
 )
 ## note that we don't coerce the 'variable' column to factor, hence
 ## all.equal not being TRUE -- all the values do match, however
 ```

 And a benchmark, for a larger DF:

 ```{r}
 library(microbenchmark)
 n <- 1E5
 big_df <- data.frame( stringsAsFactors=FALSE,
  x=sample(letters, n, TRUE),
  y=sample(LETTERS, n, TRUE),
  za=rnorm(n),
  zb=rnorm(n),
  zc=rnorm(n)
 )
 all.equal(
  melt(big_df, id.vars=c('x', 'y')),
  meltRcpp(big_df, (0:1), (2:4), "variable", "value")
 )
 microbenchmark( times=10,
  melt=melt(big_df, id.vars=c('x', 'y')),
  meltRcpp=meltRcpp(big_df, (0:1), (2:4), "variable", "value")
 ) 
 ```

 There are still a lot of holes in the implementation we have (e.g.
 more error checking is warranted, type coercion for the 'value' variables
 is necessary, some extra work is needed to properly handle factors, we might
 want to copy attributes from our original vectors / df), but
 we can see that our efforts have been very fruitful -- we see a ~10x increase
 in speed in our test case.
	Melting with Rcpp
	=================

	A common data manipulation task is that of making 'wide' data 'long':
	for example, using the `reshape2` library,

	```{r}
	library(reshape2)
	wide_df <- data.frame( stringsAsFactors=FALSE,
	x=c('a', 'b'),
	y=c('A', 'B'),
	za=c(1, 2),
	zb=c(3, 4),
	zc=c(5, 6)
	)
	print(wide_df)
	print(melt(wide_df, id.vars=c('x', 'y')))
	```

	We will try to re-implement this `melt` function using Rcpp.

	Let's define some variables:

	1. `id_ind` is the indices corresponding to our `id` variables (1 and 2 in this case),
	2. `value_ind` is the indices corresponding to our `value` variables (3, 4, 5 in this case),
	3. `n_id` is the number of `id` variables (2),
	4. `n_value` is the number of `value` variables (3).

	There are, essentially, three different operations constituting a melt:

	1. We repeat each `id` vector `n_value` times,
	2. We repeat each of the names of the `value` vectors `n_id` times,
	3. We concatenate the `value` vectors, coercing the type of each vector
	to the 'highest' type if necessary.

	We store the output in a `data.frame` with our repeated `id` variables,
	a `variable` column built up of repeated `value` names, and a `value`
	column made up of the concatenated value vectors.

	`reshape2::melt` does this all in R code, but this forces some unnecessary
	copies of the `data.frame` and is hence very slow for large data sets. Let's
	try implementing an Rcpp version of `melt` for `data.frame`s.

	We'll start by simply implementing functions for each of our 3 cases above.

	```{r, engine='Rcpp'}
	#include <Rcpp.h>
	using namespace Rcpp;

	// a define used later for error handling
	#define TYPE2STR(x) (CHAR(Rf_type2str(TYPEOF(x))))

	// 1 is easy since we already have the sugar 'rep' function; we'll see its use
	// later in the code

	// 2. We repeat the names of the `value` vectors `n_id` times,
	template <int RTYPE>
	Vector<RTYPE> rep_each( const Vector<RTYPE>& x, int each ) {

	int nn = x.size(); // length of x
	int n = nn * each; // length of our output vector
	Vector<RTYPE> output = no_init(n); // allocate the vector w/out initialization

	// now, loop through x, and assign elements to output as needed
	int counter = 0;
	for (int i=0; i < nn; ++i) {
	for (int j=0; j < each; ++j) {
	output[counter] = x[i];
	++counter;
	}
	}
	return output;
	}

	// 3. We concatenate the `value` vectors, coercing the type of each vector
	// to the 'highest' type if necessary.

	// note the signature: we pass a DataFrame by reference, as well as the indices
	// for the vectors we care about. This way we avoid passing in copies of the
	// vectors; instead, we pass in references from which we can build our output.

	// n_value and n_row let us calculate the size of our output vector
	template <int RTYPE>
	Vector<RTYPE> concatenate( const DataFrame& x,
	const IntegerVector& value_ind,
	int n_value,
	int n_row ) {

	int n = n_value * n_row;
	Vector<RTYPE> output = no_init(n);

	int counter = 0;
	for (int i=0; i < n_value; ++i) {
	// generate a reference to each vector within x, as tmp
	Vector<RTYPE> tmp = as< Vector<RTYPE> >(x[ value_ind[i] ]);
	for (int j=0; j < n_row; ++j) {
	output[counter] = tmp[j];
	++counter;
	}
	}
	return output;
	}

	// now, we put everything together into a melt function
	// [[Rcpp::export]]
	List meltRcpp( DataFrame x,
	IntegerVector id_ind,
	IntegerVector value_ind,
	String variable_name,
	String value_name ) {

	int n_id = id_ind.length();
	int n_value = value_ind.length();
	int n_row = x.nrows();

	// Allocate the output data.frame
	// size is equal to the number of id.vars + 2 (variable, value columns)
	List output(n_id+2);

	// id.vars
	// We repeat each 'id' vector n_value times, and assign them immediately
	// after generation

	// A define to handle the different possible types
	#define REP(RTYPE) \
	case RTYPE: { \
	output[ id_ind[i] ] = rep( as< Vector<RTYPE> >(x[ id_ind[i] ]), n_value); \
	break; \
	} \

	for (int i=0; i < n_id; ++i) {
	switch( TYPEOF(x[id_ind[i]]) ) {
	REP(LGLSXP);
	REP(INTSXP);
	REP(REALSXP);
	REP(STRSXP);
	default: {
	Rf_error("Unsupported RTYPE '%s'", TYPE2STR( x[id_ind[i]] ));
	}
	}
	}

	// names
	// We repeat each 'name' for the value variables n_row times

	// first, we do a bit of work to pull out the names we want
	CharacterVector names = as<CharacterVector>(x.attr("names"));
	CharacterVector val_names = no_init(n_value);
	for (int i=0; i < n_value; ++i) {
	val_names[i] = names[ value_ind[i] ];
	}

	// now, we can repeat each of these names
	output[n_id] = rep_each(val_names, n_row);

	// value.vars
	// We concatenate all of the value variables
	// we assume all the value variables are of the same type;
	// an exercise is to coerce the value vectors

	#define CONCATENATE(RTYPE) \
	case RTYPE: { \
	output[n_id+1] = concatenate<RTYPE>(x, value_ind, n_value, n_row); \
	break; \
	} \

	switch( TYPEOF(x[value_ind[0]]) ) {
	CONCATENATE(LGLSXP);
	CONCATENATE(INTSXP);
	CONCATENATE(REALSXP);
	CONCATENATE(STRSXP);
	default: {
	Rf_error("Unsupported vector type '%s'", TYPE2STR( x[value_ind[0]] ));
	}
	}

	// set some attributes
	CharacterVector out_names = no_init(n_id + 2);
	for (int i=0; i < n_id; ++i) {
	out_names[i] = names[ id_ind[i] ];
	}
	out_names[n_id] = variable_name;
	out_names[n_id+1] = value_name;
	output.attr("names") = out_names;

	int out_nrow = n_row * n_value;
	IntegerVector row_names = no_init(out_nrow);
	for (int i=0; i < out_nrow; ++i) {
	row_names[i] = i+1;
	}
	output.attr("row.names") = row_names;
	output.attr("class") = "data.frame";

	// finally, we return
	return output;
	}
	```

	Note one major deficiency is that we do not check or coerce our value variables;
	this is left as an exercise.

	Now, let's test and benchmark our `meltRcpp` implementation. Note that, since
	arrays in C++ are zero indexed, we have to subtract one from the
	regular indexes we pass. Ideally, we would wrap this into an R function
	that provides a similar `melt` interface, but that's outside of Rcpp.

	```{r}
	all.equal(
	melt(wide_df, id.vars=c('x', 'y')),
	meltRcpp(wide_df, (0:1), (2:4), "variable", "value")
	)
	## note that we don't coerce the 'variable' column to factor, hence
	## all.equal not being TRUE -- all the values do match, however
	```

	And a benchmark, for a larger DF:

	```{r}
	library(microbenchmark)
	n <- 1E5
	big_df <- data.frame( stringsAsFactors=FALSE,
	x=sample(letters, n, TRUE),
	y=sample(LETTERS, n, TRUE),
	za=rnorm(n),
	zb=rnorm(n),
	zc=rnorm(n)
	)
	all.equal(
	melt(big_df, id.vars=c('x', 'y')),
	meltRcpp(big_df, (0:1), (2:4), "variable", "value")
	)
	microbenchmark( times=10,
	melt=melt(big_df, id.vars=c('x', 'y')),
	meltRcpp=meltRcpp(big_df, (0:1), (2:4), "variable", "value")
	)
	```

	There are still a lot of holes in the implementation we have (e.g.
	more error checking is warranted, type coercion for the 'value' variables
	is necessary, some extra work is needed to properly handle factors, we might
	want to copy attributes from our original vectors / df), but
	we can see that our efforts have been very fruitful -- we see a ~10x increase
	in speed in our test case.