SteveBronder · December 9, 2020 18:36
diff --git a/elt_multiply.cpp b/elt_multiply.cpp
 #include <benchmark/benchmark.h>
 #include <stan/math.hpp>
 #include <utility>
 #include "toss_me.hpp"

 /**
  * Benchmark `sum(elt_multiply(x, y))` followed by `grad()` for two column
  * vectors of length `state.range(0)`.
  *
  * Only the expression evaluation and the gradient call sit between the two
  * clock reads; the elapsed time is handed to the framework through
  * `SetIterationTime`, so the benchmark is meant to be registered with
  * `UseManualTime()`.
  *
  * @tparam T1 scalar type the first operand is promoted to
  * @tparam T2 scalar type the second operand is promoted to
  *   (the result is stored in a `var`, so at least one of T1/T2 is expected
  *   to be `var`)
  * @param state benchmark state; `state.range(0)` gives the vector length
  */
 template <typename T1, typename T2>
 static void multiply_vector_vector(benchmark::State& state) {
  using stan::math::var;
  using stan::math::promote_scalar;
  // Both inputs are column vectors, matching the benchmark's name.
  Eigen::VectorXd x_val = Eigen::VectorXd::Random(state.range(0));
  Eigen::VectorXd y_val = Eigen::VectorXd::Random(state.range(0));
  for (auto _ : state) {
    // Promotion to the requested scalar types happens before the timer starts.
    auto x = promote_scalar<T1>(x_val);
    auto y = promote_scalar<T2>(y_val);
    auto start = std::chrono::high_resolution_clock::now();
    var lp = sum(elt_multiply(x, y));
    lp.grad();
    // Keep the compiler from moving memory writes past the second clock read.
    benchmark::ClobberMemory();
    auto end = std::chrono::high_resolution_clock::now();
    auto elapsed_seconds =
      std::chrono::duration_cast<std::chrono::duration<double>>(end - start);

    state.SetIterationTime(elapsed_seconds.count());
    // Cleanup of the autodiff memory is done outside the timed region.
    stan::math::recover_memory();
    benchmark::ClobberMemory();
  }
 }

 /**
  * Benchmark `sum(elt_multiply(x, y))` followed by `grad()` for two square
  * matrices with `state.range(0)` rows and columns.
  *
  * Only the expression evaluation and the gradient call sit between the two
  * clock reads; the elapsed time is handed to the framework through
  * `SetIterationTime`, so the benchmark is meant to be registered with
  * `UseManualTime()`.
  *
  * @tparam T1 scalar type the first operand is promoted to
  * @tparam T2 scalar type the second operand is promoted to
  *   (the result is stored in a `var`, so at least one of T1/T2 is expected
  *   to be `var`)
  * @param state benchmark state; `state.range(0)` gives the matrix dimension
  */
 template <typename T1, typename T2>
 static void multiply_matrix_matrix(benchmark::State& state) {
  using stan::math::var;
  using stan::math::promote_scalar;
  Eigen::MatrixXd x_val = Eigen::MatrixXd::Random(state.range(0), state.range(0));
  Eigen::MatrixXd y_val = Eigen::MatrixXd::Random(state.range(0), state.range(0));
  for (auto _ : state) {
    // Promotion to the requested scalar types happens before the timer starts.
    auto x = promote_scalar<T1>(x_val);
    auto y = promote_scalar<T2>(y_val);
    auto start = std::chrono::high_resolution_clock::now();
    var lp = sum(elt_multiply(x, y));
    lp.grad();
    // Keep the compiler from moving memory writes past the second clock read.
    benchmark::ClobberMemory();
    auto end = std::chrono::high_resolution_clock::now();
    auto elapsed_seconds =
      std::chrono::duration_cast<std::chrono::duration<double>>(end - start);

    state.SetIterationTime(elapsed_seconds.count());
    // Cleanup of the autodiff memory is done outside the timed region.
    stan::math::recover_memory();
    benchmark::ClobberMemory();
  }
 }


 using stan::math::var;
 // Smallest and largest problem sizes swept by the benchmarks below.
 constexpr int start_val = 2;
 constexpr int end_val = 16384;
 // Size given to toss_me: 1.2 times the largest benchmark size, truncated.
 constexpr int end_mem = static_cast<int>(end_val * 1.2);
 // Registered ahead of the sized benchmarks.
 BENCHMARK_TEMPLATE(toss_me, end_mem);
 // Vector benchmarks: each size from start_val to end_val, doubling each step.
 BENCHMARK_TEMPLATE(multiply_vector_vector, double, var)
     ->RangeMultiplier(2)
     ->Range(start_val, end_val)
     ->UseManualTime();
 BENCHMARK_TEMPLATE(multiply_vector_vector, var, double)
     ->RangeMultiplier(2)
     ->Range(start_val, end_val)
     ->UseManualTime();
 BENCHMARK_TEMPLATE(multiply_vector_vector, var, var)
     ->RangeMultiplier(2)
     ->Range(start_val, end_val)
     ->UseManualTime();
 // Matrix benchmarks: same sweep, the size is the matrix dimension.
 BENCHMARK_TEMPLATE(multiply_matrix_matrix, double, var)
     ->RangeMultiplier(2)
     ->Range(start_val, end_val)
     ->UseManualTime();
 BENCHMARK_TEMPLATE(multiply_matrix_matrix, var, double)
     ->RangeMultiplier(2)
     ->Range(start_val, end_val)
     ->UseManualTime();
 BENCHMARK_TEMPLATE(multiply_matrix_matrix, var, var)
     ->RangeMultiplier(2)
     ->Range(start_val, end_val)
     ->UseManualTime();
 BENCHMARK_MAIN();
diff --git a/toss_me.hpp b/toss_me.hpp
 #ifndef CALLBACK_TOSS_ME_HPP
 #define CALLBACK_TOSS_ME_HPP

 #include <benchmark/benchmark.h>
 #include <stan/math.hpp>
 #include <utility>

 // True until toss_me has performed its one-time work.
 static bool needs_done = true;
 /**
  * One-time warm-up benchmark intended to fill up the stack allocator.
  *
  * On the first call it builds two `max_alloc` x `max_alloc` matrices of
  * `var`, multiplies them elementwise, sums the result and repeatedly calls
  * `grad()` inside the benchmark loop. Any later call does no autodiff work
  * but still runs the benchmark loop to completion, because returning from a
  * benchmark function without iterating over `state` is reported by Google
  * Benchmark as an error.
  *
  * @tparam max_alloc number of rows and columns of the matrices allocated
  * @param state benchmark state driving the iteration loop
  */
 template <int max_alloc>
 static void toss_me(benchmark::State& state) {
  using stan::math::sum;
  using stan::math::var;
  if (!needs_done) {
    // Already warmed up: consume the requested iterations and leave.
    for (auto _ : state) {
    }
    return;
  }
  needs_done = false;
  Eigen::Matrix<var, -1, -1> x(Eigen::MatrixXd::Random(max_alloc, max_alloc));
  Eigen::Matrix<var, -1, -1> y(Eigen::MatrixXd::Random(max_alloc, max_alloc));
  Eigen::Matrix<var, -1, -1> z = elt_multiply(x, y);
  var lp = sum(z);
  // Stop the compiler from discarding the expression that was just built.
  benchmark::DoNotOptimize(lp.vi_);
  for (auto _ : state) {
    lp.grad();
    // Zero the adjoints so every iteration starts from the same state.
    stan::math::set_zero_all_adjoints();
  }
  stan::math::recover_memory();
 }

 #endif
	#include <benchmark/benchmark.h>
	#include <stan/math.hpp>
	#include <utility>
	#include "toss_me.hpp"

	/**
	 * Benchmark `sum(elt_multiply(x, y))` followed by `grad()` for two column
	 * vectors of length `state.range(0)`.
	 *
	 * Only the expression evaluation and the gradient call sit between the two
	 * clock reads; the elapsed time is handed to the framework through
	 * `SetIterationTime`, so the benchmark is meant to be registered with
	 * `UseManualTime()`.
	 *
	 * @tparam T1 scalar type the first operand is promoted to
	 * @tparam T2 scalar type the second operand is promoted to
	 *   (the result is stored in a `var`, so at least one of T1/T2 is expected
	 *   to be `var`)
	 * @param state benchmark state; `state.range(0)` gives the vector length
	 */
	template <typename T1, typename T2>
	static void multiply_vector_vector(benchmark::State& state) {
	  using stan::math::var;
	  using stan::math::promote_scalar;
	  // Both inputs are column vectors, matching the benchmark's name.
	  Eigen::VectorXd x_val = Eigen::VectorXd::Random(state.range(0));
	  Eigen::VectorXd y_val = Eigen::VectorXd::Random(state.range(0));
	  for (auto _ : state) {
	    // Promotion to the requested scalar types happens before the timer starts.
	    auto x = promote_scalar<T1>(x_val);
	    auto y = promote_scalar<T2>(y_val);
	    auto start = std::chrono::high_resolution_clock::now();
	    var lp = sum(elt_multiply(x, y));
	    lp.grad();
	    // Keep the compiler from moving memory writes past the second clock read.
	    benchmark::ClobberMemory();
	    auto end = std::chrono::high_resolution_clock::now();
	    auto elapsed_seconds =
	      std::chrono::duration_cast<std::chrono::duration<double>>(end - start);

	    state.SetIterationTime(elapsed_seconds.count());
	    // Cleanup of the autodiff memory is done outside the timed region.
	    stan::math::recover_memory();
	    benchmark::ClobberMemory();
	  }
	}

	/**
	 * Benchmark `sum(elt_multiply(x, y))` followed by `grad()` for two square
	 * matrices with `state.range(0)` rows and columns.
	 *
	 * Only the expression evaluation and the gradient call sit between the two
	 * clock reads; the elapsed time is handed to the framework through
	 * `SetIterationTime`, so the benchmark is meant to be registered with
	 * `UseManualTime()`.
	 *
	 * @tparam T1 scalar type the first operand is promoted to
	 * @tparam T2 scalar type the second operand is promoted to
	 *   (the result is stored in a `var`, so at least one of T1/T2 is expected
	 *   to be `var`)
	 * @param state benchmark state; `state.range(0)` gives the matrix dimension
	 */
	template <typename T1, typename T2>
	static void multiply_matrix_matrix(benchmark::State& state) {
	  using stan::math::var;
	  using stan::math::promote_scalar;
	  Eigen::MatrixXd x_val = Eigen::MatrixXd::Random(state.range(0), state.range(0));
	  Eigen::MatrixXd y_val = Eigen::MatrixXd::Random(state.range(0), state.range(0));
	  for (auto _ : state) {
	    // Promotion to the requested scalar types happens before the timer starts.
	    auto x = promote_scalar<T1>(x_val);
	    auto y = promote_scalar<T2>(y_val);
	    auto start = std::chrono::high_resolution_clock::now();
	    var lp = sum(elt_multiply(x, y));
	    lp.grad();
	    // Keep the compiler from moving memory writes past the second clock read.
	    benchmark::ClobberMemory();
	    auto end = std::chrono::high_resolution_clock::now();
	    auto elapsed_seconds =
	      std::chrono::duration_cast<std::chrono::duration<double>>(end - start);

	    state.SetIterationTime(elapsed_seconds.count());
	    // Cleanup of the autodiff memory is done outside the timed region.
	    stan::math::recover_memory();
	    benchmark::ClobberMemory();
	  }
	}


	using stan::math::var;
	// Smallest and largest problem sizes swept by the benchmarks below.
	constexpr int start_val = 2;
	constexpr int end_val = 16384;
	// Size given to toss_me: 1.2 times the largest benchmark size, truncated.
	constexpr int end_mem = static_cast<int>(end_val * 1.2);
	// Registered ahead of the sized benchmarks.
	BENCHMARK_TEMPLATE(toss_me, end_mem);
	// Vector benchmarks: each size from start_val to end_val, doubling each step.
	BENCHMARK_TEMPLATE(multiply_vector_vector, double, var)
	    ->RangeMultiplier(2)
	    ->Range(start_val, end_val)
	    ->UseManualTime();
	BENCHMARK_TEMPLATE(multiply_vector_vector, var, double)
	    ->RangeMultiplier(2)
	    ->Range(start_val, end_val)
	    ->UseManualTime();
	BENCHMARK_TEMPLATE(multiply_vector_vector, var, var)
	    ->RangeMultiplier(2)
	    ->Range(start_val, end_val)
	    ->UseManualTime();
	// Matrix benchmarks: same sweep, the size is the matrix dimension.
	BENCHMARK_TEMPLATE(multiply_matrix_matrix, double, var)
	    ->RangeMultiplier(2)
	    ->Range(start_val, end_val)
	    ->UseManualTime();
	BENCHMARK_TEMPLATE(multiply_matrix_matrix, var, double)
	    ->RangeMultiplier(2)
	    ->Range(start_val, end_val)
	    ->UseManualTime();
	BENCHMARK_TEMPLATE(multiply_matrix_matrix, var, var)
	    ->RangeMultiplier(2)
	    ->Range(start_val, end_val)
	    ->UseManualTime();
	BENCHMARK_MAIN();
	#ifndef CALLBACK_TOSS_ME_HPP
	#define CALLBACK_TOSS_ME_HPP

	#include <benchmark/benchmark.h>
	#include <stan/math.hpp>
	#include <utility>

	// True until toss_me has performed its one-time work.
	static bool needs_done = true;
	/**
	 * One-time warm-up benchmark intended to fill up the stack allocator.
	 *
	 * On the first call it builds two `max_alloc` x `max_alloc` matrices of
	 * `var`, multiplies them elementwise, sums the result and repeatedly calls
	 * `grad()` inside the benchmark loop. Any later call does no autodiff work
	 * but still runs the benchmark loop to completion, because returning from a
	 * benchmark function without iterating over `state` is reported by Google
	 * Benchmark as an error.
	 *
	 * @tparam max_alloc number of rows and columns of the matrices allocated
	 * @param state benchmark state driving the iteration loop
	 */
	template <int max_alloc>
	static void toss_me(benchmark::State& state) {
	  using stan::math::sum;
	  using stan::math::var;
	  if (!needs_done) {
	    // Already warmed up: consume the requested iterations and leave.
	    for (auto _ : state) {
	    }
	    return;
	  }
	  needs_done = false;
	  Eigen::Matrix<var, -1, -1> x(Eigen::MatrixXd::Random(max_alloc, max_alloc));
	  Eigen::Matrix<var, -1, -1> y(Eigen::MatrixXd::Random(max_alloc, max_alloc));
	  Eigen::Matrix<var, -1, -1> z = elt_multiply(x, y);
	  var lp = sum(z);
	  // Stop the compiler from discarding the expression that was just built.
	  benchmark::DoNotOptimize(lp.vi_);
	  for (auto _ : state) {
	    lp.grad();
	    // Zero the adjoints so every iteration starts from the same state.
	    stan::math::set_zero_all_adjoints();
	  }
	  stan::math::recover_memory();
	}

	#endif