SteveBronder · September 23, 2020 01:17
diff --git a/lgamma_par_bench.cpp b/lgamma_par_bench.cpp
 #include <chrono>

 #include <benchmark/benchmark.h>
 #include <stan/math.hpp>

 /**
  * Benchmark `stan::math::parallel_map` applying `lgamma` to a vector of
  * doubles.
  *
  * `state.range(0)` is the length of the random input vector.
  * `state.range(1)` is forwarded to `parallel_map` as its third argument
  * (presumably the grain size -- confirm against the `parallel_map` API).
  *
  * @param state Google Benchmark state driving the timing loop.
  */
 static void exp_ParMap_dbl(benchmark::State &state) {
   Eigen::Matrix<double, -1, 1> m_d = Eigen::VectorXd::Random(state.range(0));
   // Functor defining how inputs should be indexed
   auto ind_f = [](auto&& i, auto&& j, auto&& fun, auto&& x) {
     return fun(x.segment(i, j));
   };

   // Functor defining function to be applied to indexed arguments
   auto f = [](auto&& x) { return stan::math::lgamma(x); };

   for (auto _ : state) {
     auto result = stan::math::parallel_map(f, ind_f, state.range(1), m_d);
     // Keep the otherwise-unused result observable so the optimizer cannot
     // discard the work being measured.
     benchmark::DoNotOptimize(result);
     benchmark::ClobberMemory();
   }
 }

 /**
  * Benchmark `lgamma` applied through `stan::math::parallel_map` to a vector
  * of `var`, followed by a gradient evaluation.
  *
  * Uses manual timing: only the `parallel_map` call, the `sum` and the
  * `grad()` call are timed. Building the input vector and
  * `recover_memory()` fall outside the timed region.
  *
  * `state.range(0)` is the input length; `state.range(1)` is forwarded to
  * `parallel_map` as its third argument (presumably the grain size --
  * confirm against the `parallel_map` API).
  *
  * @param state Google Benchmark state driving the timing loop.
  */
 static void exp_ParMap_var(benchmark::State &state) {
   using stan::math::var;
   // Functor defining how inputs should be indexed
   auto ind_f = [](auto&& i, auto&& fun, auto&& x) {
     return fun(x.coeffRef(i));
   };

   // Functor defining function to be applied to indexed arguments
   auto f = [](auto&& x) { return stan::math::lgamma(x); };

   for (auto _ : state) {
     Eigen::Matrix<var, -1, 1> m_d = Eigen::VectorXd::Random(state.range(0));
     // steady_clock is monotonic; high_resolution_clock may alias
     // system_clock, which can jump and corrupt interval measurements.
     const auto start = std::chrono::steady_clock::now();
     Eigen::Matrix<var, -1, 1> m_res
         = stan::math::parallel_map(f, ind_f, state.range(1), m_d);
     // Summing gives a single var to call grad() on.
     var sum_toss = stan::math::sum(m_res);
     sum_toss.grad();
     const auto end = std::chrono::steady_clock::now();
     const std::chrono::duration<double> elapsed_seconds = end - start;
     state.SetIterationTime(elapsed_seconds.count());
     stan::math::recover_memory();
     benchmark::ClobberMemory();
   }
 }

 /**
  * Baseline benchmark: `stan::math::lgamma` called directly on a vector of
  * doubles, with no `parallel_map` involved.
  *
  * `state.range(0)` is the length of the random input vector.
  *
  * @param state Google Benchmark state driving the timing loop.
  */
 static void exp_loop_dbl(benchmark::State &state) {
   Eigen::Matrix<double, -1, 1> m_d = Eigen::VectorXd::Random(state.range(0));
   Eigen::Matrix<double, -1, 1> m_res(state.range(0));

   for (auto _ : state) {
     m_res = stan::math::lgamma(m_d);
     // The result is never read; make the stores observable so the
     // optimizer cannot drop the computation being measured.
     benchmark::DoNotOptimize(m_res.data());
     benchmark::ClobberMemory();
   }
 }

 /**
  * Baseline benchmark: `stan::math::lgamma` called directly on a vector of
  * `var`, followed by a gradient evaluation.
  *
  * Uses manual timing: only the `lgamma` call, the `sum` and the `grad()`
  * call are timed. Building the input vector and `recover_memory()` fall
  * outside the timed region.
  *
  * `state.range(0)` is the length of the random input vector.
  *
  * @param state Google Benchmark state driving the timing loop.
  */
 static void exp_loop_var(benchmark::State &state) {
   using stan::math::var;

   for (auto _ : state) {
     Eigen::Matrix<var, -1, 1> m_d = Eigen::VectorXd::Random(state.range(0));
     // steady_clock is monotonic; high_resolution_clock may alias
     // system_clock, which can jump and corrupt interval measurements.
     const auto start = std::chrono::steady_clock::now();
     Eigen::Matrix<var, -1, 1> m_res = stan::math::lgamma(m_d);
     // this is just a dumb easy way to calculate grad.
     var sum_toss = stan::math::sum(m_res);
     sum_toss.grad();
     const auto end = std::chrono::steady_clock::now();
     const std::chrono::duration<double> elapsed_seconds = end - start;
     state.SetIterationTime(elapsed_seconds.count());
     stan::math::recover_memory();
     benchmark::ClobberMemory();
   }
 }

 /**
  * Just to fill up the stack allocator.
  *
  * Builds two 9000 x 9000 matrices of `var` from random doubles, then
  * repeatedly calls `grad()` on an unrelated `var` and zeroes all adjoints.
  * Presumably this grows the autodiff memory before the real benchmarks
  * run -- confirm against the Stan Math allocator behaviour.
  *
  * @param state Google Benchmark state driving the timing loop.
  */
 static void toss_me(benchmark::State &state) {
   using stan::math::var;
   Eigen::Matrix<double, -1, -1> x_vals = Eigen::MatrixXd::Random(9000, 9000);
   Eigen::Matrix<double, -1, -1> y_vals = Eigen::MatrixXd::Random(9000, 9000);
   // x and y are never read; they exist only for the var construction.
   Eigen::Matrix<var, -1, -1> x = x_vals;
   Eigen::Matrix<var, -1, -1> y = y_vals;
   var lp = 0;
   benchmark::DoNotOptimize(lp.vi_);
   for (auto _ : state) {
     lp.grad();
     stan::math::set_zero_all_adjoints();
   }
   stan::math::recover_memory();
 }

 // Lower and upper bound of the first benchmark argument (state.range(0)),
 // which every benchmark below uses as the input vector length.
 const size_t start_val = 128;
 const size_t end_val = 128384;

 // Registered ahead of the measured benchmarks.
 BENCHMARK(toss_me);
 using stan::math::var;

 // parallel_map variants: the second range {2, 4096} supplies state.range(1).
 BENCHMARK(exp_ParMap_var)
     ->RangeMultiplier(2)
     ->Ranges({{start_val, end_val}, {2, 4096}})
     ->UseManualTime();
 BENCHMARK(exp_ParMap_dbl)
     ->RangeMultiplier(2)
     ->Ranges({{start_val, end_val}, {2, 4096}});

 // Direct lgamma baselines: swept over the vector length only.
 BENCHMARK(exp_loop_dbl)
     ->RangeMultiplier(2)
     ->Range(start_val, end_val);
 BENCHMARK(exp_loop_var)
     ->RangeMultiplier(2)
     ->Range(start_val, end_val)
     ->UseManualTime();

 BENCHMARK_MAIN();
	#include <benchmark/benchmark.h>
	#include <stan/math.hpp>

	/**
	 * Benchmark `stan::math::parallel_map` applying `lgamma` to a vector of
	 * doubles.
	 *
	 * `state.range(0)` is the length of the random input vector.
	 * `state.range(1)` is forwarded to `parallel_map` as its third argument
	 * (presumably the grain size -- confirm against the `parallel_map` API).
	 *
	 * @param state Google Benchmark state driving the timing loop.
	 */
	static void exp_ParMap_dbl(benchmark::State &state) {
	  Eigen::Matrix<double, -1, 1> m_d = Eigen::VectorXd::Random(state.range(0));
	  // Functor defining how inputs should be indexed
	  auto ind_f = [](auto&& i, auto&& j, auto&& fun, auto&& x) {
	    return fun(x.segment(i, j));
	  };

	  // Functor defining function to be applied to indexed arguments
	  auto f = [](auto&& x) { return stan::math::lgamma(x); };

	  for (auto _ : state) {
	    auto result = stan::math::parallel_map(f, ind_f, state.range(1), m_d);
	    // Keep the otherwise-unused result observable so the optimizer cannot
	    // discard the work being measured.
	    benchmark::DoNotOptimize(result);
	    benchmark::ClobberMemory();
	  }
	}

	/**
	 * Benchmark `lgamma` applied through `stan::math::parallel_map` to a vector
	 * of `var`, followed by a gradient evaluation.
	 *
	 * Uses manual timing: only the `parallel_map` call, the `sum` and the
	 * `grad()` call are timed. Building the input vector and
	 * `recover_memory()` fall outside the timed region.
	 *
	 * `state.range(0)` is the input length; `state.range(1)` is forwarded to
	 * `parallel_map` as its third argument (presumably the grain size --
	 * confirm against the `parallel_map` API).
	 *
	 * @param state Google Benchmark state driving the timing loop.
	 */
	static void exp_ParMap_var(benchmark::State &state) {
	  using stan::math::var;
	  // Functor defining how inputs should be indexed
	  auto ind_f = [](auto&& i, auto&& fun, auto&& x) {
	    return fun(x.coeffRef(i));
	  };

	  // Functor defining function to be applied to indexed arguments
	  auto f = [](auto&& x) { return stan::math::lgamma(x); };

	  for (auto _ : state) {
	    Eigen::Matrix<var, -1, 1> m_d = Eigen::VectorXd::Random(state.range(0));
	    // steady_clock is monotonic; high_resolution_clock may alias
	    // system_clock, which can jump and corrupt interval measurements.
	    const auto start = std::chrono::steady_clock::now();
	    Eigen::Matrix<var, -1, 1> m_res
	        = stan::math::parallel_map(f, ind_f, state.range(1), m_d);
	    // Summing gives a single var to call grad() on.
	    var sum_toss = stan::math::sum(m_res);
	    sum_toss.grad();
	    const auto end = std::chrono::steady_clock::now();
	    const std::chrono::duration<double> elapsed_seconds = end - start;
	    state.SetIterationTime(elapsed_seconds.count());
	    stan::math::recover_memory();
	    benchmark::ClobberMemory();
	  }
	}

	/**
	 * Baseline benchmark: `stan::math::lgamma` called directly on a vector of
	 * doubles, with no `parallel_map` involved.
	 *
	 * `state.range(0)` is the length of the random input vector.
	 *
	 * @param state Google Benchmark state driving the timing loop.
	 */
	static void exp_loop_dbl(benchmark::State &state) {
	  Eigen::Matrix<double, -1, 1> m_d = Eigen::VectorXd::Random(state.range(0));
	  Eigen::Matrix<double, -1, 1> m_res(state.range(0));

	  for (auto _ : state) {
	    m_res = stan::math::lgamma(m_d);
	    // The result is never read; make the stores observable so the
	    // optimizer cannot drop the computation being measured.
	    benchmark::DoNotOptimize(m_res.data());
	    benchmark::ClobberMemory();
	  }
	}

	/**
	 * Baseline benchmark: `stan::math::lgamma` called directly on a vector of
	 * `var`, followed by a gradient evaluation.
	 *
	 * Uses manual timing: only the `lgamma` call, the `sum` and the `grad()`
	 * call are timed. Building the input vector and `recover_memory()` fall
	 * outside the timed region.
	 *
	 * `state.range(0)` is the length of the random input vector.
	 *
	 * @param state Google Benchmark state driving the timing loop.
	 */
	static void exp_loop_var(benchmark::State &state) {
	  using stan::math::var;

	  for (auto _ : state) {
	    Eigen::Matrix<var, -1, 1> m_d = Eigen::VectorXd::Random(state.range(0));
	    // steady_clock is monotonic; high_resolution_clock may alias
	    // system_clock, which can jump and corrupt interval measurements.
	    const auto start = std::chrono::steady_clock::now();
	    Eigen::Matrix<var, -1, 1> m_res = stan::math::lgamma(m_d);
	    // this is just a dumb easy way to calculate grad.
	    var sum_toss = stan::math::sum(m_res);
	    sum_toss.grad();
	    const auto end = std::chrono::steady_clock::now();
	    const std::chrono::duration<double> elapsed_seconds = end - start;
	    state.SetIterationTime(elapsed_seconds.count());
	    stan::math::recover_memory();
	    benchmark::ClobberMemory();
	  }
	}

	/**
	 * Just to fill up the stack allocator.
	 *
	 * Builds two 9000 x 9000 matrices of `var` from random doubles, then
	 * repeatedly calls `grad()` on an unrelated `var` and zeroes all adjoints.
	 * Presumably this grows the autodiff memory before the real benchmarks
	 * run -- confirm against the Stan Math allocator behaviour.
	 *
	 * @param state Google Benchmark state driving the timing loop.
	 */
	static void toss_me(benchmark::State &state) {
	  using stan::math::var;
	  Eigen::Matrix<double, -1, -1> x_vals = Eigen::MatrixXd::Random(9000, 9000);
	  Eigen::Matrix<double, -1, -1> y_vals = Eigen::MatrixXd::Random(9000, 9000);
	  // x and y are never read; they exist only for the var construction.
	  Eigen::Matrix<var, -1, -1> x = x_vals;
	  Eigen::Matrix<var, -1, -1> y = y_vals;
	  var lp = 0;
	  benchmark::DoNotOptimize(lp.vi_);
	  for (auto _ : state) {
	    lp.grad();
	    stan::math::set_zero_all_adjoints();
	  }
	  stan::math::recover_memory();
	}

	// Lower and upper bound of the first benchmark argument (state.range(0)),
	// which every benchmark below uses as the input vector length.
	const size_t start_val = 128;
	const size_t end_val = 128384;

	// Registered ahead of the measured benchmarks.
	BENCHMARK(toss_me);
	using stan::math::var;

	// parallel_map variants: the second range {2, 4096} supplies state.range(1).
	BENCHMARK(exp_ParMap_var)
	    ->RangeMultiplier(2)
	    ->Ranges({{start_val, end_val}, {2, 4096}})
	    ->UseManualTime();
	BENCHMARK(exp_ParMap_dbl)
	    ->RangeMultiplier(2)
	    ->Ranges({{start_val, end_val}, {2, 4096}});

	// Direct lgamma baselines: swept over the vector length only.
	BENCHMARK(exp_loop_dbl)
	    ->RangeMultiplier(2)
	    ->Range(start_val, end_val);
	BENCHMARK(exp_loop_var)
	    ->RangeMultiplier(2)
	    ->Range(start_val, end_val)
	    ->UseManualTime();

	BENCHMARK_MAIN();