As the title says, I've got a custom UnaryView function that gets called multiple times in different operations (mostly multiplications with other matrices). For example:
MatrixXd out = mat1.cview() * mat2;
MatrixXd out2 = mat1.cview() * mat3.transpose();
Would it be faster to first copy the custom view into a separate matrix and use that instead? For example:
MatrixXd mat1_dbl = mat1.cview();
MatrixXd out = mat1_dbl * mat2;
MatrixXd out2 = mat1_dbl * mat3.transpose();
Basically, is repeatedly using the UnaryView slower than copying to a matrix and using that instead?
I should have done my own benchmarking first. Google Benchmark shows that copying the view into a matrix first is faster, by roughly 5% in this test (about 139 ms vs. 147 ms per iteration):
2019-04-09 20:55:55
Running ./CView
Run on (16 X 4053.06 MHz CPU s)
CPU Caches:
L1 Data 32K (x8)
L1 Instruction 64K (x8)
L2 Unified 512K (x8)
L3 Unified 8192K (x2)
--------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------
UnaryView_Repeat 147390919 ns 147385796 ns 5
UnaryView_Copy 139456051 ns 139451409 ns 5
Tested with:
#include <stan/math/prim/mat/fun/Eigen.hpp>
#include <stan/math/fwd/mat.hpp>
#include <stan/math/fwd/core.hpp>
#include <benchmark/benchmark.h>
// Benchmark: evaluate an expression that uses m_fd1.val_() directly in each of
// its three terms, without first copying the view into a plain MatrixXd.
// Counterpart of UnaryView_Copy, which materializes the view once per iteration.
static void UnaryView_Repeat(benchmark::State& state) {
  using Eigen::MatrixXd;
  using stan::math::matrix_fd;

  // All operands are square matrices of this size.
  const int dim = 1000;

  matrix_fd m_fd1(dim, dim);
  m_fd1.val_() = MatrixXd::Random(dim, dim);
  m_fd1.d_() = MatrixXd::Random(dim, dim);
  MatrixXd m_d2 = MatrixXd::Random(dim, dim);

  for (auto _ : state) {
    // Allocated inside the loop on purpose: UnaryView_Copy does the same, so
    // the allocation cost cancels out when the two benchmarks are compared.
    MatrixXd out(dim, dim);
    out = m_fd1.val_() * m_d2
          + m_fd1.val_().transpose() * m_d2
          + m_fd1.val_().array().exp().matrix();

    // `out` is never read afterwards; without these barriers the optimizer is
    // free to discard (part of) the computation being timed.
    double* out_data = out.data();
    benchmark::DoNotOptimize(out_data);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(UnaryView_Repeat);
// Benchmark: copy m_fd1.val_() into a plain MatrixXd once per iteration and use
// that copy in each of the three terms of the expression.
// Counterpart of UnaryView_Repeat, which uses the view directly in every term.
static void UnaryView_Copy(benchmark::State& state) {
  using Eigen::MatrixXd;
  using stan::math::matrix_fd;

  // All operands are square matrices of this size.
  const int dim = 1000;

  matrix_fd m_fd1(dim, dim);
  m_fd1.val_() = MatrixXd::Random(dim, dim);
  m_fd1.d_() = MatrixXd::Random(dim, dim);
  MatrixXd m_d2 = MatrixXd::Random(dim, dim);

  for (auto _ : state) {
    // Allocated inside the loop on purpose: UnaryView_Repeat does the same, so
    // the allocation cost cancels out when the two benchmarks are compared.
    MatrixXd out(dim, dim);

    // The copy is deliberately inside the timed region: its cost is exactly
    // what this benchmark weighs against re-evaluating the view.
    MatrixXd m_fd1_val = m_fd1.val_();
    out = m_fd1_val * m_d2
          + m_fd1_val.transpose() * m_d2
          + m_fd1_val.array().exp().matrix();

    // `out` is never read afterwards; without these barriers the optimizer is
    // free to discard (part of) the computation being timed.
    double* out_data = out.data();
    benchmark::DoNotOptimize(out_data);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(UnaryView_Copy);
// Google Benchmark macro that provides the program's main(), which runs the
// benchmarks registered above with BENCHMARK(...).
BENCHMARK_MAIN();