I decided to check how the data arrangement in a matrix influences the performance of simple operations. I wrote a simple row summation algorithm using `Eigen::Matrix`
as data storage. I thought that RowMajor storage should demonstrate better performance due to better cache utilization.
I used the `g++` compiler with the `-O2` option, and it gives me the following results:
ColMajor: 40791546 µs
RowMajor: 28790948 µs
That looks reasonable. But with `-O3` it gives me a really strange difference:
ColMajor: 10353619 µs
RowMajor: 28359348 µs
And it looks like ColMajor becomes really fast with `-O3`. Why does switching from `-O2` to `-O3` change performance so drastically?
My CPU: intel i7-6700K, gcc version: 7.5.0-3ubuntu1~19.10
My "benchmark":
#include <iostream>#include <vector>#include <chrono>#include "Eigen/Core"template<typename DerivedMat, typename DerivedRes>void runTest(const Eigen::MatrixBase<DerivedMat> &mat, Eigen::MatrixBase<DerivedRes> &res) { const int64_t nRows = mat.rows(); const int64_t nCols = mat.cols(); for(int64_t row = 0; row < nRows; ++row){ for(int64_t col = 0; col < nCols; ++col){ res(row, 0) += mat(row, col); } }}const int64_t nRows = 300;const int64_t nCols = 5000;const int nAttempts = 20000;template<int Alignment>void bench() { Eigen::Matrix<float, -1, -1, Alignment> mat(nRows, nCols); srand(42); mat.setRandom(); Eigen::VectorXf res(nRows); res.setZero(); std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); for(int iter = 0; iter < nAttempts; ++iter) runTest(mat, res); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::cout << "Elapsed "<< std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << "[µs]"<< std::endl;}int main() { bench<Eigen::ColMajor>(); //bench<Eigen::RowMajor>(); return 0;}