I have the Eigen C++ code below, which performs a squaredNorm calculation about 10 million times.
Is there any way to make it more robust and/or faster?
#include <iostream>
#include <mutex>
#include <vector>

#include <Eigen/Core>
#include <opencv2/opencv.hpp>
#include <tbb/parallel_for.h>
#include "tbb/tbb.h"
int main(){
int numberOFdata = 10000008;
Eigen::MatrixXf feat = Eigen::MatrixXf::Random(numberOFdata,512);
Eigen::MatrixXf b_cmp= Eigen::MatrixXf::Random(1,512);
int count_feature = feat.rows();
std::vector<int> found_number ;
std::mutex mutex1;
for (int loop = 0 ; loop<16 ; loop++){
double start_1 = static_cast<double>(cv::getTickCount());
tbb::affinity_partitioner ap;
tbb::parallel_for( tbb::blocked_range<int>(0,count_feature),
[&](tbb::blocked_range<int> r )
{
for (int i=r.begin(); i<r.end(); ++i)
{
auto distance = ( feat.row(i)- b_cmp ).squaredNorm();
if (distance < 0.5) {
mutex1.lock();
found_number.push_back(i);
mutex1.unlock();
}
}
},ap);
double timefin = ((double)cv::getTickCount() - start_1) / cv::getTickFrequency();
std::cout << count_feature << " TOTAL : " << timefin << std::endl;
}
}
Compile flags :
-Xpreprocessor -std=c++11 -fopenmp -pthread -O3 -mavx2 -march=native -funroll-loops -fpermissive
Eigen version 3.3.7; TBB, OpenCV, and Eigen are linked.
You can remove OpenCV and use a different way of measuring elapsed time.
Thanks.
You should be faster by a factor of about 4 if you store feat
in the same order in which you access it (i.e., Eigen::RowMajor
in your case).
Minimal example removing all non-Eigen related things:
// Minimal single-threaded benchmark: 16 times over, computes the squared
// Euclidean distance between every row of `feat` and `b_cmp`, records the
// indices of rows closer than 0.5, and prints the elapsed seconds.
int numberOFdata = 10000008;
// Row-major storage keeps each row's 512 coefficients contiguous, matching
// the row-by-row access in the loop below.
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> feat = Eigen::MatrixXf::Random(numberOFdata, 512);
Eigen::RowVectorXf b_cmp = Eigen::MatrixXf::Random(1, 512);
// Eigen reports sizes as Eigen::Index; the conversion to int is made explicit
// because the row indices are stored in a std::vector<int>.
int count_feature = static_cast<int>(feat.rows());
std::vector<int> found_number;
for (int loop = 0; loop < 16; loop++) {
    // Start every repetition from an empty result set (done before the timer
    // starts); otherwise hits from earlier repetitions accumulate as duplicates.
    found_number.clear();
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < count_feature; ++i) {
        float distance = (feat.row(i) - b_cmp).squaredNorm();
        if (distance < 0.5f) {
            found_number.push_back(i);
        }
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    std::cout << count_feature << " TOTAL : " <<
        diff.count() << std::endl;
}
Godbolt-Demo (reduced dimension of feat
due to memory-limitations): https://godbolt.org/z/b6r5K4Yxv