I'm currently computing a small quantity for each value of a big matrix (millions of rows, number of columns < 1000) while considering each row independently.
More precisely, for each value M(i,j) in each row i, column j of this matrix, the quantity is simply [ M(i,j) - mean(i,s) ] / std(i,s) where s is the subset s in M(i,:) - j in other words, s is the subset of all values of row i without value j.
I compared two implementations, one in C-style array and one in Armadillo, and Armadillo is roughly twice slower in termes of execution time. I would expect a similar or slighty slower execution time, but plain C arrays seem to dramatically improve the performance.
Is there any particular reason or somthing that I missed somewhere? Here is a example compiled with: -O2 -lstdc++ -DARMA_DONT_USE_WRAPPER -lopenblas -llapack -lm
. Also tried to use ARMA_NO_DEBUG
without success.
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <armadillo>
#include <chrono>
using namespace std::chrono;
/***************************
* main()
***************************/
int main( int argc, char *argv[] )
{
unsigned nrows = 2000000; //number of rows
unsigned ncols = 100; //number of cols
const arma::mat huge_mat = arma::randn(nrows, ncols); //create huge matrix
const arma::uvec vec = arma::linspace<arma::uvec>( 0, huge_mat.n_cols-1, huge_mat.n_cols); //create a vector of [0,...,n]
arma::rowvec inds = arma::zeros<arma::rowvec>( huge_mat.n_cols-1 ); //-1 since we remove only one value at each step.
arma::colvec simuT = arma::zeros<arma::colvec>( ncols ); //let's store the results in this simuT vector.
high_resolution_clock::time_point t1 = high_resolution_clock::now();
//compute some normalization over each value of line of this huge matrix:
for(unsigned i=0; i < nrows; i++) {
const arma::rowvec current_line = huge_mat.row(i); //extract current line
//for each observation in current_line:
for(unsigned j=0; j < ncols; j++) {
//Take care of side effects first:
if( j == 0 )
inds = current_line(arma::span(1, ncols-1));
else
if( j == 1 ) {
inds(0) = current_line(0);
inds(arma::span(1, ncols-2)) = current_line( arma::span(2, ncols-1) );
} else
inds(arma::span(0, j-1)) = current_line( arma::span(0, j-1) );
//Let's do some computation: huge_mat(i,j) - mean[huge_mat(i,:)] / std([huge_mat(i,:)]) //can compute the mean and std first... for each line.
simuT(j) = (current_line(j) - arma::mean(inds)) / ( std::sqrt( 1+1/((double) ncols-1) ) * arma::stddev(inds) );
}
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = duration_cast<seconds>( t2 - t1 ).count();
std::cout << "ARMADILLO: " << duration << " secs\n";
//------------------PLAIN C Array
double *Mat_full;
double *output;
unsigned int i,j,k;
double mean=0, stdd=0;
double sq_diff_sum = 0, sum=0;
double diff = 0;
Mat_full = (double *) malloc(ncols * nrows * sizeof(double));
output = (double *) malloc(nrows * ncols * sizeof(double));
std::vector< std::vector<double> > V(huge_mat.n_rows);
//Some UGLY copy from arma::mat to double* using a vector:
for (size_t i = 0; i < huge_mat.n_rows; ++i)
V[i] = arma::conv_to< std::vector<double> >::from(huge_mat.row(i));
//then dump to Mat_full array:
for (i=0; i < V.size(); i++)
for (j=0; j < V[i].size(); j++)
Mat_full[i + huge_mat.n_rows * j] = V[i][j];
t1 = high_resolution_clock::now();
for(i=0; i < nrows; i++)
for(j=0; j < ncols; j++)
{
//compute mean of subset-------------------
sum = 0;
for(k = 0; k < ncols; k++)
if(k!=j)
{
sum = sum + Mat_full[i+k*nrows];
}
mean = sum / (ncols-1);
//compute standard deviation of subset-----
sq_diff_sum = 0;
for(k = 0; k < ncols; k++)
if(k!=j)
{
diff = Mat_full[i+k*nrows] - mean;
sq_diff_sum += diff * diff;
}
stdd = sqrt(sq_diff_sum / (ncols-2));
//export to plain C array:
output[i*ncols+j] = (Mat_full[i+j*nrows] - mean) / (sqrt(1+1/(((double) ncols)-1))*stdd);
}
t2 = high_resolution_clock::now();
duration = duration_cast<seconds>( t2 - t1 ).count();
std::cout << "C ARRAY: " << duration << " secs\n";
}
In particular the calls to arma::mean and arma::stddev seem to perform poorly when comparing execution times. I did not perform any in-depth analyse of the size-effect over performance, but it seems that for small values of nrows
the plain C tends to be (very much) faster. For a simple test using this
setup i got:
ARMADILLO: 111 secs
C ARRAY: 79 secs
in execution time.
EDIT
Here is modification where we work column-wise instead of row-wise and treat each column independently, as suggested by @rubenvb and @mtall. The resulting execution time slightly is decreased (ARMADILLO: 104 secs
now), thus showing some improvments over working row-wise:
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <armadillo>
#include <chrono>
using namespace std::chrono;
/***************************
* main()
***************************/
int main( int argc, char *argv[] )
{
unsigned nrows = 100; //number of rows
unsigned ncols = 2000000; //number of cols
const arma::mat huge_mat = arma::randn(nrows, ncols); //create huge matrix
const arma::uvec vec = arma::linspace<arma::uvec>( 0, huge_mat.n_rows-1, huge_mat.n_rows); //create a vector of [0,...,n]
arma::colvec inds = arma::zeros<arma::colvec>( huge_mat.n_rows-1 ); //-1 since we remove only one value at each step.
arma::rowvec simuT = arma::zeros<arma::rowvec>( nrows ); //let's store the results in this simuT vector.
high_resolution_clock::time_point t1 = high_resolution_clock::now();
//compute some normalization over each value of line of this huge matrix:
for(unsigned i=0; i < ncols; i++) {
const arma::colvec current_line = huge_mat.col(i); //extract current line
//for each observation in current_line:
for(unsigned j=0; j < nrows; j++) {
//Take care of side effects first:
if( j == 0 )
inds = current_line(arma::span(1, nrows-1));
else
if( j == 1 ) {
inds(0) = current_line(0);
inds(arma::span(1, nrows-2)) = current_line( arma::span(2, nrows-1) );
} else
inds(arma::span(0, j-1)) = current_line( arma::span(0, j-1) );
//Let's do some computation: huge_mat(i,j) - mean[huge_mat(i,:)] / std([huge_mat(i,:)]) //can compute the mean and std first... for each line.
simuT(j) = (current_line(j) - arma::mean(inds)) / ( std::sqrt( 1+1/((double) nrows-1) ) * arma::stddev(inds) );
}
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
auto duration = duration_cast<seconds>( t2 - t1 ).count();
std::cout << "ARMADILLO: " << duration << " secs\n";
}
The reason is that Armadillo uses column-major ordering in mat, while your C array uses row-major ordering. This is kind of a big deal because your processor can use instruction vectorization to process multiple elements at once, where this requires contiguous memory chunks.
To verify whether this is the cause, do the same calculation but for columns instead of rows, and check the difference.