I was curious and did a little benchmark to determine the performance delta between primitive types such as int
or float
and user types.
I created a template class Var
and gave it some inline arithmetic operators. The test consisted of running this loop for both the primitive and Var
vectors:
// Benchmark body: fill both input vectors from the index, then store either
// the sum (odd i) or the difference (even i) into the output vector.
for (unsigned i = 0; i < 1000; ++i) {
in1[i] = i;
// i is unsigned, so -i is computed in unsigned arithmetic (wraps) before being stored.
in2[i] = -i;
out[i] = (i % 2) ? in1[i] + in2[i] : in2[i] - in1[i];
}
I was quite surprised with the results, turns out my Var
class is faster most of the time: with int, that loop took about 5700 nsec less on average with the class. Out of 3000 runs, int was faster 11 times vs. Var
which was faster 2989 times. Similar results with float
, where Var
is 15100 nsec faster than float in 2991 of the runs.
Shouldn't primitive types be faster?
Edit: Compiler is a rather ancient mingw 4.4.0, build options are the defaults of QtCreator, no optimizations:
qmake call: qmake.exe C:\...\untitled15.pro -r -spec win32-g++ "CONFIG+=release"
OK, posting full source, platform is 64 bit Win7, 4 GB DDR2-800, Core2Duo@3Ghz
#include <QTextStream>
#include <QVector>
#include <QElapsedTimer>
/**
 * Thin wrapper around a single value of type T.
 *
 * Provides addition and subtraction against another Var<T> or a plain T
 * (both returning a plain T), assignment from a plain T, and direct access
 * to the wrapped value through value().
 */
template<typename T>
class Var{
public:
    /// Value-initialises the wrapped value (zero for arithmetic T) so a
    /// default-constructed Var never holds an indeterminate value.
    Var() : var() {}

    /// Wraps @p val. Deliberately not explicit: existing callers rely on
    /// implicit conversion from T (e.g. assigning a T-valued expression).
    Var(T val) : var(val) {}

    /// @return the wrapped value plus the value wrapped by @p other.
    inline T operator+(const Var& other) const
    {
        return var + other.var;
    }

    /// @return the wrapped value minus the value wrapped by @p other.
    inline T operator-(const Var& other) const
    {
        return var - other.var;
    }

    /// @return the wrapped value plus @p other. Taking a const reference
    /// lets literals and temporaries be used as the right-hand operand.
    inline T operator+(const T& other) const
    {
        return var + other;
    }

    /// @return the wrapped value minus @p other.
    inline T operator-(const T& other) const
    {
        return var - other;
    }

    /// Replaces the wrapped value with @p other.
    /// @return *this, so assignments can be chained like built-in types.
    inline Var& operator=(const T& other)
    {
        var = other;
        return *this;
    }

    /// @return a mutable reference to the wrapped value.
    inline T& value()
    {
        return var;
    }

    /// @return a read-only reference to the wrapped value (for const objects).
    inline const T& value() const
    {
        return var;
    }

private:
    T var;  ///< The wrapped value.
};
// Benchmark driver: times the same fill-and-combine loop over QVector<double>
// and QVector<Var<double> >, averages `repeat` passes per round, and reports
// which of the two was faster in each of `rounds` rounds.
int main()
{
    QTextStream cout(stdout);
    QElapsedTimer timer;

    const unsigned count = 1000000;  // elements per vector
    const unsigned repeat = 10;      // timed passes averaged per round
    const int rounds = 5;            // number of rounds compared

    QVector<double> pin1(count), pin2(count), pout(count);
    QVector<Var<double> > vin1(count), vin2(count), vout(count);

    unsigned pcount = 0, vcount = 0, ecount = 0;

    for (int cc = 0; cc < rounds; ++cc)
    {
        // Accumulators are per round; carrying them over would mix the
        // previous round's average into this round's sum.
        // 64-bit, because nsecsElapsed() returns a 64-bit count.
        long long pAcc = 0, vAcc = 0;

        for (unsigned c = 0; c < repeat; ++c)
        {
            timer.restart();
            for (unsigned i = 0; i < count; ++i)
            {
                pin1[i] = i;
                // Negate after converting to double: -i on an unsigned wraps
                // to a large positive value instead of producing -i.
                pin2[i] = -static_cast<double>(i);
                pout[i] = (i % 2) ? pin1[i] + pin2[i] : pin2[i] - pin1[i];
            }
            const long long t1 = timer.nsecsElapsed();
            // Printing happens after the reading is taken and before the
            // timer is restarted, so it is not part of either measurement.
            cout << t1 << endl;

            timer.restart();
            for (unsigned i = 0; i < count; ++i)
            {
                vin1[i] = i;
                vin2[i] = -static_cast<double>(i);
                vout[i] = (i % 2) ? vin1[i] + vin2[i] : vin2[i] - vin1[i];
            }
            const long long t2 = timer.nsecsElapsed();
            cout << t2 << endl;

            pAcc += t1;
            vAcc += t2;
        }

        pAcc /= repeat;
        vAcc /= repeat;

        if (pAcc < vAcc) {
            cout << "primitive was faster" << endl;
            pcount++;
        }
        else if (pAcc > vAcc) {
            cout << "var was faster" << endl;
            vcount++;
        }
        else {
            cout << "amazingly, both are equally fast" << endl;
            ecount++;
        }
        cout << "Average for primitive type is " << pAcc << ", average for Var is " << vAcc << endl;
    }

    // The primitive under test is double, so the summary says "primitive"
    // rather than "int".
    cout << "primitive was faster " << pcount << " times, var was faster " << vcount << " times, equal " << ecount << " times, " << pcount + vcount + ecount << " times ran total" << endl;
}
Relatively, with floats the Var class is 6-7% faster than floats, with ints about 3%.
I also ran the test with vector length of 10 000 000 instead of the original 1000 and results are still consistent and in favor of the class.
With QVector
replaced by std::vector
, at -O2
optimization level, code generated by GCC for the two types is exactly the same, instruction for instruction.
Without the replacement, the generated code is different, but that's hardly surprising, considering that QVector
is implemented differently for primitive and non-primitive types (look for QTypeInfo<T>::isComplex
in qvector.h
).
Update: It looks like isComplex
does not affect the inner loop, i.e. the measured part. The loop code still differs for the two types, albeit very slightly. It looks like the difference is due to GCC.