c++ performance optimization 64-bit double-precision

Why is a C++ program compiled for the x64 platform slower than the same program compiled for x86?

I wrote a program and compiled it for both the x64 and x86 platforms in Visual Studio 2010, running on an Intel Core i5-2500. The x64 version takes about 19 seconds to execute, while the x86 version takes about 17 seconds. What could be the reason for this behavior?

#include "timer.h"

#include <vector>
#include <iostream>
#include <algorithm>
#include <string>
#include <sstream>

/********************DECLARATIONS************************************************/
/// Three-component vector of doubles with publicly accessible fields.
class Vector
{
public:
    double x;
    double y;
    double z;

    /// Creates the zero vector (all components 0).
    Vector() : x(0.0), y(0.0), z(0.0) {}

    /// Creates a vector from its three components.
    Vector(double xValue, double yValue, double zValue)
        : x(xValue), y(yValue), z(zValue)
    {}
};


/// Returns the dot product of a and b: a.x*b.x + a.y*b.y + a.z*b.z.
double Dot(const Vector& a, const Vector& b)
{
    const double xProduct = a.x * b.x;
    const double yProduct = a.y * b.y;
    const double zProduct = a.z * b.z;

    // Summed left to right, x then y then z.
    return xProduct + yProduct + zProduct;
}


/// Two-component vector of doubles with publicly accessible fields.
class Vector2
{
public:
    /// Scalar type of the components.
    typedef double value_type;

    double x;
    double y;

    /// Creates the zero vector (both components 0).
    Vector2() : x(0.0), y(0.0) {}

    /// Creates a vector from its two components.
    Vector2(double xValue, double yValue)
        : x(xValue), y(yValue)
    {}
};

/******************************TESTS***************************************************/

// Projects every Vector in m onto two fixed axes and stores the pair of dot
// products in the corresponding element of m2 (element k of m -> element k of m2).
//
// m  - input vectors (read only).
// m2 - output; written starting at m2.begin(), one element per element of m.
//      The function does not resize or check m2, so the caller must make sure
//      m2.size() >= m.size().
void Test(const std::vector<Vector>& m, std::vector<Vector2>& m2)
{
    // The literals carry the 'f' suffix, so they are float values that get
    // converted to double when passed to Vector's (double, double, double) constructor.
    Vector axisX(0.3f, 0.001f, 0.25f);
    Vector axisY(0.043f, 0.021f, 0.45f);

    // Output cursor; captured by reference and advanced by the lambda below.
    std::vector<Vector2>::iterator i2 = m2.begin();

    std::for_each(m.begin(), m.end(),
        [&](const Vector& v)
    {
        // Build the result in a local temporary first ...
        Vector2 r(0,0);
        r.x = Dot(axisX, v);
        r.y = Dot(axisY, v);

        // ... then copy-assign it into the current output slot and move on.
        (*i2) = r;
        ++i2;
    });
}


int main()
{
    cpptask::Timer timer;

    int len2 = 300;
    size_t len = 5000000;
    std::vector<Vector> m;
    m.reserve(len);
    for (size_t i = 0; i < len; ++i)
    {
        m.push_back(Vector(i * 0.2345, i * 2.67, i * 0.98));
    }

    /***********************************************************************************/
    {
        std::vector<Vector2> m2(m.size());
        double time = 0;
        for (int i = 0; i < len2; ++i)
        {
            timer.Start();
            Test(m, m2);
            time += timer.End();
        }
        std::cout << "Dot product double - " << time / len2 << std::endl;
    }
    /***********************************************************************************/


    return 0;
}

Solution

Short Answer: It's a compiler hiccup — the x64 optimizer fails to eliminate a redundant copy.

Long Answer:

The x86 version is very slow if SSE2 is disabled, but with SSE2 enabled in the x86 build I'm able to reproduce your results.

If you dive into the assembly of the innermost loop, you'll see that the x64 version has two extra memory copies at the end.

x86:

$LL71@main:
movsd   xmm2, QWORD PTR [eax-8]
movsd   xmm0, QWORD PTR [eax-16]
movsd   xmm3, QWORD PTR [eax]
movapd  xmm1, xmm0
mulsd   xmm0, QWORD PTR __real@3fa60418a0000000
movapd  xmm7, xmm2
mulsd   xmm2, QWORD PTR __real@3f95810620000000
mulsd   xmm7, xmm5
mulsd   xmm1, xmm4
addsd   xmm1, xmm7
movapd  xmm7, xmm3
mulsd   xmm3, QWORD PTR __real@3fdcccccc0000000
mulsd   xmm7, xmm6
add eax, 24                 ; 00000018H
addsd   xmm1, xmm7
addsd   xmm0, xmm2
movq    QWORD PTR [ecx], xmm1
addsd   xmm0, xmm3
movq    QWORD PTR [ecx+8], xmm0
lea edx, DWORD PTR [eax-16]
add ecx, 16                 ; 00000010H
cmp edx, esi
jne SHORT $LL71@main

x64:

$LL175@main:
movsdx  xmm3, QWORD PTR [rdx-8]
movsdx  xmm5, QWORD PTR [rdx-16]
movsdx  xmm4, QWORD PTR [rdx]
movapd  xmm2, xmm3
mulsd   xmm2, xmm6
movapd  xmm0, xmm5
mulsd   xmm0, xmm7
addsd   xmm2, xmm0
movapd  xmm1, xmm4
mulsd   xmm1, xmm8
addsd   xmm2, xmm1
movsdx  QWORD PTR r$109492[rsp], xmm2
mulsd   xmm5, xmm9
mulsd   xmm3, xmm10
addsd   xmm5, xmm3
mulsd   xmm4, xmm11
addsd   xmm5, xmm4
movsdx  QWORD PTR r$109492[rsp+8], xmm5
mov rcx, QWORD PTR r$109492[rsp]
mov QWORD PTR [rax], rcx
mov rcx, QWORD PTR r$109492[rsp+8]
mov QWORD PTR [rax+8], rcx
add rax, 16
add rdx, 24
lea rcx, QWORD PTR [rdx-16]
cmp rcx, rbx
jne SHORT $LL175@main

The x64 version has a lot more (unexplained) moves at the end of the loop. It looks like some sort of memory-to-memory data-copy.

EDIT:

It turns out that the x64 optimizer isn't able to optimize out the following copy:

(*i2) = r;

This is why the inner loop has two extra memory copies. If you change the loop to this:

// Store each dot product directly through the output iterator instead of
// filling a temporary Vector2 and copy-assigning it into *i2.
std::for_each(m.begin(), m.end(),
    [&](const Vector& v)
{
    i2->x = Dot(axisX, v);
    i2->y = Dot(axisY, v);
    ++i2;  // advance to the next output element
});

This eliminates the copies. Now the x64 version is just as fast as the x86 version:

x86: 0.0249423
x64: 0.0249348

Lesson Learned: Compilers aren't perfect.