Search code examples
c++ performance optimization 64-bit double-precision

Why is a C++ program compiled for the x64 platform slower than the same program compiled for x86?


I've written a program and compiled it for both the x64 and x86 platforms in Visual Studio 2010 on an Intel Core i5-2500. The x64 version takes about 19 seconds to execute, while the x86 version takes about 17 seconds. What could be the reason for this behavior?

#include "timer.h"

#include <vector>
#include <iostream>
#include <algorithm>
#include <string>
#include <sstream>

/********************DECLARATIONS************************************************/
// Plain three-component vector of doubles with publicly accessible members.
class Vector
{
public:
    // Creates the zero vector (0, 0, 0).
    Vector()
        : x(0), y(0), z(0)
    {
    }

    // Creates a vector from its three components.
    Vector(double xValue, double yValue, double zValue)
        : x(xValue), y(yValue), z(zValue)
    {
    }

    double x;  // first component
    double y;  // second component
    double z;  // third component
};


// Returns the dot product of a and b: the sum of the products of their
// x, y and z components, accumulated left to right in a single expression.
double Dot(const Vector& a, const Vector& b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}


// Plain two-component vector of doubles with publicly accessible members.
class Vector2
{
public:
    // Type of each component.
    typedef double value_type;

    // Creates the zero vector (0, 0).
    Vector2()
        : x(0), y(0)
    {
    }

    // Creates a vector from its two components.
    Vector2(double xValue, double yValue)
        : x(xValue), y(yValue)
    {
    }

    double x;  // first component
    double y;  // second component
};

/******************************TESTS***************************************************/

// Computes, for every element of m, its dot products with two fixed axes and
// writes the pair as a Vector2 into m2 at the same position.
//
// m  - input vectors, read in order from begin() to end().
// m2 - output; elements are overwritten starting at m2.begin(). No bounds
//      check is performed here, so m2 must already hold at least m.size()
//      elements.
void Test(const std::vector<Vector>& m, std::vector<Vector2>& m2)
{
    // The axes are given as float literals and widened to double by the
    // Vector constructor.
    Vector axisX(0.3f, 0.001f, 0.25f);
    Vector axisY(0.043f, 0.021f, 0.45f);

    // Output cursor; captured by reference and advanced by the lambda below.
    std::vector<Vector2>::iterator i2 = m2.begin();

    std::for_each(m.begin(), m.end(),
        [&](const Vector& v)
    {
        // Build the result in a local first...
        Vector2 r(0,0);
        r.x = Dot(axisX, v);
        r.y = Dot(axisY, v);

        // ...then copy it into the current output slot and move to the next.
        (*i2) = r;
        ++i2;
    });
}


int main()
{
    cpptask::Timer timer;

    int len2 = 300;
    size_t len = 5000000;
    std::vector<Vector> m;
    m.reserve(len);
    for (size_t i = 0; i < len; ++i)
    {
        m.push_back(Vector(i * 0.2345, i * 2.67, i * 0.98));
    }

    /***********************************************************************************/
    {
        std::vector<Vector2> m2(m.size());
        double time = 0;
        for (int i = 0; i < len2; ++i)
        {
            timer.Start();
            Test(m, m2);
            time += timer.End();
        }
        std::cout << "Dot product double - " << time / len2 << std::endl;
    }
    /***********************************************************************************/


    return 0;
}

Solution

  • Short Answer: It's a compiler hiccup. The x64 optimizer fails to eliminate a redundant copy.


    Long Answer:

    The x86 version is very slow if SSE2 is disabled, but with SSE2 enabled for x86 I'm able to reproduce your results.

    If you dive into the assembly of the innermost loop, you'll see that the x64 version has two extra memory copies at the end.

    x86:

    $LL71@main:
    movsd   xmm2, QWORD PTR [eax-8]
    movsd   xmm0, QWORD PTR [eax-16]
    movsd   xmm3, QWORD PTR [eax]
    movapd  xmm1, xmm0
    mulsd   xmm0, QWORD PTR __real@3fa60418a0000000
    movapd  xmm7, xmm2
    mulsd   xmm2, QWORD PTR __real@3f95810620000000
    mulsd   xmm7, xmm5
    mulsd   xmm1, xmm4
    addsd   xmm1, xmm7
    movapd  xmm7, xmm3
    mulsd   xmm3, QWORD PTR __real@3fdcccccc0000000
    mulsd   xmm7, xmm6
    add eax, 24                 ; 00000018H
    addsd   xmm1, xmm7
    addsd   xmm0, xmm2
    movq    QWORD PTR [ecx], xmm1
    addsd   xmm0, xmm3
    movq    QWORD PTR [ecx+8], xmm0
    lea edx, DWORD PTR [eax-16]
    add ecx, 16                 ; 00000010H
    cmp edx, esi
    jne SHORT $LL71@main
    

    x64:

    $LL175@main:
    movsdx  xmm3, QWORD PTR [rdx-8]
    movsdx  xmm5, QWORD PTR [rdx-16]
    movsdx  xmm4, QWORD PTR [rdx]
    movapd  xmm2, xmm3
    mulsd   xmm2, xmm6
    movapd  xmm0, xmm5
    mulsd   xmm0, xmm7
    addsd   xmm2, xmm0
    movapd  xmm1, xmm4
    mulsd   xmm1, xmm8
    addsd   xmm2, xmm1
    movsdx  QWORD PTR r$109492[rsp], xmm2
    mulsd   xmm5, xmm9
    mulsd   xmm3, xmm10
    addsd   xmm5, xmm3
    mulsd   xmm4, xmm11
    addsd   xmm5, xmm4
    movsdx  QWORD PTR r$109492[rsp+8], xmm5
    mov rcx, QWORD PTR r$109492[rsp]
    mov QWORD PTR [rax], rcx
    mov rcx, QWORD PTR r$109492[rsp+8]
    mov QWORD PTR [rax+8], rcx
    add rax, 16
    add rdx, 24
    lea rcx, QWORD PTR [rdx-16]
    cmp rcx, rbx
    jne SHORT $LL175@main
    

    The x64 version has a lot more (unexplained) moves at the end of the loop. It looks like some sort of memory-to-memory data-copy.

    EDIT:

    It turns out that the x64 optimizer isn't able to optimize out the following copy:

    (*i2) = r;
    

    This is why the inner loop has two extra memory copies. If you change the loop to this:

    std::for_each(m.begin(), m.end(),
        [&](const Vector& v)
    {
        i2->x = Dot(axisX, v);
        i2->y = Dot(axisY, v);
        ++i2;
    });
    

    This eliminates the copies. Now the x64 version is just as fast as the x86 version:

    x86: 0.0249423
    x64: 0.0249348
    

    Lesson Learned: Compilers aren't perfect.