Search code examples
c++performancememoryallocationcopying

In what cases should I use memcpy over standard operators in C++?


When can I get better performance using memcpy or how do I benefit from using it? For example:

float a[3]; float b[3];

Is this code:

memcpy(a, b, 3*sizeof(float));

faster than this one?

a[0] = b[0];
a[1] = b[1];
a[2] = b[2];

Solution

  • Efficiency should not be your concern.
    Write clean maintainable code.

    It bothers me that so many answers indicate that memcpy() is inefficient. It is designed to be the most efficient way of copying blocks of memory (for C programs).

    So I wrote the following as a test:

    #include <algorithm>
    #include <cstring>
    
    // The arrays and base() are only declared here; their definitions come
    // from another translation unit, so this file can be compiled on its own.
    extern float a[3];
    extern float b[3];
    extern void base();
    
    // Test harness: copies b into a using the variant selected at compile
    // time (-DM1, -DM2 or -DM3). With no macro defined, only the two base()
    // calls remain, which gives the baseline the variants are diffed against.
    int main()
    {
        base();
    
    #if defined(M1)
        // Element-by-element assignment.
        a[0] = b[0];
        a[1] = b[1];
        a[2] = b[2];
    #elif defined(M2)
        // One block copy of all three floats (declared in <cstring>).
        std::memcpy(a, b, 3*sizeof(float));
    #elif defined(M3)
        // std::copy takes (first, last, destination): the source range is b
        // and the destination is a, the same direction as M1 and M2.
        std::copy(&b[0], &b[3], &a[0]);
    #endif
    
        base();
    }
    

    Then, to compare the code that each variant produces:

    # Compile each variant to assembly (-S) at -O3. s0.s is built with no M*
    # macro defined, so it contains only the two base() calls: the baseline.
    g++ -O3 -S xr.cpp -o s0.s
    g++ -O3 -S xr.cpp -o s1.s -DM1
    g++ -O3 -S xr.cpp -o s2.s -DM2
    g++ -O3 -S xr.cpp -o s3.s -DM3
    
    # Collect the diff of every variant against the baseline in file D, with a
    # "=======" separator before each one. The first echo uses '>' to start D
    # afresh; everything after it appends with '>>'.
    echo "=======" >  D
    diff s0.s s1.s >> D
    echo "=======" >> D
    diff s0.s s2.s >> D
    echo "=======" >> D
    diff s0.s s3.s >> D
    

    This resulted in: (comments added by hand)

    =======   // Copy by hand
    10a11,18
    >   movq    _a@GOTPCREL(%rip), %rcx
    >   movq    _b@GOTPCREL(%rip), %rdx
    >   movl    (%rdx), %eax
    >   movl    %eax, (%rcx)
    >   movl    4(%rdx), %eax
    >   movl    %eax, 4(%rcx)
    >   movl    8(%rdx), %eax
    >   movl    %eax, 8(%rcx)
    
    =======    // memcpy()
    10a11,16
    >   movq    _a@GOTPCREL(%rip), %rcx
    >   movq    _b@GOTPCREL(%rip), %rdx
    >   movq    (%rdx), %rax
    >   movq    %rax, (%rcx)
    >   movl    8(%rdx), %eax
    >   movl    %eax, 8(%rcx)
    
    =======    // std::copy() -- in this listing the source (%rsi) is a and the destination (%rdi) is b
    10a11,14
    >   movq    _a@GOTPCREL(%rip), %rsi
    >   movl    $12, %edx
    >   movq    _b@GOTPCREL(%rip), %rdi
    >   call    _memmove
    

    Added: timing results for running the above inside a loop of 1,000,000,000 iterations.

       g++ -c -O3 -DM1 X.cpp
       g++ -O3 X.o base.o -o m1
       g++ -c -O3 -DM2 X.cpp
       g++ -O3 X.o base.o -o m2
       g++ -c -O3 -DM3 X.cpp
       g++ -O3 X.o base.o -o m3
       time ./m1
    
       real 0m2.486s
       user 0m2.478s
       sys  0m0.005s
       time ./m2
    
       real 0m1.859s
       user 0m1.853s
       sys  0m0.004s
       time ./m3
    
       real 0m1.858s
       user 0m1.851s
       sys  0m0.006s