why std::vector::push_back much slower than a manual implementation?

I wanted to write a dynamic array for trivial type (then I can use memcpy or sth to optimize), but when I compare its efficiency to std::vector, I found its push_back function is twice as efficient as astd::vector.

That's so strange, I read source code of MSVC STL to find why, but in vain.

my code:


template<typename T>
class Array {
    static_assert((std::is_trivial_v<T>), "Array requires type to be trivial.");
    T* m_p;
    size_t m_cap, m_siz;
private:
    void increase(size_t new_cap) {
        ASSERT(new_cap > m_cap);
        if (new_cap < m_cap + m_cap / 2)new_cap = m_cap + m_cap / 2;
        T* new_p = (T*)realloc(m_p, sizeof(T) * new_cap);
        ASSERT(new_p);
        m_p = new_p;
        m_cap = new_cap;
    }
public:
    Array() : m_siz(0), m_cap(0), m_p(nullptr) {}
    ~Array() { if (m_p)free(m_p); }
    Array(const Array& x) {
        m_cap = m_siz = x.m_siz;
        m_p = (T*)malloc(sizeof(T) * m_siz);
        memcpy(m_p, x.m_p, sizeof(T) * m_siz);
    }
    Array(Array&& x) {
        m_cap = x.m_cap;
        m_siz = x.m_siz;
        m_p = x.m_p;
        x.m_p = nullptr;
        x.m_cap = x.m_siz = 0;
    }
    Array& operator=(const Array& x) {
        m_cap = m_siz = x.m_siz;
        m_p = (T*)malloc(sizeof(T) * m_siz);
        memcpy(m_p, x.m_p, sizeof(T) * m_siz);
    }
    Array& operator=(Array&& x) {
        m_cap = x.m_cap;
        m_siz = x.m_siz;
        m_p = x.m_p;
        x.m_p = nullptr;
        x.m_cap = x.m_siz = 0;
    }
    void push_back(const T& x) {
        if (m_siz == m_cap)increase(m_cap + 1);
        m_p[m_siz++] = x;
    }
    void reserve(size_t cap) {
        if (cap > m_cap)
            increase(cap);
    }
    void resize(size_t siz, const T& val = T{}) {
        if (siz > m_siz) {
            if (siz > m_cap)increase(siz);
            for (size_t i = m_siz; i < siz; i++)
                m_p[i] = val;
        }
        m_siz = siz;
    }
    void shrink_to_fit() {
        m_p = realloc(m_p, m_siz * sizeof(T));
        m_cap = m_siz;
    }
    T& operator[](size_t pos) { ASSERT(pos < m_siz); return m_p[pos]; }
    const T& operator[](size_t pos) const { ASSERT(pos < m_siz); return m_p[pos]; }
    size_t size()const { return m_siz; }
    size_t capacity()const { return m_cap; }
    void clear() { m_siz = 0; }
};

#define N 10000
#define M 100000
template<typename T>
int test() {
    int t = clock();
    T v;
    v.reserve(M);
    for (int t = 0; t < N; t++) {
        for (int i = 0; i < M; i++)
            v.push_back(i);
        //v.resize(M);

        //for (int i = 0; i < M; i++)
            //v[i] = -i;
        v.clear();
    }
    return clock() - t;
}
int main() {
    int t1 = test<std::vector<int>>();
    int t2 = test<Array<int>>();

    printf("vector:%dms.\nArray:%dms.\n", t1, t2);
}

result(Release in VS2019):

vector:1043ms.
Array:547ms.

This is the invoke chain of std::vector::pushback(MSVC):

    void push_back(const _Ty& _Val) { // insert by moving into element at end, provide strong guarantee
        emplace_back(_STD move(_Val));
    }

    template <class... _Valty>
    decltype(auto) emplace_back(_Valty&&... _Val) {
        // insert by perfectly forwarding into element at end, provide strong guarantee
        auto& _My_data   = _Mypair._Myval2;
        pointer& _Mylast = _My_data._Mylast;
        if (_Mylast != _My_data._Myend) {
            return _Emplace_back_with_unused_capacity(_STD forward<_Valty>(_Val)...);
        }

        _Ty& _Result = *_Emplace_reallocate(_Mylast, _STD forward<_Valty>(_Val)...);
#if _HAS_CXX17
        return _Result;
#else // ^^^ _HAS_CXX17 ^^^ // vvv !_HAS_CXX17 vvv
        (void) _Result;
#endif // _HAS_CXX17
    }


    template <class... _Valty>
    decltype(auto) _Emplace_back_with_unused_capacity(_Valty&&... _Val) {
        // insert by perfectly forwarding into element at end, provide strong guarantee
        auto& _My_data   = _Mypair._Myval2;
        pointer& _Mylast = _My_data._Mylast;
        _STL_INTERNAL_CHECK(_Mylast != _My_data._Myend); // check that we have unused capacity
        _Alty_traits::construct(_Getal(), _Unfancy(_Mylast), _STD forward<_Valty>(_Val)...);
        _Orphan_range(_Mylast, _Mylast);
        _Ty& _Result = *_Mylast;
        ++_Mylast;
#if _HAS_CXX17
        return _Result;
#else // ^^^ _HAS_CXX17 ^^^ // vvv !_HAS_CXX17 vvv
        (void) _Result;
#endif // _HAS_CXX17
    }

After inline and other optimizations, the STL code doesn't seem to make much difference from mine.

Could anyone tell me why is my push_back is more efficient(or why STL is slow)?

EDIT: Sorry, my question seemd to caused a misunderstanding.

I know that std::vector is general, so it might do more work than mine.

But in a push_back of type int , I don't think std::vector did something special, so why it is slow?

Solution

The difference is mainly due to less favorable codegen in the std::vector version. We can see it if we compare the generated assembly (godbolt link).

Your loop (skipping the reallocation part):

$LL4@test:
        xor     esi, esi
        xor     edi, edi
$LL7@test:
        mov     r15, rbx
        cmp     rdi, rbx
        jne     SHORT $LN27@test
        <...skip...>
$LN27@test:
        mov     DWORD PTR [r14+rdi*4], esi
        inc     rdi
        inc     esi
        cmp     esi, 100000                         ; 000186a0H
        jl      $LL7@test
        sub     r12, r13
        jne     $LL4@test

std::vector::push_back loop (again, skipping the reallocation part):

$LL4@test:
        xor     ebx, ebx
        mov     DWORD PTR i$1[rsp], ebx
$LL7@test:
        cmp     rcx, QWORD PTR v$[rsp+16]
        je      SHORT $LN26@test
        mov     DWORD PTR [rcx], ebx
        mov     rcx, QWORD PTR v$[rsp+8]
        add     rcx, 4
        mov     QWORD PTR v$[rsp+8], rcx
        jmp     SHORT $LN5@test
$LN26@test:
        <...skip...>
$LN5@test:
        inc     ebx
        mov     DWORD PTR i$1[rsp], ebx
        cmp     ebx, 100000                         ; 000186a0H
        jl      SHORT $LL7@test
        mov     rcx, QWORD PTR v$[rsp]
        mov     QWORD PTR v$[rsp+8], rcx
        sub     rdi, 1
        jne     SHORT $LL4@test

Clearly we can see more code (11 vs. 8 instructions in the hot path) and more indirection to memory (5 vs. 1). So it's no surprise that it's slower.

More generally, more complex code == slower code.

Could the two versions be optimized identically? I see no reason why not. MSVC 19.28 just can't do it.