Search code examples
c++performancestldeque

C++ large deque - program takes very long time to exit?


Consider the following C++ program:

#include <deque>
#include <iostream>

using namespace std;

int main()
{
    deque<double> d(30000000);
    cout << "Done\n";
}

The memory allocation in the first line only takes a second, but after it prints Done, it takes 33 seconds (!) to exit back to the terminal. Decreasing the number of elements to 20000000 reduces that time to 22 seconds, so clearly it's linear in the number of elements.

I am compiling on Windows 10, and the same thing happened with both GCC 10.2.0 and Visual Studio 2019.

What's going on here? Am I using deque in a way it's not supposed to be used?

EDIT:

#include <deque>
#include <iostream>

using namespace std;

void test_deque()
{
    deque<double> d(30000000);
    cout << "Function done\n";
}

int main()
{
    test_deque();
    cout << "Main done\n";
}

Now it prints Function done and then there is the 33 second delay. So I assume this has to do with the destructor that gets executed when the function exits. But why does it take so long to destruct 240 MB of memory?

EDIT 2: Tried it (the second version) with GCC on Ubuntu and it only takes a fraction of a second to run! Same with some online C++ compilers. Is this a problem specific to Windows?

EDIT 3: With vector it also takes a fraction of a second to run. However, with list (and forward_list) I get a similar extremely long delay.

EDIT 4: Compiling with MSVC in Release (rather than Debug) configuration also takes a fraction of a second. I'm not sure what the GCC equivalent it, but with -O3 (max optimizations) the execution time remains 33 seconds.


Solution

  • MSVC has a built-in profiler. We can run it (press Alt-F2) to see that the majority of CPU time is spent inside the constructor and destructor, which invoke deque::resize() and deque::_Tidy() functions, respectively.

    msvc profiler

    If we drill down further we see that deque::emplace_back() is resulting in quite a lot of code

    #define _PUSH_BACK_BEGIN                                                                                \
        if ((_Myoff() + _Mysize()) % _DEQUESIZ == 0 && _Mapsize() <= (_Mysize() + _DEQUESIZ) / _DEQUESIZ) { \
            _Growmap(1);                                                                                    \
        }                                                                                                   \
        _Myoff() &= _Mapsize() * _DEQUESIZ - 1;                                                             \
        size_type _Newoff = _Myoff() + _Mysize();                                                           \
        size_type _Block  = _Getblock(_Newoff);                                                             \
        if (_Map()[_Block] == nullptr) {                                                                    \
            _Map()[_Block] = _Getal().allocate(_DEQUESIZ);                                                  \
        }
    
    #define _PUSH_BACK_END ++_Mysize()
    
        template <class... _Valty>
        decltype(auto) emplace_back(_Valty&&... _Val) {
            _Orphan_all();
            _PUSH_BACK_BEGIN;
            _Alty_traits::construct(
                _Getal(), _Unfancy(_Map()[_Block] + _Newoff % _DEQUESIZ), _STD forward<_Valty>(_Val)...);
            _PUSH_BACK_END;
    
    #if _HAS_CXX17
            return back();
    #endif // _HAS_CXX17
        }
    

    Disassembly view:

        template <class... _Valty>
        decltype(auto) emplace_back(_Valty&&... _Val) {
    00007FF674A238E0  mov         qword ptr [rsp+8],rcx  
    00007FF674A238E5  push        rbp  
    00007FF674A238E6  push        rdi  
    00007FF674A238E7  sub         rsp,138h  
    00007FF674A238EE  lea         rbp,[rsp+20h]  
    00007FF674A238F3  mov         rdi,rsp  
    00007FF674A238F6  mov         ecx,4Eh  
    00007FF674A238FB  mov         eax,0CCCCCCCCh  
    00007FF674A23900  rep stos    dword ptr [rdi]  
    00007FF674A23902  mov         rcx,qword ptr [rsp+158h]  
    00007FF674A2390A  lea         rcx,[__0657B1E2_deque (07FF674A3E02Fh)]  
    00007FF674A23911  call        __CheckForDebuggerJustMyCode (07FF674A21159h)  
            _Orphan_all();
    00007FF674A23916  mov         rcx,qword ptr [this]  
    00007FF674A2391D  call        std::deque<double,std::allocator<double> >::_Orphan_all (07FF674A217FDh)  
            _PUSH_BACK_BEGIN;
    00007FF674A23922  mov         rcx,qword ptr [this]  
    00007FF674A23929  call        std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)  
    00007FF674A2392E  mov         qword ptr [rbp+0F8h],rax  
    00007FF674A23935  mov         rcx,qword ptr [this]  
    00007FF674A2393C  call        std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)  
    00007FF674A23941  mov         rcx,qword ptr [rbp+0F8h]  
    00007FF674A23948  mov         rcx,qword ptr [rcx]  
    00007FF674A2394B  add         rcx,qword ptr [rax]  
    00007FF674A2394E  mov         rax,rcx  
    00007FF674A23951  xor         edx,edx  
    00007FF674A23953  mov         ecx,2  
    00007FF674A23958  div         rax,rcx  
    00007FF674A2395B  mov         rax,rdx  
    00007FF674A2395E  test        rax,rax  
    00007FF674A23961  jne         std::deque<double,std::allocator<double> >::emplace_back<>+0D0h (07FF674A239B0h)  
    00007FF674A23963  mov         rcx,qword ptr [this]  
    00007FF674A2396A  call        std::deque<double,std::allocator<double> >::_Mapsize (07FF674A214BFh)  
    00007FF674A2396F  mov         qword ptr [rbp+0F8h],rax  
    00007FF674A23976  mov         rcx,qword ptr [this]  
    00007FF674A2397D  call        std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)  
    00007FF674A23982  mov         rax,qword ptr [rax]  
    00007FF674A23985  add         rax,2  
    00007FF674A23989  xor         edx,edx  
    00007FF674A2398B  mov         ecx,2  
    00007FF674A23990  div         rax,rcx  
    00007FF674A23993  mov         rcx,qword ptr [rbp+0F8h]  
    00007FF674A2399A  cmp         qword ptr [rcx],rax  
    00007FF674A2399D  ja          std::deque<double,std::allocator<double> >::emplace_back<>+0D0h (07FF674A239B0h)  
    00007FF674A2399F  mov         edx,1  
    00007FF674A239A4  mov         rcx,qword ptr [this]  
    00007FF674A239AB  call        std::deque<double,std::allocator<double> >::_Growmap (07FF674A21640h)  
    00007FF674A239B0  mov         rcx,qword ptr [this]  
    00007FF674A239B7  call        std::deque<double,std::allocator<double> >::_Mapsize (07FF674A214BFh)  
    00007FF674A239BC  mov         rax,qword ptr [rax]  
    00007FF674A239BF  lea         rax,[rax+rax-1]  
    00007FF674A239C4  mov         qword ptr [rbp+0F8h],rax  
    00007FF674A239CB  mov         rcx,qword ptr [this]  
    00007FF674A239D2  call        std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)  
    00007FF674A239D7  mov         qword ptr [rbp+100h],rax  
    00007FF674A239DE  mov         rax,qword ptr [rbp+100h]  
    00007FF674A239E5  mov         rax,qword ptr [rax]  
    00007FF674A239E8  mov         qword ptr [rbp+108h],rax  
    00007FF674A239EF  mov         rax,qword ptr [rbp+0F8h]  
    00007FF674A239F6  mov         rcx,qword ptr [rbp+108h]  
    00007FF674A239FD  and         rcx,rax  
    00007FF674A23A00  mov         rax,rcx  
    00007FF674A23A03  mov         rcx,qword ptr [rbp+100h]  
    00007FF674A23A0A  mov         qword ptr [rcx],rax  
    00007FF674A23A0D  mov         rcx,qword ptr [this]  
    00007FF674A23A14  call        std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)  
    00007FF674A23A19  mov         qword ptr [rbp+0F8h],rax  
    00007FF674A23A20  mov         rcx,qword ptr [this]  
    00007FF674A23A27  call        std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)  
    00007FF674A23A2C  mov         rcx,qword ptr [rbp+0F8h]  
    00007FF674A23A33  mov         rcx,qword ptr [rcx]  
    00007FF674A23A36  add         rcx,qword ptr [rax]  
    00007FF674A23A39  mov         rax,rcx  
    00007FF674A23A3C  mov         qword ptr [_Newoff],rax  
    00007FF674A23A40  mov         rdx,qword ptr [_Newoff]  
    00007FF674A23A44  mov         rcx,qword ptr [this]  
    00007FF674A23A4B  call        std::deque<double,std::allocator<double> >::_Getblock (07FF674A21334h)  
    00007FF674A23A50  mov         qword ptr [_Block],rax  
    00007FF674A23A54  mov         rcx,qword ptr [this]  
    00007FF674A23A5B  call        std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)  
    00007FF674A23A60  mov         rax,qword ptr [rax]  
    00007FF674A23A63  mov         rcx,qword ptr [_Block]  
    00007FF674A23A67  cmp         qword ptr [rax+rcx*8],0  
    00007FF674A23A6C  jne         std::deque<double,std::allocator<double> >::emplace_back<>+1D7h (07FF674A23AB7h)  
    00007FF674A23A6E  mov         rcx,qword ptr [this]  
    00007FF674A23A75  call        std::deque<double,std::allocator<double> >::_Getal (07FF674A216CCh)  
    00007FF674A23A7A  mov         qword ptr [rbp+0F8h],rax  
    00007FF674A23A81  mov         edx,2  
    00007FF674A23A86  mov         rcx,qword ptr [rbp+0F8h]  
    00007FF674A23A8D  call        std::allocator<double>::allocate (07FF674A216C7h)  
    00007FF674A23A92  mov         qword ptr [rbp+100h],rax  
    00007FF674A23A99  mov         rcx,qword ptr [this]  
    00007FF674A23AA0  call        std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)  
    00007FF674A23AA5  mov         rax,qword ptr [rax]  
    00007FF674A23AA8  mov         rcx,qword ptr [_Block]  
    00007FF674A23AAC  mov         rdx,qword ptr [rbp+100h]  
    00007FF674A23AB3  mov         qword ptr [rax+rcx*8],rdx  
            _Alty_traits::construct(
    00007FF674A23AB7  mov         rcx,qword ptr [this]  
    00007FF674A23ABE  call        std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)  
    00007FF674A23AC3  mov         rax,qword ptr [rax]  
    00007FF674A23AC6  mov         qword ptr [rbp+0F8h],rax  
    00007FF674A23ACD  xor         edx,edx  
    00007FF674A23ACF  mov         rax,qword ptr [_Newoff]  
    00007FF674A23AD3  mov         ecx,2  
    00007FF674A23AD8  div         rax,rcx  
    00007FF674A23ADB  mov         rax,rdx  
    00007FF674A23ADE  mov         rcx,qword ptr [_Block]  
    00007FF674A23AE2  mov         rdx,qword ptr [rbp+0F8h]  
    00007FF674A23AE9  mov         rcx,qword ptr [rdx+rcx*8]  
    00007FF674A23AED  lea         rax,[rcx+rax*8]  
    00007FF674A23AF1  mov         rcx,rax  
    00007FF674A23AF4  call        std::_Unfancy<double> (07FF674A214A6h)  
    00007FF674A23AF9  mov         qword ptr [rbp+100h],rax  
    00007FF674A23B00  mov         rcx,qword ptr [this]  
    00007FF674A23B07  call        std::deque<double,std::allocator<double> >::_Getal (07FF674A216CCh)  
    00007FF674A23B0C  mov         qword ptr [rbp+108h],rax  
    00007FF674A23B13  mov         rdx,qword ptr [rbp+100h]  
    00007FF674A23B1A  mov         rcx,qword ptr [rbp+108h]  
    00007FF674A23B21  call        std::_Default_allocator_traits<std::allocator<double> >::construct<double> (07FF674A211E5h)  
                _Getal(), _Unfancy(_Map()[_Block] + _Newoff % _DEQUESIZ), _STD forward<_Valty>(_Val)...);
            _PUSH_BACK_END;
    00007FF674A23B26  mov         rcx,qword ptr [this]  
    00007FF674A23B2D  call        std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)  
    00007FF674A23B32  mov         qword ptr [rbp+0F8h],rax  
    00007FF674A23B39  mov         rax,qword ptr [rbp+0F8h]  
    00007FF674A23B40  mov         rax,qword ptr [rax]  
    00007FF674A23B43  inc         rax  
    00007FF674A23B46  mov         rcx,qword ptr [rbp+0F8h]  
    00007FF674A23B4D  mov         qword ptr [rcx],rax  
    
    #if _HAS_CXX17
            return back();
    00007FF674A23B50  mov         rcx,qword ptr [this]  
    00007FF674A23B57  call        std::deque<double,std::allocator<double> >::back (07FF674A2127Bh)  
    #endif // _HAS_CXX17
        }
    00007FF674A23B5C  lea         rsp,[rbp+118h]  
    00007FF674A23B63  pop         rdi  
    00007FF674A23B64  pop         rbp  
    00007FF674A23B65  ret
    

    Apparently std::deque doesn't pre-allocate elements, and instead uses a loop to add them one-by-one. So no wonder it is slow.

    You can speed up the Debug build by enabling some optimizations (e.g. /Ob1) and reducing runtime checks (e.g. remove /RTC1).

    But really, std::deque is just a horrible structure from a performance point of view (a vector of tiny vectors - not cache-friendly at all).