I wanted to write a dynamic array for trivial type (then I can use memcpy or sth to optimize), but when I compare its efficiency to std::vector
, I found its push_back
function is twice as efficient as astd::vector
.
That's so strange, I read source code of MSVC STL to find why, but in vain.
my code:
template<typename T>
class Array {
static_assert((std::is_trivial_v<T>), "Array requires type to be trivial.");
T* m_p;
size_t m_cap, m_siz;
private:
void increase(size_t new_cap) {
ASSERT(new_cap > m_cap);
if (new_cap < m_cap + m_cap / 2)new_cap = m_cap + m_cap / 2;
T* new_p = (T*)realloc(m_p, sizeof(T) * new_cap);
ASSERT(new_p);
m_p = new_p;
m_cap = new_cap;
}
public:
Array() : m_siz(0), m_cap(0), m_p(nullptr) {}
~Array() { if (m_p)free(m_p); }
Array(const Array& x) {
m_cap = m_siz = x.m_siz;
m_p = (T*)malloc(sizeof(T) * m_siz);
memcpy(m_p, x.m_p, sizeof(T) * m_siz);
}
Array(Array&& x) {
m_cap = x.m_cap;
m_siz = x.m_siz;
m_p = x.m_p;
x.m_p = nullptr;
x.m_cap = x.m_siz = 0;
}
Array& operator=(const Array& x) {
m_cap = m_siz = x.m_siz;
m_p = (T*)malloc(sizeof(T) * m_siz);
memcpy(m_p, x.m_p, sizeof(T) * m_siz);
}
Array& operator=(Array&& x) {
m_cap = x.m_cap;
m_siz = x.m_siz;
m_p = x.m_p;
x.m_p = nullptr;
x.m_cap = x.m_siz = 0;
}
void push_back(const T& x) {
if (m_siz == m_cap)increase(m_cap + 1);
m_p[m_siz++] = x;
}
void reserve(size_t cap) {
if (cap > m_cap)
increase(cap);
}
void resize(size_t siz, const T& val = T{}) {
if (siz > m_siz) {
if (siz > m_cap)increase(siz);
for (size_t i = m_siz; i < siz; i++)
m_p[i] = val;
}
m_siz = siz;
}
void shrink_to_fit() {
m_p = realloc(m_p, m_siz * sizeof(T));
m_cap = m_siz;
}
T& operator[](size_t pos) { ASSERT(pos < m_siz); return m_p[pos]; }
const T& operator[](size_t pos) const { ASSERT(pos < m_siz); return m_p[pos]; }
size_t size()const { return m_siz; }
size_t capacity()const { return m_cap; }
void clear() { m_siz = 0; }
};
#define N 10000
#define M 100000
template<typename T>
int test() {
int t = clock();
T v;
v.reserve(M);
for (int t = 0; t < N; t++) {
for (int i = 0; i < M; i++)
v.push_back(i);
//v.resize(M);
//for (int i = 0; i < M; i++)
//v[i] = -i;
v.clear();
}
return clock() - t;
}
int main() {
int t1 = test<std::vector<int>>();
int t2 = test<Array<int>>();
printf("vector:%dms.\nArray:%dms.\n", t1, t2);
}
result(Release in VS2019):
vector:1043ms.
Array:547ms.
This is the invoke chain of std::vector::pushback
(MSVC):
void push_back(const _Ty& _Val) { // insert by moving into element at end, provide strong guarantee
emplace_back(_STD move(_Val));
}
template <class... _Valty>
decltype(auto) emplace_back(_Valty&&... _Val) {
// insert by perfectly forwarding into element at end, provide strong guarantee
auto& _My_data = _Mypair._Myval2;
pointer& _Mylast = _My_data._Mylast;
if (_Mylast != _My_data._Myend) {
return _Emplace_back_with_unused_capacity(_STD forward<_Valty>(_Val)...);
}
_Ty& _Result = *_Emplace_reallocate(_Mylast, _STD forward<_Valty>(_Val)...);
#if _HAS_CXX17
return _Result;
#else // ^^^ _HAS_CXX17 ^^^ // vvv !_HAS_CXX17 vvv
(void) _Result;
#endif // _HAS_CXX17
}
template <class... _Valty>
decltype(auto) _Emplace_back_with_unused_capacity(_Valty&&... _Val) {
// insert by perfectly forwarding into element at end, provide strong guarantee
auto& _My_data = _Mypair._Myval2;
pointer& _Mylast = _My_data._Mylast;
_STL_INTERNAL_CHECK(_Mylast != _My_data._Myend); // check that we have unused capacity
_Alty_traits::construct(_Getal(), _Unfancy(_Mylast), _STD forward<_Valty>(_Val)...);
_Orphan_range(_Mylast, _Mylast);
_Ty& _Result = *_Mylast;
++_Mylast;
#if _HAS_CXX17
return _Result;
#else // ^^^ _HAS_CXX17 ^^^ // vvv !_HAS_CXX17 vvv
(void) _Result;
#endif // _HAS_CXX17
}
After inline and other optimizations, the STL code doesn't seem to make much difference from mine.
Could anyone tell me why is my push_back is more efficient(or why STL is slow)?
EDIT: Sorry, my question seemd to caused a misunderstanding.
I know that std::vector
is general, so it might do more work than mine.
But in a push_back
of type int
, I don't think std::vector
did something special, so why it is slow?
The difference is mainly due to less favorable codegen in the std::vector
version. We can see it if we compare the generated assembly (godbolt link).
Your loop (skipping the reallocation part):
$LL4@test:
xor esi, esi
xor edi, edi
$LL7@test:
mov r15, rbx
cmp rdi, rbx
jne SHORT $LN27@test
<...skip...>
$LN27@test:
mov DWORD PTR [r14+rdi*4], esi
inc rdi
inc esi
cmp esi, 100000 ; 000186a0H
jl $LL7@test
sub r12, r13
jne $LL4@test
std::vector::push_back
loop (again, skipping the reallocation part):
$LL4@test:
xor ebx, ebx
mov DWORD PTR i$1[rsp], ebx
$LL7@test:
cmp rcx, QWORD PTR v$[rsp+16]
je SHORT $LN26@test
mov DWORD PTR [rcx], ebx
mov rcx, QWORD PTR v$[rsp+8]
add rcx, 4
mov QWORD PTR v$[rsp+8], rcx
jmp SHORT $LN5@test
$LN26@test:
<...skip...>
$LN5@test:
inc ebx
mov DWORD PTR i$1[rsp], ebx
cmp ebx, 100000 ; 000186a0H
jl SHORT $LL7@test
mov rcx, QWORD PTR v$[rsp]
mov QWORD PTR v$[rsp+8], rcx
sub rdi, 1
jne SHORT $LL4@test
Clearly we can see more code (11 vs. 8 instructions in the hot path) and more indirection to memory (5 vs. 1). So it's no surprise that it's slower.
More generally, more complex code == slower code.
Could the two versions be optimized identically? I see no reason why not. MSVC 19.28 just can't do it.