The snippet of code found below demonstrates a situation in which calling the CRC32 compiler intrinsic on 7 bytes of data in two different ways (e.g. case0() & case1()) results in different compiler optimizations. These differences in compiler optimizations produce vastly different execution times (e.g. [Test_Construction, Case: 0, Bytes: 7]).
For reference, I have included logic for calling CRC32 on 6 bytes of data in the same fashion. However, as you can see from the generated output, the resulting execution times do not suffer the same performance hit as experienced when working with 7 bytes of data.
Generated output of a single pass - 4 unique tests for each data size of interest (6 & 7 bytes):
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
I have two questions:
- Why is the compiler producing different optimizations (e.g. specifically in the case of [Test_Construction, Case: 0, Bytes: 7]?
- It looks like when [Test_Construction, Case: 0, Bytes: 7] is translated into machine code it contains additional instructions which move data from the stack into the registers and then back out on the stack. This does not seem to occur in any other scenario. CRC is then called once on data found within a register and once on data on the stack. Why would it do this?
- Why is the performance dropping in the first place?
- Is it due to the additional stack logic (memory operations) found in [Test_Construction, Case: 0, Bytes: 7] machine code?
- Could the order of the operations contribute?
- Is there a way to stop the optimizer from producing this suboptimal machine code?
UPDATE 1 - 4/7/17:
.
template<int N>
void MemCpy(char* szDst, const char* szSrc) {
memcpy(szDst, szSrc, N);
}
// I tried both of these alternatives to memcpy, no luck.
template<> void MemCpy<7>(char* szDst, const char* szSrc) {
//AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4), AS1(szDst+6) = AS1(szSrc+6);
AS4(szDst) = AS4(szSrc), AS4(szDst+3) = AS4(szSrc+3);
}
Environment Details:
Windows Server 2012 R2 x64
Intel Xeon X5670
Assembly Reference:
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
-------------------------------------------------------
00007FF62D7911CC call CBench::CBench (07FF62D791000h)
00007FF62D7911D1 xor r8d,r8d
00007FF62D7911D4 lea r10,[_a (07FF62D794630h)]
00007FF62D7911DB mov r9d,1312D00h
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1 mov rax,r8
00007FF62D7911E4 inc r8
00007FF62D7911E7 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC lea rcx,[rax+rax*2]
00007FF62D7911F0 movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7911F6 mov edx,dword ptr [r10+rcx*8]
00007FF62D7911FA mov word ptr [rsp+44h],ax
00007FF62D7911FF movzx eax,byte ptr [r10+rcx*8+6]
00007FF62D791205 mov byte ptr [rsp+46h],al
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209 mov eax,7
00007FF62D79120E crc32 eax,edx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213 mov dword ptr [buf],edx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217 crc32 eax,dword ptr [rsp+43h]
00007FF62D79121E add ebx,eax
00007FF62D791220 sub r9,1
00007FF62D791224 jne Test_Func<0,7,0>+71h (07FF62D7911E1h)
}
return ii;
00007FF62D791226 lea rcx,[Bench]
00007FF62D79122B call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
-------------------------------------------------------
00007FF62D7912A9 call CBench::CBench (07FF62D791000h)
00007FF62D7912AE xor r8d,r8d
00007FF62D7912B1 lea r10,[_a (07FF62D794630h)]
00007FF62D7912B8 mov r9d,1312D00h
00007FF62D7912BE xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0 mov rax,r8
00007FF62D7912C3 inc r8
00007FF62D7912C6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7912D5 movzx edx,byte ptr [r10+rcx*8+6]
00007FF62D7912DB shl rdx,10h
00007FF62D7912DF or rdx,rax
00007FF62D7912E2 mov eax,dword ptr [r10+rcx*8]
00007FF62D7912E6 shl rdx,20h
00007FF62D7912EA or rdx,rax
00007FF62D7912ED mov eax,7
00007FF62D7912F2 crc32 rax,rdx
00007FF62D7912F8 add ebx,eax
00007FF62D7912FA sub r9,1
00007FF62D7912FE jne Test_Func<1,7,0>+70h (07FF62D7912C0h)
}
return ii;
00007FF62D791300 lea rcx,[Bench]
00007FF62D791305 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
-------------------------------------------------------
00007FF62D791386 call CBench::CBench (07FF62D791000h)
00007FF62D79138B xor edx,edx
00007FF62D79138D lea r9,[_a (07FF62D794630h)]
00007FF62D791394 mov r8d,1312D00h
00007FF62D79139A nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3 mov ecx,7
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8 and eax,3FFh
00007FF62D7913AD inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4 crc32 ecx,dword ptr [r9+rax*8]
00007FF62D7913BB crc32 ecx,dword ptr [r9+rax*8+3]
00007FF62D7913C3 add ebx,ecx
00007FF62D7913C5 sub r8,1
00007FF62D7913C9 jne Test_Func<0,7,1>+70h (07FF62D7913A0h)
}
return ii;
00007FF62D7913CB lea rcx,[Bench]
00007FF62D7913D0 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
-------------------------------------------------------
00007FF62D791449 call CBench::CBench (07FF62D791000h)
00007FF62D79144E xor r8d,r8d
00007FF62D791451 lea r10,[_a (07FF62D794630h)]
00007FF62D791458 mov r9d,1312D00h
00007FF62D79145E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460 mov rax,r8
00007FF62D791463 inc r8
00007FF62D791466 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F movzx edx,byte ptr [r10+rax*8+6]
00007FF62D791475 lea rcx,[r10+rax*8]
00007FF62D791479 movzx eax,word ptr [r10+rax*8+4]
00007FF62D79147F shl rdx,10h
00007FF62D791483 or rdx,rax
00007FF62D791486 mov eax,dword ptr [rcx]
00007FF62D791488 shl rdx,20h
00007FF62D79148C or rdx,rax
00007FF62D79148F mov eax,7
00007FF62D791494 crc32 rax,rdx
00007FF62D79149A add ebx,eax
00007FF62D79149C sub r9,1
00007FF62D7914A0 jne Test_Func<1,7,1>+70h (07FF62D791460h)
}
return ii;
00007FF62D7914A2 lea rcx,[Bench]
00007FF62D7914A7 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
-------------------------------------------------------
00007FF62D791526 call CBench::CBench (07FF62D791000h)
00007FF62D79152B xor r8d,r8d
00007FF62D79152E lea r10,[_a (07FF62D794630h)]
00007FF62D791535 mov r9d,1312D00h
00007FF62D79153B nop dword ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540 mov rax,r8
00007FF62D791543 inc r8
00007FF62D791546 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F mov eax,6
00007FF62D791554 crc32 eax,dword ptr [r10+rcx*8]
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B movzx edx,word ptr [r10+rcx*8+4]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561 crc32 eax,dx
00007FF62D791567 add ebx,eax
00007FF62D791569 sub r9,1
00007FF62D79156D jne Test_Func<0,6,0>+70h (07FF62D791540h)
}
return ii;
00007FF62D79156F lea rcx,[Bench]
00007FF62D791574 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
-------------------------------------------------------
00007FF62D7915F9 call CBench::CBench (07FF62D791000h)
00007FF62D7915FE xor r8d,r8d
00007FF62D791601 lea r10,[_a (07FF62D794630h)]
00007FF62D791608 mov r9d,1312D00h
00007FF62D79160E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610 mov rax,r8
00007FF62D791613 inc r8
00007FF62D791616 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F mov eax,dword ptr [r10+rcx*8]
00007FF62D791623 movzx edx,word ptr [r10+rcx*8+4]
00007FF62D791629 shl rdx,20h
00007FF62D79162D or rdx,rax
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630 mov eax,6
00007FF62D791635 crc32 rax,rdx
00007FF62D79163B add ebx,eax
00007FF62D79163D sub r9,1
00007FF62D791641 jne Test_Func<1,6,0>+70h (07FF62D791610h)
}
return ii;
00007FF62D791643 lea rcx,[Bench]
00007FF62D791648 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
-------------------------------------------------------
00007FF62D7916C6 call CBench::CBench (07FF62D791000h)
00007FF62D7916CB xor edx,edx
00007FF62D7916CD lea r9,[_a (07FF62D794630h)]
00007FF62D7916D4 mov r8d,1312D00h
00007FF62D7916DA nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3 mov ecx,6
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8 and eax,3FFh
00007FF62D7916ED inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4 crc32 ecx,dword ptr [r9+rax*8]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB crc32 ecx,word ptr [r9+rax*8+4]
00007FF62D791704 add ebx,ecx
00007FF62D791706 sub r8,1
00007FF62D79170A jne Test_Func<0,6,1>+70h (07FF62D7916E0h)
}
return ii;
00007FF62D79170C lea rcx,[Bench]
00007FF62D791711 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
-------------------------------------------------------
00007FF62D791799 call CBench::CBench (07FF62D791000h)
00007FF62D79179E xor edx,edx
00007FF62D7917A0 lea r9,[_a (07FF62D794630h)]
00007FF62D7917A7 mov r8d,1312D00h
00007FF62D7917AD nop dword ptr [rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0 mov rax,rdx
00007FF62D7917B3 inc rdx
00007FF62D7917B6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF movzx ecx,word ptr [r9+rax*8+4]
00007FF62D7917C5 mov eax,dword ptr [r9+rax*8]
00007FF62D7917C9 shl rcx,20h
00007FF62D7917CD or rcx,rax
00007FF62D7917D0 mov eax,6
00007FF62D7917D5 crc32 rax,rcx
00007FF62D7917DB add ebx,eax
00007FF62D7917DD sub r8,1
00007FF62D7917E1 jne Test_Func<1,6,1>+70h (07FF62D7917B0h)
}
return ii;
00007FF62D7917E3 lea rcx,[Bench]
00007FF62D7917E8 call CBench::~CBench (07FF62D791030h)
Source Code:
#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>
#define DimensionOf(x) (sizeof(x)/sizeof(*(x)))
#define INL __forceinline
#define NOINL __declspec(noinline)
#define PASSES 20000000
#define AS1(a_) (*(U1*)(a_))
#define AS2(a_) (*(U2*)(a_))
#define AS3(a_) ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_) (*(U4*)(a_))
#define AS6(a_) ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
#define AS7(a_) ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))
typedef unsigned char U1;
typedef unsigned short U2;
typedef unsigned int U4;
typedef unsigned long long U8;
typedef char TData[24];
TData _a[0x400];
// CBench is for benchmarking code
class CBench {
__int64 m_nStart;
const char* m_desc;
public:
// No inline declared
// Reasoning: Simplifies the assembly code.
// Easier to see how the optimizer optimizes different variations of an algorithm.
NOINL CBench(const char *szDesc)
: m_desc(szDesc), m_nStart(GetBenchMark()) { }
NOINL ~CBench() {
__int64 cpuFreq, deltaTime(GetBenchMark() - m_nStart);
QueryPerformanceFrequency((LARGE_INTEGER*) &cpuFreq);
double execTimeInMS = ((double) deltaTime * 1000) / cpuFreq;
printf("%s:\t%10.4f ms\n", m_desc, execTimeInMS);
}
NOINL static __int64 GetBenchMark(void) {
__int64 nBenchMark;
QueryPerformanceCounter((LARGE_INTEGER*) &nBenchMark);
return nBenchMark;
}
};
// CFunc executes CRC32 intrinsics on 6 & 7 bytes in two different ways
template <int N>
struct CFunc {
char m_ach[N];
INL CFunc(const char* sz) {
memcpy(m_ach, sz, N);
}
INL U4 Case0() {
return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
: _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
}
INL U4 Case1() {
return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
: (U4) _mm_crc32_u64(N, AS6(m_ach));
}
};
// Evaluates performance dependent on:
// - CASE : CRC procedure
// - N : Number of bytes
// - USEREF : True, reference to pre-existing CFunc object
// False, constructing new CFunc object
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
char szDesc[64], buf[64];
(USEREF) ? sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Reference", CASE, N)
: sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Construction", CASE, N);
CBench Bench(szDesc);
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
ii += (CASE == 1) ? x.Case1() : x.Case0();
}
return ii;
}
int main(int argc, char* argv[]) {
for (int i = 0; i < 10; ++i) {
printf("\n>>>>\tPass %d:\n", i);
// Execute CRC on 7 bytes
// Construct CFunc Object
argc = Test_Func<0, 7, false>(argc);
argc = Test_Func<1, 7, false>(argc);
// Reference pre-existing CFunc Object
argc = Test_Func<0, 7, true>(argc);
argc = Test_Func<1, 7, true>(argc);
// Execute CRC on 6 bytes
// Construct CFunc Object
argc = Test_Func<0, 6, false>(argc);
argc = Test_Func<1, 6, false>(argc);
// Reference pre-existing CFunc Object
argc = Test_Func<0, 6, true>(argc);
argc = Test_Func<1, 6, true>(argc);
}
printf("\n\nDone\n");
return argc;
}
The operations the compiler uses to copy the data into the 7 byte buffer populate registers differently than the crc32 call(s) require(s). The compiler has to go to the stack to get the registers needed for the crc32 call. There is no combination of 1,2,4 byte reads and writes that doesn't require a full write to the stack. When I copied the 7 bytes to an 8 byte buffer, duplicating the middle byte with a second unaligned 4 byte mov, the compiler saw 2 registers already populated for the crc32 calls and eliminated the stack read/write.
125.997 ms: Use memcpy, which does aligned copying, and unaligned crc32:
memcpy(buf, _a[i], 7);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
movzx eax,word ptr [_a[i]+4]
mov edx,dword ptr [_a[i]]
mov word ptr [buf+4],ax
movzx eax,byte ptr [_a[i]+6]
mov byte ptr [buf+6],al
xor eax,eax
crc32 eax,edx
mov dword ptr [buf],edx
crc32 eax,dword ptr [buf+3]
The first call to crc32 can use the register edx from the copy, but the second call has no register ready. It needs the result of the DWORD, WORD, and BYTE movs into buf. On top of this I suspect the compiler sees a bunch of aliasing going on here and gets conservative. The compiler has no choice but to build buf on the stack and then access it.
137.044 ms: memcpy<7>, unaligned overlapped copy into 7 char buf, suffers from the same problem. The registers involved in the copy step are not the registers needed for the crc32 step. It has a bit more unaligned access, so it slows down a bit:
AS4(buf) = AS4(_a[i]), AS4(buf + 3) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
mov eax,dword ptr [_a[i]]
mov ecx,dword ptr [_a[i]+3]
mov dword ptr [buf],eax
xor eax,eax
mov dword ptr [buf+3],ecx
crc32 eax,dword ptr [buf]
crc32 eax,ecx
16.733 ms: unaligned overlapped access to source but not overlapped into an 8 byte dest buf, sees a massive improvement! In this case, we copy the middle byte twice, but we never alias the DWORDS in buf. If _a[i] = "1234567", then buf would be "12344567":
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 4));
xor eax,eax
crc32 eax,dword ptr [_a[i]]
crc32 eax,dword ptr [_a[i]+3]
The call to copy first DWORD into buf and the call to copy the second DWORD into buf + 4 use 2 separate registers, which can passed to crc32 directly, so no need to use buf. The optimizer on a subsequent pass notices the unused data moved to the stack and removes the related operations.
121.500 ms: I then tried the 64 bit crc on the 8 char buf built the same way as above and lost big. The compiler is not using a single 8 byte register to do the move to buf.
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u64(0, AS8(buf));
mov eax,dword ptr [_a[i]]
mov dword ptr [buf],eax
mov eax,dword ptr [_a[i]+3]
mov dword ptr [buf+3],eax
xor eax,eax
crc32 rax,qword ptr [buf]
20.799 ms: I changed the move to buf to be 8 bytes instead of 2 x 4 bytes. This stopped using the stack, but still underperformed the 3rd method above:
AS8(buf) = AS4(_a[i]) | ((U8)AS4(_a[i] + 3) << 32);
ii += _mm_crc32_u64(0, AS8(buf));
mov ecx,dword ptr [_a[i]+3]
mov eax,dword ptr [_a[i]]
shl rcx,20h
or rcx,rax
xor eax,eax
crc32 rax,rcx
1 took: 125.997 ms 2 took: 137.044 ms 3 took: 16.733 ms 4 took: 121.500 ms 5 took: 20.799 ms