Which classes of algorithms could be using punpcklbw
?
In particular, what is punpcklbw xmm0, xmm0
doing ?
And yet, what is maskedPow2_Value
useful for ?
maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val; // Val is int
maskedPow2_Value = 0x101010101010101i64 * maskedValue;
(or mov r9, 101010101010101h; imul rdx, r9;
twice)
A complete example (the function is named CompressPacket, but that name may be misleading), as a result of decompilation by IDA:
/*
 * IDA Hex-Rays pseudo-C for a byte-fill routine: it stores the low byte of
 * Val into Size bytes starting at Dst and returns Dst.
 *
 * The accompanying disassembly shows a cross-reference from j_memset, so the
 * name CompressPacket is presumably a mislabel of the CRT memset - confirm.
 *
 * Three strategies are visible:
 *   - Size 0..15: a jump table of overlapping stores addressed backwards
 *     from the end of the buffer (bufferOut = Dst + Size).
 *   - Size >= 16, flag bit 1 set: a single byte-wise fill (rep stosb).
 *   - Size >= 16, flag bit 2 set: 16-byte XMM stores; otherwise 8-byte
 *     integer stores.
 *
 * NOTE(review): on the default (Size >= 16) path the disassembly never adds
 * Size to rcx, so "bufferOut" there still holds Dst; the Dst + Size value
 * only applies to the switch cases. Likewise each path multiplies the
 * zero-extended byte by 0x0101010101010101 exactly once in the listing,
 * although the decompiler shows two chained multiplies.
 */
void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
{
__int64 maskedPow2_Value; // rdx
unsigned int v5; // ecx
__int64 *bufferOut; // rcx
size_t size_; // r9
size_t i; // r9
size_t size__; // r9
size_t counter; // r8
size_t j; // r9
void *result; // rax
__m128i v13; // xmm0
__int64 lsb4; // rax
size_t counter1; // r9
size_t k; // r9
size_t lsb4_; // r8
__int64 maskedValue; // rdx
// Keep only the low byte of Val, zero-extended to 64 bits (movzx edx, dl).
*(_QWORD *)&Val = (unsigned __int8)Val;
// Multiplying a byte by 0x0101010101010101 copies it into all 8 bytes.
maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val;
// One past the last byte to fill; the small-size cases store at negative
// offsets from here.
bufferOut = (__int64 *)((char *)Dst + Size);
result = Dst;
// Sizes 0..15: each case writes exactly Size bytes using the widest stores
// available, then jumps/falls into the label shared with the smaller size
// that has the same tail (label LBL_x_y serves sizes x and y).
switch ( Size )
{
case 0ui64:
return result;
case 1ui64:
goto LBL_1_F;
case 2ui64:
goto LBL_2_E;
case 3ui64:
goto LBL_3_F;
case 4ui64:
goto LBL_4_C;
case 5ui64:
goto LBL_5_D;
case 6ui64:
goto LBL_6_E;
case 7ui64:
goto LBL_7_F;
case 8ui64:
*(bufferOut - 1) = maskedValue;
return result;
case 9ui64:
*(__int64 *)((char *)bufferOut - 9) = maskedValue;
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
case 0xAui64:
*(__int64 *)((char *)bufferOut - 10) = maskedValue;
*((_WORD *)bufferOut - 1) = maskedValue;
return result;
case 0xBui64:
*(__int64 *)((char *)bufferOut - 11) = maskedValue;
goto LBL_3_F;
case 0xCui64:
*(__int64 *)((char *)bufferOut - 12) = maskedValue;
LBL_4_C:
*((_DWORD *)bufferOut - 1) = maskedValue;
return result;
case 0xDui64:
*(__int64 *)((char *)bufferOut - 13) = maskedValue;
LBL_5_D:
*(_DWORD *)((char *)bufferOut - 5) = maskedValue;
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
case 0xEui64:
*(__int64 *)((char *)bufferOut - 14) = maskedValue;
LBL_6_E:
*(_DWORD *)((char *)bufferOut - 6) = maskedValue;
LBL_2_E:
*((_WORD *)bufferOut - 1) = maskedValue;
return result;
case 0xFui64:
*(__int64 *)((char *)bufferOut - 15) = maskedValue;
LBL_7_F:
*(_DWORD *)((char *)bufferOut - 7) = maskedValue;
LBL_3_F:
*(_WORD *)((char *)bufferOut - 3) = maskedValue;
LBL_1_F:
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
default:
// Size >= 16. Bit 1 of the global flag word selects the "rep stosb" path,
// which the decompiler renders as a memset call.
// NOTE(review): the flag word is presumably a CPU-feature mask - confirm.
if ( _bittest(dword_7FFFF4B237D8, 1u) )
{
memset(bufferOut, maskedValue, Size);
return Dst;
}
// 8 copies of the fill byte (see the header note about the double multiply).
maskedPow2_Value = 0x101010101010101i64 * maskedValue;
// Bit 2 clear: fill with 8-byte integer stores.
if ( !_bittest(dword_7FFFF4B237D8, 2u) )
{
if ( Size >= 0x40 )
{
// Number of bytes (0..7) needed to reach the next 8-byte boundary.
v5 = -(int)bufferOut & 7;
if ( v5 )
{
// A single unaligned 8-byte store covers the misaligned head; the aligned
// stores that follow overlap its tail.
Size -= v5;
*(_QWORD *)Dst = maskedPow2_Value;
}
bufferOut = (__int64 *)((char *)Dst + v5);
size_ = Size;
Size &= 0x3Fu;
// Unrolled loop: 8 qword stores = 64 bytes per iteration. The eighth store
// appears in the for-increment because the pointer is bumped mid-body.
for ( i = size_ >> 6; i; *(bufferOut - 1) = maskedPow2_Value )
{
*bufferOut = maskedPow2_Value;
bufferOut[1] = maskedPow2_Value;
bufferOut[2] = maskedPow2_Value;
bufferOut += 8;
*(bufferOut - 5) = maskedPow2_Value;
*(bufferOut - 4) = maskedPow2_Value;
--i;
*(bufferOut - 3) = maskedPow2_Value;
*(bufferOut - 2) = maskedPow2_Value;
}
}
// Remaining whole qwords, then the last 0..7 bytes one at a time.
size__ = Size;
counter = Size & 7;
for ( j = size__ >> 3; j; --j )
*bufferOut++ = maskedPow2_Value;
for ( ; counter; --counter )
{
*(_BYTE *)bufferOut = maskedPow2_Value;
bufferOut = (__int64 *)((char *)bufferOut + 1);
}
return Dst;
}
// XMM path. punpcklbw xmm0, xmm0 interleaves the low 8 bytes of the register
// with themselves; since those 8 bytes are all equal, the result is 16
// copies of the fill byte.
v13 = _mm_unpacklo_epi8((__m128i)(unsigned __int64)maskedPow2_Value, (__m128i)(unsigned __int64)maskedPow2_Value);
if ( ((unsigned __int8)bufferOut & 0xF) != 0 )
{
// Misaligned start: one unaligned 16-byte store, then advance to the next
// 16-byte boundary and shrink Size by the bytes skipped.
*(__m128i *)bufferOut = v13;
lsb4 = (unsigned __int8)bufferOut & 0xF;
bufferOut = (__int64 *)((char *)bufferOut - lsb4 + 16);
Size = lsb4 + Size - 16;
}
counter1 = Size >> 7;
if ( Size >> 7 )
{
// Unrolled loop: 8 aligned 16-byte stores = 128 bytes per iteration
// (bufferOut is an __int64 pointer, so += 16 advances 128 bytes).
do
{
*(__m128i *)bufferOut = v13;
*((__m128i *)bufferOut + 1) = v13;
bufferOut += 16;
*((__m128i *)bufferOut - 6) = v13;
*((__m128i *)bufferOut - 5) = v13;
--counter1;
*((__m128i *)bufferOut - 4) = v13;
*((__m128i *)bufferOut - 3) = v13;
*((__m128i *)bufferOut - 2) = v13;
*((__m128i *)bufferOut - 1) = v13;
}
while ( counter1 );
Size &= 0x7Fu;
}
// Remaining whole 16-byte chunks.
for ( k = Size >> 4; k; --k )
{
*(__m128i *)bufferOut = v13;
bufferOut += 2;
}
// Tail of 1..15 bytes: one unaligned 16-byte store that ends exactly at the
// last byte to fill, overlapping bytes already written.
lsb4_ = Size & 0xF;
if ( lsb4_ )
*(__m128i *)((char *)bufferOut + lsb4_ - 16) = v13;
return Dst;
}
}
and the disassembly, by IDA too:
.text:00007FFFF4AF6440 ; void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
.text:00007FFFF4AF6440 CompressPacket proc near ; CODE XREF: j_memset↑j
.text:00007FFFF4AF6440 ; Concurrency::details::ResourceManager::CreateAllocatedNodeData(void)+49↑p ...
.text:00007FFFF4AF6440 mov r11, rcx ; keep Dst in r11; it is copied to rax before every return
.text:00007FFFF4AF6443 movzx edx, dl ; Move with Zero-Extend: keep only the fill byte
.text:00007FFFF4AF6446 cmp r8, 10h ; switch 16 cases
.text:00007FFFF4AF644A jb SetBytes15 ; Jump if Below (CF=1): sizes 0..15 use the jump table
.text:00007FFFF4AF6450
.text:00007FFFF4AF6450 def_7FFFF4AF65D2: ; jumptable 00007FFFF4AF65D2 default case
.text:00007FFFF4AF6450 bt cs:dword_7FFFF4B237D8, 1 ; flag bit 1 set -> fall through to rep stosb
.text:00007FFFF4AF6458 jnb short mset05 ; Jump if Not Below (CF=0)
.text:00007FFFF4AF645A push rdi
.text:00007FFFF4AF645B mov rdi, rcx ; rdi = Dst (rcx has not been modified on this path)
.text:00007FFFF4AF645E mov eax, edx ; al = fill byte
.text:00007FFFF4AF6460 mov rcx, r8 ; rcx = byte count
.text:00007FFFF4AF6463 rep stosb ; Store String: write al to [rdi], rcx times
.text:00007FFFF4AF6465 pop rdi
.text:00007FFFF4AF6466 jmp short mset60 ; Jump
.text:00007FFFF4AF6468 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6468
.text:00007FFFF4AF6468 mset05: ; CODE XREF: CompressPacket+18↑j
.text:00007FFFF4AF6468 mov r9, 101010101010101h ; multiplier that copies a byte into all 8 bytes
.text:00007FFFF4AF6472 imul rdx, r9 ; Signed Multiply: rdx = fill byte repeated 8 times
.text:00007FFFF4AF6476 bt cs:dword_7FFFF4B237D8, 2 ; Bit Test: flag bit 2 set -> XMM path
.text:00007FFFF4AF647E jb msetxmm10 ; Jump if Below (CF=1)
.text:00007FFFF4AF6484 cmp r8, 40h ; '@' ; Compare Two Operands
.text:00007FFFF4AF6488 jb short mset20 ; Jump if Below (CF=1)
.text:00007FFFF4AF648A neg rcx ; Two's Complement Negation
.text:00007FFFF4AF648D and ecx, 7 ; Logical AND: bytes needed to reach 8-byte alignment
.text:00007FFFF4AF6490 jz short mset10 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6492 sub r8, rcx ; Integer Subtraction
.text:00007FFFF4AF6495 mov [r11], rdx ; one unaligned qword store covers the misaligned head
.text:00007FFFF4AF6498
.text:00007FFFF4AF6498 mset10: ; CODE XREF: CompressPacket+50↑j
.text:00007FFFF4AF6498 add rcx, r11 ; Add: rcx = Dst + alignment skip
.text:00007FFFF4AF649B mov r9, r8
.text:00007FFFF4AF649E and r8, 3Fh ; Logical AND
.text:00007FFFF4AF64A2 shr r9, 6 ; Shift Logical Right: number of 64-byte blocks
.text:00007FFFF4AF64A6 jnz short mset80 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64A8
.text:00007FFFF4AF64A8 mset20: ; CODE XREF: CompressPacket+48↑j
.text:00007FFFF4AF64A8 ; CompressPacket+CF↓j
.text:00007FFFF4AF64A8 mov r9, r8
.text:00007FFFF4AF64AB and r8, 7 ; Logical AND
.text:00007FFFF4AF64AF shr r9, 3 ; Shift Logical Right: number of remaining qwords
.text:00007FFFF4AF64B3 jz short mset40 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64B5 db 66h, 66h
.text:00007FFFF4AF64B5 xchg ax, ax ; Exchange Register/Memory with Register
.text:00007FFFF4AF64B9 nop ; No Operation
.text:00007FFFF4AF64BA
.text:00007FFFF4AF64BA mset30: ; CODE XREF: CompressPacket+84↓j
.text:00007FFFF4AF64BA mov [rcx], rdx
.text:00007FFFF4AF64BD add rcx, 8 ; Add
.text:00007FFFF4AF64C1 dec r9 ; Decrement by 1
.text:00007FFFF4AF64C4 jnz short mset30 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64C6
.text:00007FFFF4AF64C6 mset40: ; CODE XREF: CompressPacket+73↑j
.text:00007FFFF4AF64C6 test r8, r8 ; Logical Compare
.text:00007FFFF4AF64C9 jz short mset60 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64CB
.text:00007FFFF4AF64CB mset50: ; CODE XREF: CompressPacket+93↓j
.text:00007FFFF4AF64CB mov [rcx], dl ; last 1..7 bytes, one at a time
.text:00007FFFF4AF64CD inc rcx ; Increment by 1
.text:00007FFFF4AF64D0 dec r8 ; Decrement by 1
.text:00007FFFF4AF64D3 jnz short mset50 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64D5
.text:00007FFFF4AF64D5 mset60: ; CODE XREF: CompressPacket+26↑j
.text:00007FFFF4AF64D5 ; CompressPacket+89↑j
.text:00007FFFF4AF64D5 mov rax, r11 ; return Dst
.text:00007FFFF4AF64D8 retn ; Return Near from Procedure
.text:00007FFFF4AF64D8 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64D9 db 0Fh, 1Fh, 80h, 4 dup(0)
.text:00007FFFF4AF64E0 db 3 dup(66h), 90h
.text:00007FFFF4AF64E4 db 2 dup(66h), 90h
.text:00007FFFF4AF64E7 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64E7
.text:00007FFFF4AF64E7 mset80: ; CODE XREF: CompressPacket+66↑j
.text:00007FFFF4AF64E7 ; CompressPacket+CD↓j
.text:00007FFFF4AF64E7 mov [rcx], rdx ; unrolled: 8 qword stores = 64 bytes per iteration
.text:00007FFFF4AF64EA mov [rcx+8], rdx
.text:00007FFFF4AF64EE mov [rcx+10h], rdx
.text:00007FFFF4AF64F2 add rcx, 40h ; '@' ; Add
.text:00007FFFF4AF64F6 mov [rcx-28h], rdx
.text:00007FFFF4AF64FA mov [rcx-20h], rdx
.text:00007FFFF4AF64FE dec r9 ; Decrement by 1
.text:00007FFFF4AF6501 mov [rcx-18h], rdx
.text:00007FFFF4AF6505 mov [rcx-10h], rdx
.text:00007FFFF4AF6509 mov [rcx-8], rdx
.text:00007FFFF4AF650D jnz short mset80 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF650F jmp short mset20 ; Jump
.text:00007FFFF4AF650F ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6511 align 20h
.text:00007FFFF4AF6520
.text:00007FFFF4AF6520 msetxmm10: ; CODE XREF: CompressPacket+3E↑j
.text:00007FFFF4AF6520 movq xmm0, rdx ; Move 64 bits: low qword of xmm0 = 8 copies of the fill byte
.text:00007FFFF4AF6525 punpcklbw xmm0, xmm0 ; Unpack Low Packed Data (Byte->Word): interleave the low 8 bytes with themselves -> 16 copies
.text:00007FFFF4AF6529 test cl, 0Fh ; Logical Compare: is Dst 16-byte aligned?
.text:00007FFFF4AF652C jz short msetxmm20 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF652E movups xmmword ptr [rcx], xmm0 ; Move Unaligned Four Packed Single-FP: covers the misaligned head
.text:00007FFFF4AF6531 mov rax, rcx
.text:00007FFFF4AF6534 and rax, 0Fh ; Logical AND: rax = Dst & 15
.text:00007FFFF4AF6538 add rcx, 10h ; Add
.text:00007FFFF4AF653C sub rcx, rax ; Integer Subtraction: rcx = next 16-byte boundary
.text:00007FFFF4AF653F lea r8, [rax+r8-10h] ; Load Effective Address: Size -= 16 - (Dst & 15)
.text:00007FFFF4AF6544
.text:00007FFFF4AF6544 msetxmm20: ; CODE XREF: CompressPacket+EC↑j
.text:00007FFFF4AF6544 mov r9, r8
.text:00007FFFF4AF6547 shr r9, 7 ; Shift Logical Right: number of 128-byte blocks
.text:00007FFFF4AF654B jz short msetxmm40 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF654D jmp short msetxmm30 ; Jump
.text:00007FFFF4AF654D ; ---------------------------------------------------------------------------
.text:00007FFFF4AF654F align 10h
.text:00007FFFF4AF6550
.text:00007FFFF4AF6550 msetxmm30: ; CODE XREF: CompressPacket+10D↑j
.text:00007FFFF4AF6550 ; CompressPacket+139↓j
.text:00007FFFF4AF6550 movaps xmmword ptr [rcx], xmm0 ; Move Aligned Four Packed Single-FP: 8 stores = 128 bytes per iteration
.text:00007FFFF4AF6553 movaps xmmword ptr [rcx+10h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6557 add rcx, 80h ; '€' ; Add
.text:00007FFFF4AF655E movaps xmmword ptr [rcx-60h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6562 movaps xmmword ptr [rcx-50h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6566 dec r9 ; Decrement by 1
.text:00007FFFF4AF6569 movaps xmmword ptr [rcx-40h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF656D movaps xmmword ptr [rcx-30h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6571 movaps xmmword ptr [rcx-20h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6575 movaps xmmword ptr [rcx-10h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6579 jnz short msetxmm30 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF657B and r8, 7Fh ; Logical AND
.text:00007FFFF4AF657F
.text:00007FFFF4AF657F msetxmm40: ; CODE XREF: CompressPacket+10B↑j
.text:00007FFFF4AF657F mov r9, r8
.text:00007FFFF4AF6582 shr r9, 4 ; Shift Logical Right: number of remaining 16-byte chunks
.text:00007FFFF4AF6586 jz short msetxmm60 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6588 nop dword ptr [rax+rax+00000000h] ; No Operation
.text:00007FFFF4AF6590
.text:00007FFFF4AF6590 msetxmm50: ; CODE XREF: CompressPacket+15A↓j
.text:00007FFFF4AF6590 movaps xmmword ptr [rcx], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6593 add rcx, 10h ; Add
.text:00007FFFF4AF6597 dec r9 ; Decrement by 1
.text:00007FFFF4AF659A jnz short msetxmm50 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF659C
.text:00007FFFF4AF659C msetxmm60: ; CODE XREF: CompressPacket+146↑j
.text:00007FFFF4AF659C and r8, 0Fh ; Logical AND
.text:00007FFFF4AF65A0 jz short msetxmm70 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF65A2 movups xmmword ptr [r8+rcx-10h], xmm0 ; Move Unaligned Four Packed Single-FP: tail store ending at the last byte, overlapping earlier stores
.text:00007FFFF4AF65A8
.text:00007FFFF4AF65A8 msetxmm70: ; CODE XREF: CompressPacket+160↑j
.text:00007FFFF4AF65A8 mov rax, r11 ; return Dst
.text:00007FFFF4AF65AB retn ; Return Near from Procedure
.text:00007FFFF4AF65AC ; ---------------------------------------------------------------------------
.text:00007FFFF4AF65AC
.text:00007FFFF4AF65AC SetBytes15: ; CODE XREF: CompressPacket+A↑j
.text:00007FFFF4AF65AC mov r9, 101010101010101h ; multiplier that copies a byte into all 8 bytes
.text:00007FFFF4AF65B6 imul rdx, r9 ; Signed Multiply: rdx = fill byte repeated 8 times
.text:00007FFFF4AF65BA lea r9, cs:7FFFF4AB0000h ; Load Effective Address: base that the table offsets are relative to
.text:00007FFFF4AF65C1 mov eax, ds:(jpt_7FFFF4AF65D2 - 7FFFF4AB0000h)[r9+r8*4] ; 32-bit table entry indexed by Size
.text:00007FFFF4AF65C9 add r9, rax ; Add
.text:00007FFFF4AF65CC add rcx, r8 ; Add: rcx = Dst + Size; the cases store at negative offsets
.text:00007FFFF4AF65CF mov rax, r11 ; return value = Dst
.text:00007FFFF4AF65D2 jmp r9 ; switch jump
.text:00007FFFF4AF65D2 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF65D5 jpt_7FFFF4AF65D2 dd offset msetTab00 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 ; DATA XREF: CompressPacket+181↑r
.text:00007FFFF4AF65D5 dd offset msetTab01 - 7FFFF4AB0000h ; jump table for switch statement
.text:00007FFFF4AF65D5 dd offset msetTab02 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab03 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab04 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab05 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab06 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab07 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab08 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab09 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab10 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab11 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab12 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab13 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab14 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab15 - 7FFFF4AB0000h
.text:00007FFFF4AF6615 align 20h
.text:00007FFFF4AF6620
.text:00007FFFF4AF6620 msetTab15: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6620 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6620 mov [rcx-0Fh], rdx ; jumptable 00007FFFF4AF65D2 case 15
.text:00007FFFF4AF6624
.text:00007FFFF4AF6624 msetTab07: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6624 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6624 mov [rcx-7], edx ; jumptable 00007FFFF4AF65D2 case 7
.text:00007FFFF4AF6627
.text:00007FFFF4AF6627 msetTab03: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6627 ; CompressPacket+1F3↓j
.text:00007FFFF4AF6627 ; DATA XREF: ...
.text:00007FFFF4AF6627 mov [rcx-3], dx ; jumptable 00007FFFF4AF65D2 case 3
.text:00007FFFF4AF662B
.text:00007FFFF4AF662B msetTab01: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662B ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662B mov [rcx-1], dl ; jumptable 00007FFFF4AF65D2 case 1
.text:00007FFFF4AF662E
.text:00007FFFF4AF662E msetTab00: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662E ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662E retn ; jumptable 00007FFFF4AF65D2 case 0
.text:00007FFFF4AF662F ; ---------------------------------------------------------------------------
.text:00007FFFF4AF662F
.text:00007FFFF4AF662F msetTab11: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662F ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662F mov [rcx-0Bh], rdx ; jumptable 00007FFFF4AF65D2 case 11
.text:00007FFFF4AF6633 jmp short msetTab03 ; jumptable 00007FFFF4AF65D2 case 3
.text:00007FFFF4AF6635 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6635
.text:00007FFFF4AF6635 msetTab14: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6635 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6635 mov [rcx-0Eh], rdx ; jumptable 00007FFFF4AF65D2 case 14
.text:00007FFFF4AF6639
.text:00007FFFF4AF6639 msetTab06: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6639 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6639 mov [rcx-6], edx ; jumptable 00007FFFF4AF65D2 case 6
.text:00007FFFF4AF663C
.text:00007FFFF4AF663C msetTab02: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF663C ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF663C mov [rcx-2], dx ; jumptable 00007FFFF4AF65D2 case 2
.text:00007FFFF4AF6640 retn ; Return Near from Procedure
.text:00007FFFF4AF6641 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6641
.text:00007FFFF4AF6641 msetTab13: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6641 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6641 mov [rcx-0Dh], rdx ; jumptable 00007FFFF4AF65D2 case 13
.text:00007FFFF4AF6645
.text:00007FFFF4AF6645 msetTab05: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6645 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6645 mov [rcx-5], edx ; jumptable 00007FFFF4AF65D2 case 5
.text:00007FFFF4AF6648 mov [rcx-1], dl
.text:00007FFFF4AF664B retn ; Return Near from Procedure
.text:00007FFFF4AF664C ; ---------------------------------------------------------------------------
.text:00007FFFF4AF664C
.text:00007FFFF4AF664C msetTab12: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF664C ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF664C mov [rcx-0Ch], rdx ; jumptable 00007FFFF4AF65D2 case 12
.text:00007FFFF4AF6650
.text:00007FFFF4AF6650 msetTab04: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6650 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6650 mov [rcx-4], edx ; jumptable 00007FFFF4AF65D2 case 4
.text:00007FFFF4AF6653 retn ; Return Near from Procedure
.text:00007FFFF4AF6654 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6654
.text:00007FFFF4AF6654 msetTab10: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6654 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6654 mov [rcx-0Ah], rdx ; jumptable 00007FFFF4AF65D2 case 10
.text:00007FFFF4AF6658 mov [rcx-2], dx
.text:00007FFFF4AF665C retn ; Return Near from Procedure
.text:00007FFFF4AF665D ; ---------------------------------------------------------------------------
.text:00007FFFF4AF665D
.text:00007FFFF4AF665D msetTab09: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF665D ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF665D mov [rcx-9], rdx ; jumptable 00007FFFF4AF65D2 case 9
.text:00007FFFF4AF6661 mov [rcx-1], dl
.text:00007FFFF4AF6664 retn ; Return Near from Procedure
.text:00007FFFF4AF6665 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6665
.text:00007FFFF4AF6665 msetTab08: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6665 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6665 mov [rcx-8], rdx ; jumptable 00007FFFF4AF65D2 case 8
.text:00007FFFF4AF6669 retn ; Return Near from Procedure
.text:00007FFFF4AF6669 CompressPacket endp
A common use case is unpacking with zeros to widen 8-bit numbers to 16-bit (with zero-extension), like SSE4.1 pmovzxbw
. Or especially to unpack both low and high halves of a 16-byte register to get two vectors of 8x 16-bit elements each. That's kind of the only use case where the "unpack" name makes sense, and packuswb
is its inverse, combining 2 registers down to 1. (Or packsswb
for signed saturation.)
The "unpack" name is otherwise very strange; it's just a shuffle that interleaves elements from two registers. ARM NEON has a similar shuffle whose mnemonic is "zip".
In your case, it's part of broadcasting a byte into an XMM register, as part of memset. i.e. it's part of what _mm_set1_epi8(x)
does.
Multiply with 0x0101010101010101
repeats a byte 8 times in a 64-bit integer. This lets you use scalar-integer stores for an odd 8 bytes (not a multiple of 16), like the mov [r11], rdx
store.
Given this 8-byte broadcast as an input (via movq
), only one SIMD shuffle is needed. Duplicating the low 8 with punpcklqdq
would have been my choice because 8-byte granularity shuffles are more efficient on really old CPUs like Core 2. But interleaving the bytes with each other is equivalent because they're all the same anyway, resulting in an XMM register that holds 16 copies of the same byte.
In fact, SSE2 can broadcast a dword with one instruction: pshufd xmm0, xmm0, 0
, so if not for wanting an 8-byte scalar, it could have just used imul edx, edx, 0x01010101
.
Implementing memset with 8-byte mov
and 16-byte movups
stores of course needs this as an input, if it's using that strategy instead of the rep stosb
strategy.
With SSSE3 you can broadcast a single byte directly with one pshufb
with an all-zero vector (without needing a multiply first) selecting the 0th element of the source for every element of the destination. Or with AVX2 vpbroadcastb
. Skipping the integer multiply step would be fine; you can use movq [mem], xmm0
8-byte stores from xmm0 instead of from RDX.
With a byte at the bottom of an xmm register and garbage in the other elements (i.e. if you didn't use imul
), 2x punpcklbw
+ pshufd
can broadcast with just SSE2. Or of course punpcklbw xmm0,xmm0
/ punpcklwd xmm0,xmm0
as the first 2 shuffles. Or punpcklbw xmm0,xmm0
/ pshuflw xmm0,xmm0, 0
/ punpcklqdq xmm0,xmm0
.