Search code examples
assembly compression sse disassembly memset

What are the use cases of punpcklbw (interleave in MMX/SSE/AVX)?


  1. Which classes of algorithms could be using punpcklbw ?

  2. In particular, what is punpcklbw xmm0, xmm0 doing?

  3. And yet, what is maskedPow2_Value useful for ?

    maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val; // Val is int
    maskedPow2_Value = 0x101010101010101i64 * maskedValue;

(or mov r9, 101010101010101h; imul rdx, r9; twice)

A complete example (the function is named CompressPacket, but the name may be misleading), as the result of decompilation by IDA:

/*
 * Hex-Rays pseudo-C for the routine the asker labelled CompressPacket.
 *
 * What the code shows: the low byte of Val is replicated across a 64-bit
 * value, that pattern is stored over the Size bytes starting at Dst, and Dst
 * is returned.
 *
 *   Dst  - start of the region to fill.
 *   Val  - only its low 8 bits are used (see the (unsigned __int8) cast).
 *   Size - number of bytes to fill.
 *   returns Dst.
 *
 * Layout:
 *   - Size 0..15  : jump-table cases that address the stores from the END of
 *                   the region (bufferOut == Dst + Size) and use overlapping
 *                   8/4/2/1-byte stores, so no loop is needed.
 *   - Size >= 16  : the default case, which picks one of three strategies
 *                   from bits 1 and 2 of dword_7FFFF4B237D8 (presumably a
 *                   CPU-feature/tuning flag word -- its meaning is not
 *                   visible here).
 *
 * NOTE(review): this is decompiler output and disagrees with the accompanying
 * disassembly in two places; both are marked below where they occur.
 */
void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
{
  __int64 maskedPow2_Value; // rdx
  unsigned int v5; // ecx
  __int64 *bufferOut; // rcx
  size_t size_; // r9
  size_t i; // r9
  size_t size__; // r9
  size_t counter; // r8
  size_t j; // r9
  void *result; // rax
  __m128i v13; // xmm0
  __int64 lsb4; // rax
  size_t counter1; // r9
  size_t k; // r9
  size_t lsb4_; // r8
  __int64 maskedValue; // rdx

  // Keep only the low byte of Val, zero-extended (movzx edx, dl in the listing).
  *(_QWORD *)&Val = (unsigned __int8)Val;
  // Multiplying a value below 256 by 0x0101010101010101 copies that byte into
  // all eight bytes of the 64-bit result.
  maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val;
  // One-past-the-end pointer used by the small-size cases.
  // NOTE(review): in the listing, "add rcx, r8" is executed only on the
  // SetBytes15 (Size < 16) path; on the default path rcx still equals Dst.
  // The decompiler hoisted this assignment, so inside `default:` bufferOut
  // should be read as Dst, not Dst + Size -- confirm against the listing.
  bufferOut = (__int64 *)((char *)Dst + Size);
  result = Dst;
  switch ( Size )
  {
    // Labels are named LBL_<small>_<large>: the two Size values that share
    // the tail of stores starting at that label.
    case 0ui64:
      return result;
    case 1ui64:
      goto LBL_1_F;
    case 2ui64:
      goto LBL_2_E;
    case 3ui64:
      goto LBL_3_F;
    case 4ui64:
      goto LBL_4_C;
    case 5ui64:
      goto LBL_5_D;
    case 6ui64:
      goto LBL_6_E;
    case 7ui64:
      goto LBL_7_F;
    case 8ui64:
      *(bufferOut - 1) = maskedValue;
      return result;
    case 9ui64:
      // 8 bytes at the start, then the final byte.
      *(__int64 *)((char *)bufferOut - 9) = maskedValue;
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    case 0xAui64:
      *(__int64 *)((char *)bufferOut - 10) = maskedValue;
      *((_WORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xBui64:
      // 8 bytes, then the 2 + 1 byte tail shared with Size == 3.
      *(__int64 *)((char *)bufferOut - 11) = maskedValue;
      goto LBL_3_F;
    case 0xCui64:
      *(__int64 *)((char *)bufferOut - 12) = maskedValue;
LBL_4_C:
      *((_DWORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xDui64:
      *(__int64 *)((char *)bufferOut - 13) = maskedValue;
LBL_5_D:
      *(_DWORD *)((char *)bufferOut - 5) = maskedValue;
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    case 0xEui64:
      *(__int64 *)((char *)bufferOut - 14) = maskedValue;
LBL_6_E:
      *(_DWORD *)((char *)bufferOut - 6) = maskedValue;
LBL_2_E:
      *((_WORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xFui64:
      // 8 + 4 + 2 + 1 bytes, each store placed relative to the end pointer.
      *(__int64 *)((char *)bufferOut - 15) = maskedValue;
LBL_7_F:
      *(_DWORD *)((char *)bufferOut - 7) = maskedValue;
LBL_3_F:
      *(_WORD *)((char *)bufferOut - 3) = maskedValue;
LBL_1_F:
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    default:
      // Size >= 16 from here on (0..15 are handled above).
      // Strategy 1: flag bit 1 set -> byte fill; the listing shows this as
      // "rep stosb" with rdi = Dst, which the decompiler renders as memset.
      if ( _bittest(dword_7FFFF4B237D8, 1u) )
      {
        memset(bufferOut, maskedValue, Size);
        return Dst;
      }
      // NOTE(review): the listing performs a single "imul rdx, r9" on each
      // path (mset05 here, SetBytes15 for the small sizes), so this appears
      // to be the same byte-broadcast value as maskedValue rather than a
      // second multiplication -- likely a decompiler artifact; confirm.
      maskedPow2_Value = 0x101010101010101i64 * maskedValue;
      // Strategy 2: flag bit 2 clear -> 64-bit integer stores.
      if ( !_bittest(dword_7FFFF4B237D8, 2u) )
      {
        if ( Size >= 0x40 )
        {
          // Bytes needed to reach the next 8-byte boundary (0 if aligned).
          v5 = -(int)bufferOut & 7;
          if ( v5 )
          {
            // One unaligned 8-byte store covers the misaligned head; the
            // aligned stores that follow overlap its tail.
            Size -= v5;
            *(_QWORD *)Dst = maskedPow2_Value;
          }
          bufferOut = (__int64 *)((char *)Dst + v5);
          size_ = Size;
          // Remainder left for the 8-byte / 1-byte loops below.
          Size &= 0x3Fu;
          // 64 bytes per iteration: seven qword stores in the body plus the
          // eighth, which the decompiler placed in the for-increment slot.
          for ( i = size_ >> 6; i; *(bufferOut - 1) = maskedPow2_Value )
          {
            *bufferOut = maskedPow2_Value;
            bufferOut[1] = maskedPow2_Value;
            bufferOut[2] = maskedPow2_Value;
            bufferOut += 8;
            *(bufferOut - 5) = maskedPow2_Value;
            *(bufferOut - 4) = maskedPow2_Value;
            --i;
            *(bufferOut - 3) = maskedPow2_Value;
            *(bufferOut - 2) = maskedPow2_Value;
          }
        }
        size__ = Size;
        counter = Size & 7;
        // Remaining whole qwords, one 8-byte store each.
        for ( j = size__ >> 3; j; --j )
          *bufferOut++ = maskedPow2_Value;
        // Final 0..7 bytes, one byte at a time.
        for ( ; counter; --counter )
        {
          *(_BYTE *)bufferOut = maskedPow2_Value;
          bufferOut = (__int64 *)((char *)bufferOut + 1);
        }
        return Dst;
      }
      // Strategy 3: flag bit 2 set -> 16-byte SSE stores.
      // movq xmm0, rdx / punpcklbw xmm0, xmm0 in the listing: interleaving the
      // low 8 bytes with themselves yields 16 bytes; because all 8 inputs are
      // the same byte, the result is that byte repeated 16 times.
      v13 = _mm_unpacklo_epi8((__m128i)(unsigned __int64)maskedPow2_Value, (__m128i)(unsigned __int64)maskedPow2_Value);
      if ( ((unsigned __int8)bufferOut & 0xF) != 0 )
      {
        // Unaligned 16-byte store (movups) over the head, then step to the
        // next 16-byte boundary and shrink Size by the (16 - lsb4) bytes
        // that were skipped.
        *(__m128i *)bufferOut = v13;
        lsb4 = (unsigned __int8)bufferOut & 0xF;
        bufferOut = (__int64 *)((char *)bufferOut - lsb4 + 16);
        Size = lsb4 + Size - 16;
      }
      counter1 = Size >> 7;
      if ( Size >> 7 )
      {
        // 128 bytes per iteration: eight aligned 16-byte stores (movaps).
        // bufferOut is an __int64 *, so "+= 16" advances 128 bytes.
        do
        {
          *(__m128i *)bufferOut = v13;
          *((__m128i *)bufferOut + 1) = v13;
          bufferOut += 16;
          *((__m128i *)bufferOut - 6) = v13;
          *((__m128i *)bufferOut - 5) = v13;
          --counter1;
          *((__m128i *)bufferOut - 4) = v13;
          *((__m128i *)bufferOut - 3) = v13;
          *((__m128i *)bufferOut - 2) = v13;
          *((__m128i *)bufferOut - 1) = v13;
        }
        while ( counter1 );
        Size &= 0x7Fu;
      }
      // Remaining whole 16-byte chunks ("+= 2" qwords == 16 bytes).
      for ( k = Size >> 4; k; --k )
      {
        *(__m128i *)bufferOut = v13;
        bufferOut += 2;
      }
      lsb4_ = Size & 0xF;
      // Tail of 1..15 bytes: a single unaligned 16-byte store that ENDS at
      // the end of the region and overlaps bytes already written.
      if ( lsb4_ )
        *(__m128i *)((char *)bufferOut + lsb4_ - 16) = v13;
      return Dst;
  }
}

and the disassembly, by IDA too:

.text:00007FFFF4AF6440 ; void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
.text:00007FFFF4AF6440 CompressPacket  proc near               ; CODE XREF: j_memset↑j
.text:00007FFFF4AF6440                                         ; Concurrency::details::ResourceManager::CreateAllocatedNodeData(void)+49↑p ...
.text:00007FFFF4AF6440                 mov     r11, rcx
.text:00007FFFF4AF6443                 movzx   edx, dl         ; Move with Zero-Extend
.text:00007FFFF4AF6446                 cmp     r8, 10h         ; switch 16 cases
.text:00007FFFF4AF644A                 jb      SetBytes15      ; Jump if Below (CF=1)
.text:00007FFFF4AF6450
.text:00007FFFF4AF6450 def_7FFFF4AF65D2:                       ; jumptable 00007FFFF4AF65D2 default case
.text:00007FFFF4AF6450                 bt      cs:dword_7FFFF4B237D8, 1
.text:00007FFFF4AF6458                 jnb     short mset05    ; Jump if Not Below (CF=0)
.text:00007FFFF4AF645A                 push    rdi
.text:00007FFFF4AF645B                 mov     rdi, rcx
.text:00007FFFF4AF645E                 mov     eax, edx
.text:00007FFFF4AF6460                 mov     rcx, r8
.text:00007FFFF4AF6463                 rep stosb               ; Store String
.text:00007FFFF4AF6465                 pop     rdi
.text:00007FFFF4AF6466                 jmp     short mset60    ; Jump
.text:00007FFFF4AF6468 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6468
.text:00007FFFF4AF6468 mset05:                                 ; CODE XREF: CompressPacket+18↑j
.text:00007FFFF4AF6468                 mov     r9, 101010101010101h
.text:00007FFFF4AF6472                 imul    rdx, r9         ; Signed Multiply
.text:00007FFFF4AF6476                 bt      cs:dword_7FFFF4B237D8, 2 ; Bit Test
.text:00007FFFF4AF647E                 jb      msetxmm10       ; Jump if Below (CF=1)
.text:00007FFFF4AF6484                 cmp     r8, 40h ; '@'   ; Compare Two Operands
.text:00007FFFF4AF6488                 jb      short mset20    ; Jump if Below (CF=1)
.text:00007FFFF4AF648A                 neg     rcx             ; Two's Complement Negation
.text:00007FFFF4AF648D                 and     ecx, 7          ; Logical AND
.text:00007FFFF4AF6490                 jz      short mset10    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6492                 sub     r8, rcx         ; Integer Subtraction
.text:00007FFFF4AF6495                 mov     [r11], rdx
.text:00007FFFF4AF6498
.text:00007FFFF4AF6498 mset10:                                 ; CODE XREF: CompressPacket+50↑j
.text:00007FFFF4AF6498                 add     rcx, r11        ; Add
.text:00007FFFF4AF649B                 mov     r9, r8
.text:00007FFFF4AF649E                 and     r8, 3Fh         ; Logical AND
.text:00007FFFF4AF64A2                 shr     r9, 6           ; Shift Logical Right
.text:00007FFFF4AF64A6                 jnz     short mset80    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64A8
.text:00007FFFF4AF64A8 mset20:                                 ; CODE XREF: CompressPacket+48↑j
.text:00007FFFF4AF64A8                                         ; CompressPacket+CF↓j
.text:00007FFFF4AF64A8                 mov     r9, r8
.text:00007FFFF4AF64AB                 and     r8, 7           ; Logical AND
.text:00007FFFF4AF64AF                 shr     r9, 3           ; Shift Logical Right
.text:00007FFFF4AF64B3                 jz      short mset40    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64B5                 db      66h, 66h
.text:00007FFFF4AF64B5                 xchg    ax, ax          ; Exchange Register/Memory with Register
.text:00007FFFF4AF64B9                 nop                     ; No Operation
.text:00007FFFF4AF64BA
.text:00007FFFF4AF64BA mset30:                                 ; CODE XREF: CompressPacket+84↓j
.text:00007FFFF4AF64BA                 mov     [rcx], rdx
.text:00007FFFF4AF64BD                 add     rcx, 8          ; Add
.text:00007FFFF4AF64C1                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF64C4                 jnz     short mset30    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64C6
.text:00007FFFF4AF64C6 mset40:                                 ; CODE XREF: CompressPacket+73↑j
.text:00007FFFF4AF64C6                 test    r8, r8          ; Logical Compare
.text:00007FFFF4AF64C9                 jz      short mset60    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64CB
.text:00007FFFF4AF64CB mset50:                                 ; CODE XREF: CompressPacket+93↓j
.text:00007FFFF4AF64CB                 mov     [rcx], dl
.text:00007FFFF4AF64CD                 inc     rcx             ; Increment by 1
.text:00007FFFF4AF64D0                 dec     r8              ; Decrement by 1
.text:00007FFFF4AF64D3                 jnz     short mset50    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64D5
.text:00007FFFF4AF64D5 mset60:                                 ; CODE XREF: CompressPacket+26↑j
.text:00007FFFF4AF64D5                                         ; CompressPacket+89↑j
.text:00007FFFF4AF64D5                 mov     rax, r11
.text:00007FFFF4AF64D8                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF64D8 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64D9                 db 0Fh, 1Fh, 80h, 4 dup(0)
.text:00007FFFF4AF64E0                 db 3 dup(66h), 90h
.text:00007FFFF4AF64E4                 db 2 dup(66h), 90h
.text:00007FFFF4AF64E7 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64E7
.text:00007FFFF4AF64E7 mset80:                                 ; CODE XREF: CompressPacket+66↑j
.text:00007FFFF4AF64E7                                         ; CompressPacket+CD↓j
.text:00007FFFF4AF64E7                 mov     [rcx], rdx
.text:00007FFFF4AF64EA                 mov     [rcx+8], rdx
.text:00007FFFF4AF64EE                 mov     [rcx+10h], rdx
.text:00007FFFF4AF64F2                 add     rcx, 40h ; '@'  ; Add
.text:00007FFFF4AF64F6                 mov     [rcx-28h], rdx
.text:00007FFFF4AF64FA                 mov     [rcx-20h], rdx
.text:00007FFFF4AF64FE                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF6501                 mov     [rcx-18h], rdx
.text:00007FFFF4AF6505                 mov     [rcx-10h], rdx
.text:00007FFFF4AF6509                 mov     [rcx-8], rdx
.text:00007FFFF4AF650D                 jnz     short mset80    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF650F                 jmp     short mset20    ; Jump
.text:00007FFFF4AF650F ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6511                 align 20h
.text:00007FFFF4AF6520
.text:00007FFFF4AF6520 msetxmm10:                              ; CODE XREF: CompressPacket+3E↑j
.text:00007FFFF4AF6520                 movq    xmm0, rdx       ; Move 64 bits
.text:00007FFFF4AF6525                 punpcklbw xmm0, xmm0    ; Unpack Low Packed Data (Byte->Word)
.text:00007FFFF4AF6529                 test    cl, 0Fh         ; Logical Compare
.text:00007FFFF4AF652C                 jz      short msetxmm20 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF652E                 movups  xmmword ptr [rcx], xmm0 ; Move Unaligned Four Packed Single-FP
.text:00007FFFF4AF6531                 mov     rax, rcx
.text:00007FFFF4AF6534                 and     rax, 0Fh        ; Logical AND
.text:00007FFFF4AF6538                 add     rcx, 10h        ; Add
.text:00007FFFF4AF653C                 sub     rcx, rax        ; Integer Subtraction
.text:00007FFFF4AF653F                 lea     r8, [rax+r8-10h] ; Load Effective Address
.text:00007FFFF4AF6544
.text:00007FFFF4AF6544 msetxmm20:                              ; CODE XREF: CompressPacket+EC↑j
.text:00007FFFF4AF6544                 mov     r9, r8
.text:00007FFFF4AF6547                 shr     r9, 7           ; Shift Logical Right
.text:00007FFFF4AF654B                 jz      short msetxmm40 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF654D                 jmp     short msetxmm30 ; Jump
.text:00007FFFF4AF654D ; ---------------------------------------------------------------------------
.text:00007FFFF4AF654F                 align 10h
.text:00007FFFF4AF6550
.text:00007FFFF4AF6550 msetxmm30:                              ; CODE XREF: CompressPacket+10D↑j
.text:00007FFFF4AF6550                                         ; CompressPacket+139↓j
.text:00007FFFF4AF6550                 movaps  xmmword ptr [rcx], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6553                 movaps  xmmword ptr [rcx+10h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6557                 add     rcx, 80h ; '€'  ; Add
.text:00007FFFF4AF655E                 movaps  xmmword ptr [rcx-60h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6562                 movaps  xmmword ptr [rcx-50h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6566                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF6569                 movaps  xmmword ptr [rcx-40h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF656D                 movaps  xmmword ptr [rcx-30h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6571                 movaps  xmmword ptr [rcx-20h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6575                 movaps  xmmword ptr [rcx-10h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6579                 jnz     short msetxmm30 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF657B                 and     r8, 7Fh         ; Logical AND
.text:00007FFFF4AF657F
.text:00007FFFF4AF657F msetxmm40:                              ; CODE XREF: CompressPacket+10B↑j
.text:00007FFFF4AF657F                 mov     r9, r8
.text:00007FFFF4AF6582                 shr     r9, 4           ; Shift Logical Right
.text:00007FFFF4AF6586                 jz      short msetxmm60 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6588                 nop     dword ptr [rax+rax+00000000h] ; No Operation
.text:00007FFFF4AF6590
.text:00007FFFF4AF6590 msetxmm50:                              ; CODE XREF: CompressPacket+15A↓j
.text:00007FFFF4AF6590                 movaps  xmmword ptr [rcx], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6593                 add     rcx, 10h        ; Add
.text:00007FFFF4AF6597                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF659A                 jnz     short msetxmm50 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF659C
.text:00007FFFF4AF659C msetxmm60:                              ; CODE XREF: CompressPacket+146↑j
.text:00007FFFF4AF659C                 and     r8, 0Fh         ; Logical AND
.text:00007FFFF4AF65A0                 jz      short msetxmm70 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF65A2                 movups  xmmword ptr [r8+rcx-10h], xmm0 ; Move Unaligned Four Packed Single-FP
.text:00007FFFF4AF65A8
.text:00007FFFF4AF65A8 msetxmm70:                              ; CODE XREF: CompressPacket+160↑j
.text:00007FFFF4AF65A8                 mov     rax, r11
.text:00007FFFF4AF65AB                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF65AC ; ---------------------------------------------------------------------------
.text:00007FFFF4AF65AC
.text:00007FFFF4AF65AC SetBytes15:                             ; CODE XREF: CompressPacket+A↑j
.text:00007FFFF4AF65AC                 mov     r9, 101010101010101h
.text:00007FFFF4AF65B6                 imul    rdx, r9         ; Signed Multiply
.text:00007FFFF4AF65BA                 lea     r9, cs:7FFFF4AB0000h ; Load Effective Address
.text:00007FFFF4AF65C1                 mov     eax, ds:(jpt_7FFFF4AF65D2 - 7FFFF4AB0000h)[r9+r8*4]
.text:00007FFFF4AF65C9                 add     r9, rax         ; Add
.text:00007FFFF4AF65CC                 add     rcx, r8         ; Add
.text:00007FFFF4AF65CF                 mov     rax, r11
.text:00007FFFF4AF65D2                 jmp     r9              ; switch jump
.text:00007FFFF4AF65D2 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF65D5 jpt_7FFFF4AF65D2 dd offset msetTab00 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                                         ; DATA XREF: CompressPacket+181↑r
.text:00007FFFF4AF65D5                 dd offset msetTab01 - 7FFFF4AB0000h ; jump table for switch statement
.text:00007FFFF4AF65D5                 dd offset msetTab02 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab03 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab04 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab05 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab06 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab07 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab08 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab09 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab10 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab11 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab12 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab13 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab14 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab15 - 7FFFF4AB0000h
.text:00007FFFF4AF6615                 align 20h
.text:00007FFFF4AF6620
.text:00007FFFF4AF6620 msetTab15:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6620                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6620                 mov     [rcx-0Fh], rdx  ; jumptable 00007FFFF4AF65D2 case 15
.text:00007FFFF4AF6624
.text:00007FFFF4AF6624 msetTab07:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6624                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6624                 mov     [rcx-7], edx    ; jumptable 00007FFFF4AF65D2 case 7
.text:00007FFFF4AF6627
.text:00007FFFF4AF6627 msetTab03:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6627                                         ; CompressPacket+1F3↓j
.text:00007FFFF4AF6627                                         ; DATA XREF: ...
.text:00007FFFF4AF6627                 mov     [rcx-3], dx     ; jumptable 00007FFFF4AF65D2 case 3
.text:00007FFFF4AF662B
.text:00007FFFF4AF662B msetTab01:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662B                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662B                 mov     [rcx-1], dl     ; jumptable 00007FFFF4AF65D2 case 1
.text:00007FFFF4AF662E
.text:00007FFFF4AF662E msetTab00:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662E                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662E                 retn                    ; jumptable 00007FFFF4AF65D2 case 0
.text:00007FFFF4AF662F ; ---------------------------------------------------------------------------
.text:00007FFFF4AF662F
.text:00007FFFF4AF662F msetTab11:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662F                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662F                 mov     [rcx-0Bh], rdx  ; jumptable 00007FFFF4AF65D2 case 11
.text:00007FFFF4AF6633                 jmp     short msetTab03 ; jumptable 00007FFFF4AF65D2 case 3
.text:00007FFFF4AF6635 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6635
.text:00007FFFF4AF6635 msetTab14:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6635                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6635                 mov     [rcx-0Eh], rdx  ; jumptable 00007FFFF4AF65D2 case 14
.text:00007FFFF4AF6639
.text:00007FFFF4AF6639 msetTab06:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6639                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6639                 mov     [rcx-6], edx    ; jumptable 00007FFFF4AF65D2 case 6
.text:00007FFFF4AF663C
.text:00007FFFF4AF663C msetTab02:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF663C                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF663C                 mov     [rcx-2], dx     ; jumptable 00007FFFF4AF65D2 case 2
.text:00007FFFF4AF6640                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF6641 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6641
.text:00007FFFF4AF6641 msetTab13:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6641                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6641                 mov     [rcx-0Dh], rdx  ; jumptable 00007FFFF4AF65D2 case 13
.text:00007FFFF4AF6645
.text:00007FFFF4AF6645 msetTab05:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6645                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6645                 mov     [rcx-5], edx    ; jumptable 00007FFFF4AF65D2 case 5
.text:00007FFFF4AF6648                 mov     [rcx-1], dl
.text:00007FFFF4AF664B                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF664C ; ---------------------------------------------------------------------------
.text:00007FFFF4AF664C
.text:00007FFFF4AF664C msetTab12:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF664C                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF664C                 mov     [rcx-0Ch], rdx  ; jumptable 00007FFFF4AF65D2 case 12
.text:00007FFFF4AF6650
.text:00007FFFF4AF6650 msetTab04:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6650                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6650                 mov     [rcx-4], edx    ; jumptable 00007FFFF4AF65D2 case 4
.text:00007FFFF4AF6653                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF6654 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6654
.text:00007FFFF4AF6654 msetTab10:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6654                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6654                 mov     [rcx-0Ah], rdx  ; jumptable 00007FFFF4AF65D2 case 10
.text:00007FFFF4AF6658                 mov     [rcx-2], dx
.text:00007FFFF4AF665C                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF665D ; ---------------------------------------------------------------------------
.text:00007FFFF4AF665D
.text:00007FFFF4AF665D msetTab09:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF665D                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF665D                 mov     [rcx-9], rdx    ; jumptable 00007FFFF4AF65D2 case 9
.text:00007FFFF4AF6661                 mov     [rcx-1], dl
.text:00007FFFF4AF6664                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF6665 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6665
.text:00007FFFF4AF6665 msetTab08:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6665                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6665                 mov     [rcx-8], rdx    ; jumptable 00007FFFF4AF65D2 case 8
.text:00007FFFF4AF6669                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF6669 CompressPacket  endp

Solution

  • A common use case is unpacking with zeros to widen 8-bit numbers to 16-bit (with zero-extension), like SSE4.1 pmovzxbw. Or especially to unpack both low and high halves of a 16-byte register to get two vectors of 8x 16-bit elements each. That's kind of the only use case where the "unpack" name makes sense, and packuswb is its inverse, combining 2 registers down to 1. (Or packsswb for signed saturation.)

    The "unpack" name is otherwise very strange; it's just a shuffle that interleaves elements from two registers. ARM NEON has a similar shuffle whose mnemonic is "zip".


    In your case, it's part of broadcasting a byte into an XMM register, as part of memset. i.e. it's part of what _mm_set1_epi8(x) does.

    Multiply with 0x0101010101010101 repeats a byte 8 times in a 64-bit integer. This lets you use scalar-integer stores for an odd 8 bytes (not a multiple of 16), like the mov [r11], rdx store.

    Given this 8-byte broadcast as an input (via movq), only one SIMD shuffle is needed. Duplicating the low 8 bytes with punpcklqdq would have been my choice because 8-byte granularity shuffles are more efficient on really old CPUs like Core 2. But interleaving the bytes with each other is equivalent because they're all the same anyway, resulting in an XMM register that holds 16 copies of the same byte.

    In fact, SSE2 can broadcast a dword with one instruction: pshufd xmm0, xmm0, 0, so if not for wanting an 8-byte scalar, it could have just used imul edx, edx, 0x01010101.

    Implementing memset with 8-byte mov and 16-byte movups stores of course needs this as an input, if it's using that strategy instead of the rep stosb strategy.

    With SSSE3 you can broadcast a single byte directly with one pshufb with an all-zero vector (without needing a multiply first) selecting the 0th element of the source for every element of the destination. Or with AVX2 vpbroadcastb. Skipping the integer multiply step would be fine; you can use movq [mem], xmm0 8-byte stores from xmm0 instead of from RDX.

    With a byte at the bottom of an xmm register and garbage in the other elements (i.e. if you didn't use imul), 2x punpcklbw + pshufd can broadcast with just SSE2. Or of course punpcklbw xmm0,xmm0 / punpcklwd xmm0,xmm0 as the first 2 shuffles. Or punpcklbw xmm0,xmm0 / pshuflw xmm0,xmm0, 0 / punpcklqdq xmm0,xmm0.