Tags: c, sse, avx2, auto-vectorization

Auto-vectorize shuffle instruction


I'm trying to make the compiler generate the (v)pshufd instruction (or equivalent) via auto-vectorization. It's surprisingly difficult.

For example, presuming a vector of 4 uint32 values, the transformation A|B|C|D => A|A|C|C is supposed to be achievable with a single instruction (corresponding intrinsic: _mm_shuffle_epi32()).
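
For reference, the intrinsic version (the one I would like to avoid) is a one-liner. A minimal sketch, assuming SSE2; the helper name is hypothetical:

    #include <emmintrin.h>  /* SSE2 */

    /* A|B|C|D => A|A|C|C in one pshufd; imm8 = _MM_SHUFFLE(2,2,0,0) = 0xA0 */
    static inline __m128i dup_even_epi32(__m128i v) {
        return _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 2, 0, 0));
    }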

Trying to express the same transformation using only normal operations, I can write for example:

    for (i=0; i<4; i+=2)
        v32x4[i+1] = v32x4[i];

The compiler seems unable to make a good transformation, generating instead a mix of scalar and vector code totalling more than a dozen instructions. Unrolling manually produces an even worse outcome.

Sometimes a little detail gets in the way, preventing the compiler from translating correctly. For example, the number of elements in the array should be a clear power of 2, pointers to tables should be guaranteed not to alias, alignment should be expressed explicitly, etc. In this case, I haven't found any such reason, and I'm still stuck with manual intrinsics to generate reasonable assembly.
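
For illustration, here is how I spell those hints out for gcc/clang (restrict plus an alignment builtin); a sketch only, and it has not been enough in this case:

    #include <stdint.h>

    /* in-place A|B|C|D => A|A|C|C, with aliasing and alignment hints */
    void dup_even4(uint32_t* restrict v32x4) {
        uint32_t* v = (uint32_t*)__builtin_assume_aligned(v32x4, 16);
        for (int i = 0; i < 4; i += 2)
            v[i+1] = v[i];
    }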

Is there a way to generate the (v)pshufd instruction using only normal code, relying on the compiler's auto-vectorizer?


Solution

  • (Update: new answer since 2019-02-07.)

    It is possible to make the compiler generate the (v)pshufd instruction, even without gcc's vector extensions which I used in a previous answer to this question. The following examples give an impression of the possibilities. These examples are compiled with gcc 8.2 and clang 7.


    Example 1

    #include <stdint.h>
    /*                                       vectorizes     */
    /*   gcc -m64 -O3  -march=nehalem        Yes            */
    /*   gcc -m64 -O3  -march=skylake        Yes            */
    /*   clang -m64 -O3  -march=nehalem      No             */
    /*   clang -m64 -O3  -march=skylake      No             */
    void shuff1(int32_t* restrict a, int32_t* restrict b, int32_t n){
        /* this line is optional */  a = (int32_t*)__builtin_assume_aligned(a, 16); b = (int32_t*)__builtin_assume_aligned(b, 16);
        for (int32_t i = 0; i < n; i=i+4) {
            b[i+0] = a[i+0];
            b[i+1] = a[i+0];
            b[i+2] = a[i+2];
            b[i+3] = a[i+2];
        }
    }
    
    
    /*                                       vectorizes     */
    /*   gcc -m64 -O3  -march=nehalem        Yes            */
    /*   gcc -m64 -O3  -march=skylake        Yes            */
    /*   clang -m64 -O3  -march=nehalem      Yes            */
    /*   clang -m64 -O3  -march=skylake      Yes            */
    void shuff2(int32_t* restrict a, int32_t* restrict b, int32_t n){
        /* this line is optional */  a = (int32_t*)__builtin_assume_aligned(a, 16); b = (int32_t*)__builtin_assume_aligned(b, 16);
        for (int32_t i = 0; i < n; i=i+4) {
            b[i+0] = a[i+1];
            b[i+1] = a[i+2];
            b[i+2] = a[i+3];
            b[i+3] = a[i+0];
        }
    }
    

    Surprisingly, clang only vectorizes permutations in the mathematical sense (each output element taken from a distinct input element), not general shuffles that may duplicate elements. With gcc -m64 -O3 -march=nehalem, the main loop of shuff1 becomes:

    .L3:
      add edx, 1
      pshufd xmm0, XMMWORD PTR [rdi+rax], 160
      movaps XMMWORD PTR [rsi+rax], xmm0
      add rax, 16
      cmp edx, ecx
      jb .L3
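
    Note that the immediate 160 = 0xA0 encodes the source indices (2,2,0,0); it is exactly the value that _MM_SHUFFLE(2,2,0,0) produces, so gcc emits the same pshufd the intrinsic version would.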
    


    Example 2

    /*                                       vectorizes     */
    /*   gcc -m64 -O3  -march=nehalem        No             */
    /*   gcc -m64 -O3  -march=skylake        No             */
    /*   clang -m64 -O3  -march=nehalem      No             */
    /*   clang -m64 -O3  -march=skylake      No             */
    void shuff3(int32_t* restrict a, int32_t* restrict b){
        /* this line is optional */ a = (int32_t*)__builtin_assume_aligned(a, 16); b = (int32_t*)__builtin_assume_aligned(b, 16);
        b[0] = a[0];
        b[1] = a[0];
        b[2] = a[2];
        b[3] = a[2];
    }
    
    
    /*                                       vectorizes     */
    /*   gcc -m64 -O3  -march=nehalem        Yes            */
    /*   gcc -m64 -O3  -march=skylake        Yes            */
    /*   clang -m64 -O3  -march=nehalem      Yes            */
    /*   clang -m64 -O3  -march=skylake      Yes            */
    void shuff4(int32_t* restrict a, int32_t* restrict b){
        /* this line is optional */ a = (int32_t*)__builtin_assume_aligned(a, 16); b = (int32_t*)__builtin_assume_aligned(b, 16);
        b[0] = a[1];
        b[1] = a[2];
        b[2] = a[3];
        b[3] = a[0];
    }
    

    The assembly with gcc -m64 -O3 -march=skylake:

    shuff3:
      mov eax, DWORD PTR [rdi]
      mov DWORD PTR [rsi], eax
      mov DWORD PTR [rsi+4], eax
      mov eax, DWORD PTR [rdi+8]
      mov DWORD PTR [rsi+8], eax
      mov DWORD PTR [rsi+12], eax
      ret
    shuff4:
      vpshufd xmm0, XMMWORD PTR [rdi], 57
      vmovaps XMMWORD PTR [rsi], xmm0
      ret
    

    Again the (0,3,2,1) permutation differs essentially from the (2,2,0,0) shuffle case: shuff4 compiles to a single vpshufd (imm8 57 = _MM_SHUFFLE(0,3,2,1)), while shuff3 falls back to scalar moves.


    Example 3

    /*                                       vectorizes     */
    /*   gcc -m64 -O3  -march=nehalem        Yes            */
    /*   gcc -m64 -O3  -march=skylake        Yes            */
    /*   clang -m64 -O3  -march=nehalem      No             */
    /*   clang -m64 -O3  -march=skylake      No             */
    void shuff5(int32_t* restrict a, int32_t* restrict b, int32_t n){
        /* this line is optional */ a = (int32_t*)__builtin_assume_aligned(a, 32); b = (int32_t*)__builtin_assume_aligned(b, 32);
        for (int32_t i = 0; i < n; i=i+8) {
            b[i+0] = a[i+2];
            b[i+1] = a[i+7];
            b[i+2] = a[i+7];
            b[i+3] = a[i+7];
            b[i+4] = a[i+0];
            b[i+5] = a[i+1];
            b[i+6] = a[i+5];
            b[i+7] = a[i+4];
        }
    }
    
    
    /*                                       vectorizes     */
    /*   gcc -m64 -O3  -march=nehalem        Yes            */
    /*   gcc -m64 -O3  -march=skylake        Yes            */
    /*   clang -m64 -O3  -march=nehalem      No             */
    /*   clang -m64 -O3  -march=skylake      No             */
    void shuff6(int32_t* restrict a, int32_t* restrict b, int32_t n){
        /* this line is optional */ a = (int32_t*)__builtin_assume_aligned(a, 32); b = (int32_t*)__builtin_assume_aligned(b, 32);
        for (int32_t i = 0; i < n; i=i+8) {
            b[i+0] = a[i+0];
            b[i+1] = a[i+0];
            b[i+2] = a[i+2];
            b[i+3] = a[i+2];
            b[i+4] = a[i+4];
            b[i+5] = a[i+4];
            b[i+6] = a[i+6];
            b[i+7] = a[i+6];
        }
    }
    

    With gcc -m64 -O3 -march=skylake the main loop of shuff5 contains the lane-crossing vpermd shuffle instruction, which is quite impressive, I think. Function shuff6 leads to the non-lane-crossing vpshufd ymm0, mem instruction, which is perfect.
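
    For comparison, one iteration of each loop can be written with intrinsics (a sketch, assuming AVX2; the helper names are hypothetical):

    #include <immintrin.h>  /* AVX2 */

    /* shuff5 pattern crosses 128-bit lanes: vpermd with an index vector */
    static inline __m256i shuff5_once(__m256i a) {
        const __m256i idx = _mm256_setr_epi32(2, 7, 7, 7, 0, 1, 5, 4);
        return _mm256_permutevar8x32_epi32(a, idx);  /* dst[i] = a[idx[i]] */
    }

    /* shuff6 pattern repeats (2,2,0,0) in both lanes: one vpshufd ymm */
    static inline __m256i shuff6_once(__m256i a) {
        return _mm256_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 0, 0));
    }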


    Example 4

    The assembly of shuff5 becomes quite messy if we replace b[i+5] = a[i+1]; with b[i+5] = 0;. Nevertheless, the loop is still vectorized. See also this Godbolt link for all the examples discussed in this answer.


    If arrays a and b are 16-byte (or 32-byte) aligned, then we can use a = (int32_t*)__builtin_assume_aligned(a, 16); b = (int32_t*)__builtin_assume_aligned(b, 16); (or 32 instead of 16). This sometimes improves the assembly code generation a bit.
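
    For completeness, a minimal sketch of obtaining such alignment with C11 aligned_alloc (the helper name is hypothetical; strictly, the requested size must be a multiple of the alignment):

    #include <stdint.h>
    #include <stdlib.h>

    /* allocate a 32-byte-aligned buffer of n int32_t; n is assumed to be
       a multiple of 8 so the size is a multiple of the alignment */
    static int32_t* alloc_aligned_i32(size_t n) {
        return (int32_t*)aligned_alloc(32, n * sizeof(int32_t));
    }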