I have a __m256i
vector containing 16 16-bit elements. I want to apply a horizontal addition over each group of three adjacent elements. In scalar code I use the following:
unsigned short int temp[16];
__m256i sum_v;//has some values. 16 elements of 16-bit vector. | 0 | x15 | x14 | x13 | ... | x3 | x2 | x1 |
_mm256_store_si256((__m256i *)&temp[0], sum_v);
output1 = (temp[0] + temp[1] + temp[2]);
output2 = (temp[3] + temp[4] + temp[5]);
output3 = (temp[6] + temp[7] + temp[8]);
output4 = (temp[9] + temp[10] + temp[11]);
output5 = (temp[12] + temp[13] + temp[14]);
// Dont want the 15th element
Because this part is placed in the bottleneck section of my program, I decided to vectorize it using AVX2. Ideally I could add them like the following pseudo-code:
sum_v //| 0 | x15 | x14 | x13 |...| x10 |...| x7 |...| x4 |...| x1 |
sum_v1 = sum_v >> 1*16 //| 0 | 0 | x15 | x14 |...| x11 |...| x8 |...| x5 |...| x2 |
sum_v2 = sum_v >> 2*16 //| 0 | 0 | 0 | x15 |...| x12 |...| x9 |...| x6 |...| x3 |
result_vec = add_epi16 (sum_v,sum_v1,sum_v2)
//then I should extract the result_vec to outputs
Adding them vertically will provide the answer.
But unfortunately, AVX2 does not have a full 256-bit shift operation, because the 256-bit register is viewed as two 128-bit lanes. I would have to use a permutation for this case, but I could not find an appropriate permute, shuffle, etc. to do it. Is there any suggestion for this implementation that is as fast as possible?
Using gcc
, linux mint
, intrinsics
, skylake
.
You can do this with two adds and only 2 "shuffles": _mm256_bsrli_epi128 shifts in zeros at positions that are not of interest to the answer. For _mm256_permutevar8x32_epi32 we choose a permutation that duplicates the upper 32 bits, but these bits are also not relevant for the answer.
#include <stdio.h>
#include <x86intrin.h>
/* gcc -O3 -Wall -m64 -march=haswell hor_sum3x3.c */
int print_vec_short(__m256i x);
int print_12_9_6_3_0_short(__m256i x);
int main() {
    short x[16];

    /* Fill the test vector with 1..15 and clear the top element, matching
       the layout | 0 | x15 | ... | x1 | from the question. */
    for (int i = 0; i < 15; i++) {
        x[i] = i + 1;
    }
    x[15] = 0;

    __m256i t0 = _mm256_loadu_si256((__m256i*)x);
    /* Shift each 128-bit lane of t0 right by 2 bytes, shifting in zeros.
       Fortunately those zeros land in positions the answer does not use. */
    __m256i t1 = _mm256_bsrli_epi128(t0, 2);
    /* Lane-crossing "shift right by 4 bytes": rotate the eight 32-bit
       elements down by one; the duplicated top element (index 7 twice)
       only affects positions that are irrelevant to the answer. */
    __m256i t2 = _mm256_permutevar8x32_epi32(t0, _mm256_set_epi32(7,7,6,5,4,3,2,1));
    /* Vertical add of the three staggered copies: columns 0,3,6,9,12 now
       hold the three-element horizontal sums. */
    __m256i sum = _mm256_add_epi16(_mm256_add_epi16(t0, t1), t2);

    printf("t0 = ");  print_vec_short(t0);
    printf("t1 = ");  print_vec_short(t1);
    printf("t2 = ");  print_vec_short(t2);
    printf("sum = "); print_vec_short(sum);

    printf("\nvector elements of interest: columns 12, 9, 6, 3, 0:\n");
    printf("t0[12, 9, 6, 3, 0] = ");  print_12_9_6_3_0_short(t0);
    printf("t1[12, 9, 6, 3, 0] = ");  print_12_9_6_3_0_short(t1);
    printf("t2[12, 9, 6, 3, 0] = ");  print_12_9_6_3_0_short(t2);
    printf("sum[12, 9, 6, 3, 0] = "); print_12_9_6_3_0_short(sum);
    return 0;
}
/* Dump all 16 short elements of x, highest index first, with a '|'
   separating each group of four (the 64-bit quarters). */
int print_vec_short(__m256i x){
    short int e[16];

    _mm256_storeu_si256((__m256i *)e, x);
    printf("%4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi \n",
           e[15], e[14], e[13], e[12], e[11], e[10], e[9], e[8],
           e[7],  e[6],  e[5],  e[4],  e[3],  e[2],  e[1], e[0]);
    return 0;
}
/* Print only the five elements that carry the three-wide horizontal sums:
   indices 12, 9, 6, 3 and 0. */
int print_12_9_6_3_0_short(__m256i x){
    short int e[16];

    _mm256_storeu_si256((__m256i *)e, x);
    printf("%4hi %4hi %4hi %4hi %4hi \n", e[12], e[9], e[6], e[3], e[0]);
    return 0;
}
The output is:
$ ./a.out
t0 = 0 15 14 13 | 12 11 10 9 | 8 7 6 5 | 4 3 2 1
t1 = 0 0 15 14 | 13 12 11 10 | 0 8 7 6 | 5 4 3 2
t2 = 0 15 0 15 | 14 13 12 11 | 10 9 8 7 | 6 5 4 3
sum = 0 30 29 42 | 39 36 33 30 | 18 24 21 18 | 15 12 9 6
vector elements of interest: columns 12, 9, 6, 3, 0:
t0[12, 9, 6, 3, 0] = 13 10 7 4 1
t1[12, 9, 6, 3, 0] = 14 11 8 5 2
t2[12, 9, 6, 3, 0] = 15 12 9 6 3
sum[12, 9, 6, 3, 0] = 42 33 24 15 6