Search code examples
c++ vector x86 sse simd

Accumulate a vector of integers with SSE


I tried to change this code to handle std::vector<int>.

float accumulate(const std::vector<float>& v)
{
 // an empty vector has no front element to take the address of; its sum is zero
 if (v.empty())
 {
  return 0.0f;
 }

 // cache the element count, the data pointer and the end of the 4-wide part
 const size_t count = v.size();
 const float* const data = &v.front();
 const size_t vecEnd = ROUND_DOWN(count, 4);

 // four partial sums, one per lane
 __m128 acc = _mm_setzero_ps();
 size_t pos = 0;

 // main part: one unaligned 4-float load and one packed add per step
 while (pos < vecEnd)
 {
  acc = _mm_add_ps(acc, _mm_loadu_ps(data + pos));
  pos += 4;
 }

 // leftover elements: add each one into the lowest lane only
 while (pos < count)
 {
  acc = _mm_add_ss(acc, _mm_load_ss(data + pos));
  ++pos;
 }

 // two horizontal adds fold the four lanes into lane 0, which is returned
 const __m128 pairs = _mm_hadd_ps(acc, acc);
 const __m128 total = _mm_hadd_ps(pairs, pairs);
 return _mm_cvtss_f32(total);
}

Ref: http://fastcpp.blogspot.com.au/2011/04/how-to-process-stl-vector-using-sse.html

I changed _mm_setzero_ps to _mm_setzero_si128, _mm_loadu_ps to _mm_loadl_epi64 and _mm_add_ps to _mm_add_epi64.

I get this error:

error: cannot convert ‘const int*’ to ‘const __m128i* {aka const __vector(2) long long int*}’ for argument ‘1’ to ‘__m128i _mm_loadl_epi64(const __m128i*)’
         mmSum = _mm_add_epi64(mmSum, _mm_loadl_epi64(p + i + 0));

I am a novice in this field. Is there a good source for learning these things?


Solution

  • Here is an int version which I just threw together:

    #include <iostream>
    #include <vector>
    
    #include <smmintrin.h>  // SSE4
    
    // Rounds m down to a multiple of n by masking off the low bits of m.
    // The mask ~(n - 1) only does this when n is a power of two.
    #define ROUND_DOWN(m, n) ((m) & ~((n) - 1))
    
    // Returns the sum of all elements of v, computed with SSE2 integer
    // intrinsics.
    //
    // Four 32-bit partial sums are accumulated in parallel, folded into one
    // scalar, and the 0-3 leftover elements are then added one at a time.
    // Only SSE2 intrinsics are used, so no SSSE3/SSE4.1 support is needed.
    // An empty vector yields 0. The total wraps modulo 2^32 on overflow, in
    // the vector part and in the scalar tail alike.
    static int accumulate(const std::vector<int>& v)
    {
        const size_t N = v.size();
        if (N == 0)
        {
            return 0;  // nothing to sum, and no element to take the address of
        }
        const int* p = &v[0];

        // index one past the last complete group of 4 elements
        const size_t vecEnd = N & ~static_cast<size_t>(3);

        __m128i mmSum = _mm_setzero_si128();
        size_t i = 0;

        // unrolled loop that adds up 4 elements at a time; the load is the
        // unaligned variant because vector storage need not be 16-byte aligned
        for (; i < vecEnd; i += 4)
        {
            const __m128i chunk =
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + i));
            mmSum = _mm_add_epi32(mmSum, chunk);
        }

        // fold the four lanes into lane 0: first add the upper half onto the
        // lower half, then add lane 1 onto lane 0
        mmSum = _mm_add_epi32(mmSum, _mm_shuffle_epi32(mmSum, _MM_SHUFFLE(1, 0, 3, 2)));
        mmSum = _mm_add_epi32(mmSum, _mm_shuffle_epi32(mmSum, _MM_SHUFFLE(2, 3, 0, 1)));

        // add the leftover elements in unsigned arithmetic so that overflow
        // wraps here exactly as it does in the packed adds above
        unsigned int total = static_cast<unsigned int>(_mm_cvtsi128_si32(mmSum));
        for (; i < N; ++i)
        {
            total += static_cast<unsigned int>(p[i]);
        }

        return static_cast<int>(total);
    }
    
    int main()
    {
        // build the test input 0, 1, ..., 9
        const int kCount = 10;
        std::vector<int> values;
        int next = 0;
        while (next < kCount)
        {
            values.push_back(next);
            ++next;
        }

        // sum the values and print the result on its own line
        const int total = accumulate(values);
        std::cout << total << std::endl;

        return 0;
    }
    

    Compile and run:

    $ g++ -Wall -msse4 -O3 accumulate.cpp && ./a.out 
    45