I tried to change this code to handle std::vector<int>:
// Sums all elements of v using SSE, processing four floats per iteration.
//
// Only baseline SSE intrinsics (xmmintrin.h) are used, so no SSE3 support is
// required. Returns 0.0f for an empty vector. The additions are performed in
// four independent lanes, so the result can differ in rounding from a strictly
// sequential left-to-right sum.
float accumulate(const std::vector<float>& v)
{
    // copy the length of v and a pointer to the data onto the local stack;
    // front() must not be called on an empty vector, hence the guard
    const size_t N = v.size();
    const float* p = (N > 0) ? &v.front() : NULL;

    // largest multiple of 4 that is <= N, computed once instead of per iteration
    const size_t vecEnd = N - (N % 4);

    __m128 mmSum = _mm_setzero_ps();
    size_t i = 0;

    // unrolled loop that adds up 4 elements at a time (unaligned loads)
    for (; i < vecEnd; i += 4)
    {
        mmSum = _mm_add_ps(mmSum, _mm_loadu_ps(p + i));
    }

    // add the remaining 0..3 values into lane 0 until all elements are covered
    for (; i < N; i++)
    {
        mmSum = _mm_add_ss(mmSum, _mm_load_ss(p + i));
    }

    // horizontal sum of the four lanes [a0, a1, a2, a3]:
    // swap neighbours so that lane 0 holds a0+a1 and lane 2 holds a2+a3 ...
    const __m128 swapped = _mm_shuffle_ps(mmSum, mmSum, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 pairs = _mm_add_ps(mmSum, swapped);
    // ... then bring lane 2 down to lane 0 and add: (a0+a1) + (a2+a3)
    const __m128 total = _mm_add_ss(pairs, _mm_movehl_ps(pairs, pairs));

    return _mm_cvtss_f32(total);
}
Ref: http://fastcpp.blogspot.com.au/2011/04/how-to-process-stl-vector-using-sse.html
I changed _mm_setzero_ps to _mm_setzero_si128, _mm_loadu_ps to _mm_loadl_epi64 and _mm_add_ps to _mm_add_epi64.
I get this error:
error: cannot convert ‘const int*’ to ‘const __m128i* {aka const __vector(2) long long int*}’ for argument ‘1’ to ‘__m128i _mm_loadl_epi64(const __m128i*)’
mmSum = _mm_add_epi64(mmSum, _mm_loadl_epi64(p + i + 0));
I am a novice in this field. Is there a good resource for learning these things?
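Here is an int version which I just threw together. Two things differ from your attempt: the elements are 32-bit ints, so the addition is _mm_add_epi32 rather than _mm_add_epi64, and the integer load intrinsics take a pointer to __m128i, so the const int* has to be cast explicitly (that missing cast is what your compiler error is about):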
#include <iostream>
#include <vector>
#include <smmintrin.h> // SSE4
// Rounds m down to a multiple of n by masking off the low bits of m.
// The mask ~(n - 1) only produces a multiple of n when n is a power of two.
// Both arguments are fully parenthesised, but each is evaluated exactly once,
// so expressions with side effects behave as they would in a function call.
#define ROUND_DOWN(m, n) ((m) & ~((n) - 1))
// Sums all elements of v using SSE2, processing four ints per iteration.
//
// Only SSE2 intrinsics are used (no SSSE3 hadd, no SSE4.1 extract), so the
// function builds for any x86-64 target without extra -m flags. Returns 0 for
// an empty vector. Overflow is not detected: the 32-bit lane sums wrap around.
static int accumulate(const std::vector<int>& v)
{
    // copy the length of v and a pointer to the data onto the local stack;
    // front() must not be called on an empty vector, hence the guard
    const size_t N = v.size();
    const int* p = (N > 0) ? &v.front() : NULL;

    // largest multiple of 4 that is <= N, computed once instead of per iteration
    const size_t vecEnd = N - (N % 4);

    __m128i mmSum = _mm_setzero_si128();
    size_t i = 0;

    // unrolled loop that adds up 4 elements at a time; the load intrinsic
    // wants a __m128i pointer, and the cast keeps the data const
    for (; i < vecEnd; i += 4)
    {
        mmSum = _mm_add_epi32(
            mmSum, _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + i)));
    }

    // horizontal sum of the four lanes [a0, a1, a2, a3]:
    // add the upper half onto the lower half -> lane 0 = a0+a2, lane 1 = a1+a3
    mmSum = _mm_add_epi32(mmSum, _mm_shuffle_epi32(mmSum, _MM_SHUFFLE(1, 0, 3, 2)));
    // add lane 1 onto lane 0 -> lane 0 = a0+a1+a2+a3
    mmSum = _mm_add_epi32(mmSum, _mm_shuffle_epi32(mmSum, _MM_SHUFFLE(2, 3, 0, 1)));
    int sum = _mm_cvtsi128_si32(mmSum);

    // add up the remaining 0..3 single values until all elements are covered
    for (; i < N; i++)
    {
        sum += p[i];
    }
    return sum;
}
// Demo driver: builds the vector {0, 1, ..., 9}, sums it with accumulate()
// and prints the result on its own line.
int main()
{
    std::vector<int> values;
    int next = 0;
    while (next < 10)
    {
        values.push_back(next);
        ++next;
    }

    const int total = accumulate(values);
    std::cout << total << std::endl;
    return 0;
}
Compile and run:
$ g++ -Wall -msse4 -O3 accumulate.cpp && ./a.out
45