I know that the nearest-neighbor method is the fastest way to resize an image. Nevertheless, I am looking for a way to speed it up further. An obvious first step is to precalculate the indices:
// Precomputes nearest-neighbour source offsets for one image axis.
//   sizeS  - extent of the source axis, in pixels
//   sizeD  - extent of the destination axis, in pixels (number of entries written)
//   colors - factor every index is multiplied by (e.g. bytes per pixel to get
//            byte offsets inside a row, or 1 to get plain indices)
//   idx    - output array with room for sizeD entries
// Does nothing when either extent is not positive.
void CalcIndex(int sizeS, int sizeD, int colors, int* idx)
{
    // Guard: avoids dividing by zero and writing through idx for empty axes.
    if (sizeS <= 0 || sizeD <= 0)
        return;
    const float scale = (float)sizeS / sizeD;
    // Signed counter: comparing an unsigned counter against a negative sizeD
    // would turn the bound into a huge value and run far past idx.
    for (int i = 0; i < sizeD; ++i)
    {
        // Sample at the centre of destination pixel i.
        int index = (int)::floor((i + 0.5f) * scale);
        // Clamp to [0, sizeS - 1]; float rounding may land past the last pixel.
        if (index < 0)
            index = 0;
        else if (index > sizeS - 1)
            index = sizeS - 1;
        idx[i] = index * colors;
    }
}
// Copies one pixel made of `colors` consecutive bytes from src to dst.
template<int colors> inline void CopyPixel(const uint8_t* src, uint8_t* dst)
{
    int remaining = colors;
    while (remaining > 0)
    {
        *dst++ = *src++;
        --remaining;
    }
}
template<int colors> void Resize(const uint8_t* src, int srcW, int srcH,
uint8_t* dst, int dstW, int dstH)
{
int idxY[dstH], idxX[dstW];//pre-calculated indices (see CalcIndex).
for (int dy = 0; dy < dstH; dy++)
{
const uint8_t * srcY = src + idxY[dy] * srcW * colors;
for (int dx = 0, offset = 0; dx < dstW; dx++, offset += colors)
CopyPixel<N>(srcY + idxX[dx], dst + offset);
dst += dstW * colors;
}
}
Are there any further optimization steps? For example, using SIMD or some other optimization technique.
P.S. I am especially interested in optimizing the RGB case (Colors = 3).
With the current code, I see that an ARGB image (Colors = 4) is processed about 50% faster than an RGB image, even though it is about 30% larger.
I think that using _mm256_i32gather_epi32 (AVX2) can give some performance gain when resizing images with 32-bit pixels:
// Copies eight 32-bit pixels at once: loads eight offsets from idx, gathers
// the 4-byte value found at src + offset for each of them, and writes the
// resulting 32 bytes to dst.
inline void Gather32bit(const uint8_t * src, const int* idx, uint8_t* dst)
{
    const __m256i offsets = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(idx));
    // Scale factor 1: the offsets are used as byte offsets from src.
    const __m256i pixels = _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), offsets, 1);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), pixels);
}
// Nearest-neighbour resize specialised for 4-byte pixels: eight destination
// pixels per step are fetched with Gather32bit, the rest of each row is
// copied pixel by pixel.
// Rows are addressed back to back (row stride = width * 4 bytes).
//   src        - source pixels, srcW * srcH * 4 bytes
//   srcW, srcH - source size in pixels
//   dst        - destination pixels, dstW * dstH * 4 bytes
//   dstW, dstH - destination size in pixels
// Does nothing when any dimension is not positive.
template<> void Resize<4>(const uint8_t* src, int srcW, int srcH,
    uint8_t* dst, int dstW, int dstH)
{
    if (srcW <= 0 || srcH <= 0 || dstW <= 0 || dstH <= 0)
        return;

    // Both lookup tables live in one heap block (variable-length arrays are
    // not standard C++ and large images could exhaust the stack). The calls
    // between new[] and delete[] only do arithmetic and byte copies.
    int* const idxY = new int[(size_t)dstH + dstW];
    int* const idxX = idxY + dstH;
    CalcIndex(srcH, dstH, 1, idxY); // source row number for every destination row
    CalcIndex(srcW, dstW, 4, idxX); // source byte offset in a row for every destination column

    // dstW rounded DOWN to a multiple of 8. Masking with (8 - 1) instead would
    // yield the remainder, letting the 8-pixel loop run past idxX and dst.
    const int dstW8 = dstW & ~(8 - 1);
    const size_t srcStride = (size_t)srcW * 4;
    for (int dy = 0; dy < dstH; dy++)
    {
        const uint8_t* srcY = src + idxY[dy] * srcStride;
        int dx = 0, offset = 0;
        for (; dx < dstW8; dx += 8, offset += 8 * 4)
            Gather32bit(srcY, idxX + dx, dst + offset);
        // Tail: the last dstW % 8 pixels of the row.
        for (; dx < dstW; dx++, offset += 4)
            CopyPixel<4>(srcY + idxX[dx], dst + offset);
        dst += (size_t)dstW * 4;
    }

    delete[] idxY;
}
P.S. With some modification, this method can also be applied to RGB24:
// Byte selector for _mm256_shuffle_epi8, identical for both 128-bit lanes:
// keeps bytes 0-2, 4-6, 8-10 and 12-14 of the lane (i.e. drops every fourth
// byte) and packs them at the front; the -1 entries zero the last four bytes.
const __m256i K8_SHUFFLE = _mm256_setr_epi8(
    0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1,
    0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1);

// 32-bit element selector for _mm256_permutevar8x32_epi32: moves elements
// 0, 1, 2, 4, 5 and 6 into the first six positions. The two trailing -1
// entries fill the remaining positions (per the intrinsic's documentation only
// the low three bits of a selector are used, so they pick element 7).
const __m256i K32_PERMUTE = _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1);
// Copies eight 24-bit pixels at once: loads eight byte offsets from idx,
// gathers four bytes at src + offset for each of them, drops the fourth byte
// of every group and writes the remaining 24 bytes to dst.
// Note: four bytes are read per pixel, so one byte past every gathered pixel
// must be readable.
inline void Gather24bit(const uint8_t * src, const int* idx, uint8_t* dst)
{
    __m256i _idx = _mm256_loadu_si256((const __m256i*)idx);
    __m256i bgrx = _mm256_i32gather_epi32((const int*)src, _idx, 1);
    // Shuffle packs 12 useful bytes at the front of each 128-bit lane; the
    // permute then joins the two lanes into 24 contiguous bytes.
    __m256i bgr = _mm256_permutevar8x32_epi32(
        _mm256_shuffle_epi8(bgrx, K8_SHUFFLE), K32_PERMUTE);
    // Store exactly 24 bytes (16 + 8). A full 32-byte store would write
    // 8 bytes past the pixels produced here and overrun dst at the end of
    // the image.
    _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(bgr));
    _mm_storel_epi64((__m128i*)(dst + 16), _mm256_extracti128_si256(bgr, 1));
}
// Nearest-neighbour resize specialised for 3-byte pixels: eight destination
// pixels per step are fetched with Gather24bit, the rest of each row is
// copied pixel by pixel.
// Rows are addressed back to back (row stride = width * 3 bytes).
//   src        - source pixels, srcW * srcH * 3 bytes
//   srcW, srcH - source size in pixels
//   dst        - destination pixels, dstW * dstH * 3 bytes
//   dstW, dstH - destination size in pixels
// Does nothing when any dimension is not positive.
// NOTE(review): relies on Gather24bit writing no more than 24 bytes to dst;
// a wider store would run past dst on the last destination row.
template<> void Resize<3>(const uint8_t* src, int srcW, int srcH,
    uint8_t* dst, int dstW, int dstH)
{
    if (srcW <= 0 || srcH <= 0 || dstW <= 0 || dstH <= 0)
        return;

    // Both lookup tables live in one heap block (variable-length arrays are
    // not standard C++ and large images could exhaust the stack). The calls
    // between new[] and delete[] only do arithmetic and byte copies.
    int* const idxY = new int[(size_t)dstH + dstW];
    int* const idxX = idxY + dstH;
    CalcIndex(srcH, dstH, 1, idxY); // source row number for every destination row
    CalcIndex(srcW, dstW, 3, idxX); // source byte offset in a row for every destination column

    // dstW rounded DOWN to a multiple of 8. Masking with (8 - 1) instead would
    // yield the remainder, letting the 8-pixel loop run past idxX and dst.
    const int dstW8 = dstW & ~(8 - 1);

    // Gather24bit loads 4 bytes per 3-byte pixel. On the last source row the
    // extra byte of the final pixel lies beyond the source buffer, so for that
    // row the 8-pixel loop stops before any group that would touch it.
    // idxX is non-decreasing, hence checking a group's last offset is enough.
    int lastRowW8 = dstW8;
    while (lastRowW8 > 0 && idxX[lastRowW8 - 1] + 4 > srcW * 3)
        lastRowW8 -= 8;

    const size_t srcStride = (size_t)srcW * 3;
    for (int dy = 0; dy < dstH; dy++)
    {
        const uint8_t* srcY = src + idxY[dy] * srcStride;
        const int simdW = (idxY[dy] == srcH - 1) ? lastRowW8 : dstW8;
        int dx = 0, offset = 0;
        for (; dx < simdW; dx += 8, offset += 8 * 3)
            Gather24bit(srcY, idxX + dx, dst + offset);
        // Tail: whatever the 8-pixel loop did not cover.
        for (; dx < dstW; dx++, offset += 3)
            CopyPixel<3>(srcY + idxX[dx], dst + offset);
        dst += (size_t)dstW * 3;
    }

    delete[] idxY;
}
Note that if srcW < dstW, then the method of @Aki-Suihkonen is faster.