Search code examples
c++image-processingsimdsimd-librarysynet

Optimizing nearest-neighbor image resizing with SIMD


I know that the 'Nearest' method of image resizing is the fastest one. Nevertheless, I am looking for a way to speed it up. An obvious first step is to precalculate the indices:

// Precomputes, for every destination position along one axis, the offset of
// the nearest source sample.
//
//   sizeS  - number of source samples along the axis.
//   sizeD  - number of destination samples along the axis.
//   colors - multiplier applied to every index: pass the number of bytes per
//            pixel to get byte offsets inside a row, or 1 to get plain indices.
//   idx    - output array; entries [0, sizeD) are written.
void CalcIndex(int sizeS, int sizeD, int colors, int* idx)
{
    const float scale = static_cast<float>(sizeS) / sizeD;
    const int last = sizeS - 1;
    for (int i = 0; i < sizeD; ++i)
    {
        // Sample at the center of the destination position.
        int index = static_cast<int>(::floor((i + 0.5f) * scale));
        // Clamp to [0, sizeS - 1] so rounding can never step outside the source.
        if (index < 0)
            index = 0;
        if (index > last)
            index = last;
        idx[i] = index * colors;
    }
}

// Copies a single pixel made of `colors` bytes from src to dst, lowest address
// first. Bytes beyond the pixel are left untouched.
template<int colors> inline void CopyPixel(const uint8_t* src, uint8_t* dst)
{
    for (int left = colors; left > 0; --left)
        *dst++ = *src++;
}

// Nearest-neighbor resize of an interleaved 8-bit image with `colors` bytes
// per pixel.
//
//   src, srcW, srcH - source pixels and size; rows are addressed with a stride
//                     of srcW * colors bytes (no padding between rows).
//   dst, dstW, dstH - destination pixels and size; rows are written with a
//                     stride of dstW * colors bytes.
template<int colors> void Resize(const uint8_t* src, int srcW, int srcH, 
    uint8_t* dst, int dstW, int dstH)
{
    // Placeholder for the pre-calculated index tables (see CalcIndex): they
    // must be filled in before the loops below run. As used here,
    //   idxY[dy] is a source row index (it is multiplied by the row stride),
    //   idxX[dx] is a byte offset inside a source row (already scaled by colors).
    int idxY[dstH], idxX[dstW];
    for (int dy = 0; dy < dstH; dy++)
    {
        const uint8_t * srcY = src + idxY[dy] * srcW * colors;
        for (int dx = 0, offset = 0; dx < dstW; dx++, offset += colors)
            CopyPixel<colors>(srcY + idxX[dx], dst + offset);
        dst += dstW * colors;
    }
}

Are there any further optimization steps? For example, using SIMD or some other optimization technique.

P.S. I am especially interested in optimizing the RGB case (Colors = 3). With the current code I see that an ARGB image (Colors = 4) is processed about 50% faster than an RGB one, even though it is about 30% larger.


Solution

  • I think that using _mm256_i32gather_epi32 (AVX2) can give some performance gain when resizing images with 32-bit pixels:

    inline void Gather32bit(const uint8_t * src, const int* idx, uint8_t* dst)
    {
        __m256i _idx = _mm256_loadu_si256((__m256i*)idx);
        __m256i val = _mm256_i32gather_epi32((int*)src, _idx, 1);
        _mm256_storeu_si256((__m256i*)dst, val);
    }
    
    // AVX2 specialization of the nearest-neighbor Resize for 4-byte pixels.
    // Destination pixels are produced eight at a time with Gather32bit; the
    // remaining (dstW % 8) pixels of each row are copied one by one.
    // Rows are addressed with strides of srcW * 4 and dstW * 4 bytes.
    template<> void Resize<4>(const uint8_t* src, int srcW, int srcH, 
        uint8_t* dst, int dstW, int dstH)
    {
        // Placeholder for the pre-calculated index tables: they must be filled
        // in before the loops below run. idxY[dy] is a source row index and
        // idxX[dx] is a byte offset inside a source row.
        int idxY[dstH], idxX[dstW];
        // Largest multiple of 8 that does not exceed dstW: the part of the row
        // that can be handled in full groups of eight pixels.
        const int dstW8 = dstW & ~(8 - 1);
        for (int dy = 0; dy < dstH; dy++)
        {
            const uint8_t * srcY = src + idxY[dy] * srcW * 4;
            int dx = 0, offset = 0;
            for (; dx < dstW8; dx += 8, offset += 8*4)
                Gather32bit(srcY, idxX + dx, dst + offset);
            // Scalar tail for the pixels that do not fill a group of eight.
            for (; dx < dstW; dx++, offset += 4)
                CopyPixel<4>(srcY + idxX[dx], dst + offset);
            dst += dstW * 4;
        }
    }
    

    P.S. With some modifications this method can also be applied to RGB24:

    const __m256i K8_SHUFFLE = _mm256_setr_epi8(
        0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1,
        0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1);
    const __m256i K32_PERMUTE = _mm256_setr_epi32(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, -1, -1);
    
    
    inline void Gather24bit(const uint8_t * src, const int* idx, uint8_t* dst)
    {
        __m256i _idx = _mm256_loadu_si256((__m256i*)idx);
        __m256i bgrx = _mm256_i32gather_epi32((int*)src, _idx, 1);
        __m256i bgr = _mm256_permutevar8x32_epi32(
            _mm256_shuffle_epi8(bgrx, K8_SHUFFLE), K32_PERMUTE);
        _mm256_storeu_si256((__m256i*)dst, bgr);
    }
    
    // AVX2 specialization of the nearest-neighbor Resize for 3-byte pixels.
    // Destination pixels are produced eight at a time with Gather24bit where
    // that is safe; everything else is copied pixel by pixel.
    // Rows are addressed with strides of srcW * 3 and dstW * 3 bytes.
    template<> void Resize<3>(const uint8_t* src, int srcW, int srcH, 
        uint8_t* dst, int dstW, int dstH)
    {
        // Placeholder for the pre-calculated index tables: they must be filled
        // in before the loops below run. idxY[dy] is a source row index and
        // idxX[dx] is a byte offset inside a source row.
        int idxY[dstH], idxX[dstW];
        // Largest multiple of 8 that does not exceed dstW: the part of the row
        // that can be handled in full groups of eight pixels.
        const int dstW8 = dstW & ~(8 - 1);
        for (int dy = 0; dy < dstH; dy++)
        {
            const uint8_t * srcY = src + idxY[dy] * srcW * 3;
            // The gather loads 4 bytes for every 3-byte pixel. Inside the image
            // the extra byte belongs to the next pixel or the next row, but for
            // the last pixel of the last source row it lies past the end of the
            // buffer, so that row is copied with the scalar loop only.
            const int simdEnd = (idxY[dy] >= srcH - 1) ? 0 : dstW8;
            int dx = 0, offset = 0;
            for (; dx < simdEnd; dx += 8, offset += 8*3)
                Gather24bit(srcY, idxX + dx, dst + offset);
            // Scalar path: row tail, or the whole row when SIMD was skipped.
            for (; dx < dstW; dx++, offset += 3)
                CopyPixel<3>(srcY + idxX[dx], dst + offset);
            dst += dstW * 3;
        }
    }
    

    Note that if srcW < dstW, then the method of @Aki-Suihkonen is faster.