SSE DirectShow filter

Context

I've written a DirectShow filter that changes the contrast and brightness of my video, and I want to speed it up with SSE.

Working filter without SSE

HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
    ...


    BYTE *pData;                // Pointer to the actual image buffer

    pMediaSample->GetPointer(&pData);

    int numPixels  = cxImage * cyImage;
    ...
    prgb = (RGBTRIPLE*) pData;

    for (int iPixel=0; iPixel < numPixels; iPixel++ ) {
       RGBTRIPLE *ppixel = prgb + iPixel;

       ppixel->rgbtGreen = ppixel->rgbtGreen * _contrastPower + _brightnessPower;
       ppixel->rgbtBlue  = ppixel->rgbtBlue  * _contrastPower + _brightnessPower;
       ppixel->rgbtRed   = ppixel->rgbtRed   * _contrastPower + _brightnessPower;  

       if(ppixel->rgbtGreen>255) ppixel->rgbtGreen = 255;
       if(ppixel->rgbtBlue>255)  ppixel->rgbtBlue  = 255;
       if(ppixel->rgbtRed>255)   ppixel->rgbtRed   = 255;
    }
    ...
}

Non-working filter with SSE

// Attempted SSE2 version of the transform: scales pixel data by a fixed
// contrast of 0.7 and writes the result to a freshly allocated buffer.
// The code is kept exactly as posted; NOTE(review) comments mark the spots
// that relate to the symptoms described below the listing.
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
    BYTE *pData;                // Pointer to the actual image buffer
    long lDataLen;              // Holds length of any given sample
    int iPixel;                 // Used to loop through the image pixels        
    RGBTRIPLE *prgb;            // Holds a pointer to the current pixel

    AM_MEDIA_TYPE* pType = &m_pInput->CurrentMediaType();
    VIDEOINFOHEADER *pvi = (VIDEOINFOHEADER *) pType->pbFormat;

    ASSERT(pvi);

    CheckPointer(pMediaSample,E_POINTER);
    // NOTE(review): the result of GetPointer is not checked.
    pMediaSample->GetPointer(&pData);
    lDataLen = pMediaSample->GetSize();

    // Get the image properties from the BITMAPINFOHEADER
    // NOTE(review): biHeight may be negative for top-down bitmaps, which
    // would make numPixels negative - confirm against the negotiated format.

    int cxImage    = pvi->bmiHeader.biWidth;
    int cyImage    = pvi->bmiHeader.biHeight;
    int numPixels  = cxImage * cyImage;

    prgb = (RGBTRIPLE*) pData;

    // Contrast is hard-coded here; no brightness term is applied.
    double dcontrast = 0.7;

    // Two double lanes, both holding the contrast factor.
    __m128d cStore = _mm_set1_pd(dcontrast); 

    // NOTE(review): allocated on every call and never delete[]'d in this
    // function, so each processed sample leaks lDataLen bytes.
    BYTE *pDataOutput = new BYTE[lDataLen];

    // Advances four pixels per iteration.
    for (iPixel=0; iPixel < numPixels; iPixel += 4 ) {

        //unpack to 32 bits
        // Loads 16 bytes starting at pixel iPixel and zero-extends the low
        // 8 bytes to 16-bit lanes.
        // NOTE(review): near the end of the image this 16-byte load may read
        // past the pixel data - confirm against the buffer size.
        __m128i current = _mm_unpacklo_epi8( _mm_loadu_si128( (__m128i*)( prgb+iPixel ) ), _mm_setzero_si128());
        // NOTE(review): _mm_cvtepi32_pd converts only the two low 32-bit
        // lanes, so just two byte values per iteration reach the multiply.
        __m128d  image  = _mm_cvtepi32_pd(_mm_unpacklo_epi16(current, _mm_setzero_si128()));

        //vector operations
        __m128d result = _mm_mul_pd(cStore, image);

         //pack back to 8 bits
        // _mm_cvtpd_epi32 yields two 32-bit results in the low half and
        // zeroes in the high half; the packs below narrow that with saturation.
        __m128i pack_32 = _mm_cvtpd_epi32 (result); 
        __m128i pack_16 = _mm_packs_epi32 (pack_32, pack_32); 
        __m128i pack_8  = _mm_packus_epi16(pack_16, pack_16); 

        //store the new pixel in pDataOutput
        // NOTE(review): pDataOutput is a BYTE*, so "+ iPixel" is a byte
        // offset, while the load above offsets prgb in RGBTRIPLE units.
        // A full 16 bytes (mostly zeros, see above) are written each time.
        _mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);

        //also tried to store the result in the original array
        //_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8); // blacks out the whole video
    }

    //assign the original pointer to point at the start of the new data array       
    // NOTE(review): this only changes the local variable pData; nothing is
    // written to the buffer owned by pMediaSample, so the stream is unchanged.
    pData = pDataOutput;


    return NOERROR;
}

Problems

This code does nothing to the original stream:

//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
....
pData = pDataOutput;

This code blacks out the whole video:

 _mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8);

Questions

Am I using the SSE instructions correctly?

How do I write the modified data back into the original media sample's buffer?

Solution

Maybe this example will be useful to you:

// Converts a floating-point coefficient to fixed point with `shift`
// fractional bits (truncating toward zero) and saturates it to
// [lowest, highest]. The negated comparison also sends NaN to `lowest`.
static inline int UniformToFixed(double value, int shift, int lowest, int highest)
{
    const double scaled = value * (1 << shift);
    if (!(scaled > lowest)) return lowest;
    if (scaled > highest)   return highest;
    return static_cast<int>(scaled);
}

// Scalar reference for one byte: clamp((contrast * value + brightness) >> shift, 0, 255).
static inline uint8_t UniformScalar(uint8_t value, int contrastFixed, int brightnessFixed, int shift)
{
    const int sum = contrastFixed * value + brightnessFixed;
    if (sum < 0) return 0;      // a negative sum clamps to 0 regardless of the shift
    const int result = sum >> shift;
    return static_cast<uint8_t>(result > 255 ? 255 : result);
}

// Computes (src16 * contrast16 + brightness32) >> shift for eight 16-bit lanes.
// The product and the sum are formed in 32 bits so that neither a large
// contrast * pixel product nor a large brightness can wrap around; the result
// is narrowed back to 16 bits with signed saturation.
static inline __m128i UniformScaleBias16(const __m128i & src16, const __m128i & contrast16,
    const __m128i & brightness32, int shift)
{
    const __m128i productLo = _mm_mullo_epi16(src16, contrast16);
    const __m128i productHi = _mm_mulhi_epi16(src16, contrast16);
    // Interleaving the low and high halves rebuilds the full 32-bit products.
    const __m128i first  = _mm_add_epi32(_mm_unpacklo_epi16(productLo, productHi), brightness32);
    const __m128i second = _mm_add_epi32(_mm_unpackhi_epi16(productLo, productHi), brightness32);
    return _mm_packs_epi32(_mm_srai_epi32(first, shift), _mm_srai_epi32(second, shift));
}

// Applies dst[i] = clamp(src[i] * contrast + brightness, 0, 255) to every byte
// of a packed 24-bit image, using 8.8 fixed-point arithmetic.
//
// src, dst   : buffers of width * height * 3 bytes; no alignment is required
//              and dst may be the same buffer as src (in-place operation).
// contrast   : multiplicative factor, saturated to the signed 8.8 range.
// brightness : additive offset in pixel units; may be negative.
void Filter(const uint8_t * src, size_t width, size_t height, double contrast, double brightness,  uint8_t * dst)
{
    const int shift = 8;
    const size_t size = width*height*3;

    // Contrast must fit a signed 16-bit lane; brightness is kept in 32 bits
    // and bounded so that product + brightness cannot overflow an int.
    const int contrastFixed   = UniformToFixed(contrast,   shift, -32768, 32767);
    const int brightnessFixed = UniformToFixed(brightness, shift, -(1 << 30), 1 << 30);

    const __m128i zero         = _mm_setzero_si128();
    const __m128i contrast16   = _mm_set1_epi16(static_cast<int16_t>(contrastFixed));
    const __m128i brightness32 = _mm_set1_epi32(brightnessFixed);

    // Only whole 16-byte vectors go through the SIMD path, so the loop never
    // reads or writes beyond `size` bytes.
    const size_t step = sizeof(__m128i);
    const size_t vectorEnd = size - size % step;

    size_t i = 0;
    for(; i < vectorEnd; i += step)
    {
        const __m128i src8    = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
        const __m128i srcLo16 = _mm_unpacklo_epi8(src8, zero);
        const __m128i srcHi16 = _mm_unpackhi_epi8(src8, zero);
        const __m128i dstLo16 = UniformScaleBias16(srcLo16, contrast16, brightness32, shift);
        const __m128i dstHi16 = UniformScaleBias16(srcHi16, contrast16, brightness32, shift);
        // packus saturates each 16-bit lane to 0..255.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), _mm_packus_epi16(dstLo16, dstHi16));
    }

    // Remaining bytes (fewer than 16) are handled one at a time.
    for(; i < size; ++i)
        dst[i] = UniformScalar(src[i], contrastFixed, brightnessFixed, shift);
}

If you need individual coefficients for each channel:

// Converts a floating-point coefficient to fixed point with `shift`
// fractional bits (truncating toward zero) and saturates it to
// [lowest, highest]. The negated comparison also sends NaN to `lowest`.
static inline int PerChannelToFixed(double value, int shift, int lowest, int highest)
{
    const double scaled = value * (1 << shift);
    if (!(scaled > lowest)) return lowest;
    if (scaled > highest)   return highest;
    return static_cast<int>(scaled);
}

// Scalar reference for one byte: clamp((contrast * value + brightness) >> shift, 0, 255).
static inline uint8_t PerChannelScalar(uint8_t value, int contrastFixed, int brightnessFixed, int shift)
{
    const int sum = contrastFixed * value + brightnessFixed;
    if (sum < 0) return 0;      // a negative sum clamps to 0 regardless of the shift
    const int result = sum >> shift;
    return static_cast<uint8_t>(result > 255 ? 255 : result);
}

// Computes (src16 * contrast16 + brightness) >> shift for eight 16-bit lanes.
// brightnessLo32 / brightnessHi32 hold the 32-bit offsets for lanes 0-3 and
// 4-7. Products and sums are formed in 32 bits so they cannot wrap around;
// the result is narrowed back to 16 bits with signed saturation.
static inline __m128i PerChannelScaleBias16(const __m128i & src16, const __m128i & contrast16,
    const __m128i & brightnessLo32, const __m128i & brightnessHi32, int shift)
{
    const __m128i productLo = _mm_mullo_epi16(src16, contrast16);
    const __m128i productHi = _mm_mulhi_epi16(src16, contrast16);
    // Interleaving the low and high halves rebuilds the full 32-bit products.
    const __m128i first  = _mm_add_epi32(_mm_unpacklo_epi16(productLo, productHi), brightnessLo32);
    const __m128i second = _mm_add_epi32(_mm_unpackhi_epi16(productLo, productHi), brightnessHi32);
    return _mm_packs_epi32(_mm_srai_epi32(first, shift), _mm_srai_epi32(second, shift));
}

// Processes 16 bytes: dst[k] = clamp((contrast[k] * src[k] + brightness[k]) >> shift, 0, 255).
//
// contrastLo / contrastHi     : 16-bit fixed-point factors for bytes 0-7 / 8-15.
// brightnessLo / brightnessHi : 16-bit offsets, already scaled by 2^shift,
//                               for bytes 0-7 / 8-15.
// src and dst need no particular alignment.
inline void Filter(const uint8_t * src, const __m128i & contrastLo, const __m128i & contrastHi, 
    const __m128i & brightnessLo, const __m128i & brightnessHi, int shift, uint8_t * dst)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i srcLo16 = _mm_unpacklo_epi8(src8, zero);
    const __m128i srcHi16 = _mm_unpackhi_epi8(src8, zero);

    // Sign-extend the 16-bit offsets to 32 bits (duplicate each lane, then
    // arithmetic-shift the copy down) so the sum is formed without wrapping.
    const __m128i biasLo0 = _mm_srai_epi32(_mm_unpacklo_epi16(brightnessLo, brightnessLo), 16);
    const __m128i biasLo1 = _mm_srai_epi32(_mm_unpackhi_epi16(brightnessLo, brightnessLo), 16);
    const __m128i biasHi0 = _mm_srai_epi32(_mm_unpacklo_epi16(brightnessHi, brightnessHi), 16);
    const __m128i biasHi1 = _mm_srai_epi32(_mm_unpackhi_epi16(brightnessHi, brightnessHi), 16);

    const __m128i dstLo16 = PerChannelScaleBias16(srcLo16, contrastLo, biasLo0, biasLo1, shift);
    const __m128i dstHi16 = PerChannelScaleBias16(srcHi16, contrastHi, biasHi0, biasHi1, shift);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_packus_epi16(dstLo16, dstHi16));
}


// Applies a separate contrast and brightness to each of the three bytes of
// every pixel of a packed 24-bit image:
//   dst[3*p + k] = clamp(src[3*p + k] * contrast[k] + brightness[k], 0, 255)
// using 8.8 fixed-point arithmetic.
//
// src, dst   : buffers of width * height * 3 bytes; no alignment is required
//              and dst may be the same buffer as src (in-place operation).
// contrast   : per-byte multiplicative factors, saturated to the signed 8.8 range.
// brightness : per-byte additive offsets in pixel units; may be negative.
void Filter(const uint8_t * src, size_t width, size_t height, double contrast[3], double brightness[3],  uint8_t * dst)
{
    const int shift = 8;
    const size_t size = width*height*3;

    // Contrast must fit a signed 16-bit lane; brightness is kept in 32 bits
    // and bounded so that product + brightness cannot overflow an int.
    int c[3], b[3];
    for(int k = 0; k < 3; ++k)
    {
        c[k] = PerChannelToFixed(contrast[k],   shift, -32768, 32767);
        b[k] = PerChannelToFixed(brightness[k], shift, -(1 << 30), 1 << 30);
    }
    const int16_t c0 = static_cast<int16_t>(c[0]);
    const int16_t c1 = static_cast<int16_t>(c[1]);
    const int16_t c2 = static_cast<int16_t>(c[2]);

    // The channel pattern repeats every 3 bytes, so it takes three different
    // rotations to cover consecutive 8-lane (contrast) and 4-lane (brightness)
    // vectors. Vector number v of a 48-byte group uses rotation v % 3.
    __m128i _contrast[3], _brightness[3];
    _contrast[0] = _mm_setr_epi16(c0, c1, c2, c0, c1, c2, c0, c1);
    _contrast[1] = _mm_setr_epi16(c2, c0, c1, c2, c0, c1, c2, c0);
    _contrast[2] = _mm_setr_epi16(c1, c2, c0, c1, c2, c0, c1, c2);
    _brightness[0] = _mm_setr_epi32(b[0], b[1], b[2], b[0]);
    _brightness[1] = _mm_setr_epi32(b[1], b[2], b[0], b[1]);
    _brightness[2] = _mm_setr_epi32(b[2], b[0], b[1], b[2]);

    // 48 bytes = 16 whole pixels, the smallest span after which both the
    // pixel and the vector boundaries line up again. Only whole groups go
    // through the SIMD path, so nothing beyond `size` bytes is touched.
    const __m128i zero = _mm_setzero_si128();
    const size_t group = 3 * sizeof(__m128i);
    const size_t vectorEnd = size - size % group;

    size_t i = 0;
    for(; i < vectorEnd; i += group)
    {
        for(int part = 0; part < 3; ++part)
        {
            const size_t offset = i + part * sizeof(__m128i);
            const __m128i src8    = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + offset));
            const __m128i srcLo16 = _mm_unpacklo_epi8(src8, zero);
            const __m128i srcHi16 = _mm_unpackhi_epi8(src8, zero);
            const __m128i dstLo16 = PerChannelScaleBias16(srcLo16, _contrast[(2 * part) % 3],
                _brightness[(4 * part) % 3], _brightness[(4 * part + 1) % 3], shift);
            const __m128i dstHi16 = PerChannelScaleBias16(srcHi16, _contrast[(2 * part + 1) % 3],
                _brightness[(4 * part + 2) % 3], _brightness[(4 * part + 3) % 3], shift);
            // packus saturates each 16-bit lane to 0..255.
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + offset), _mm_packus_epi16(dstLo16, dstHi16));
        }
    }

    // Remaining bytes (fewer than 48). i is a multiple of 3 when this loop
    // starts, so i % 3 is the byte's position inside its pixel.
    for(; i < size; ++i)
        dst[i] = PerChannelScalar(src[i], c[i % 3], b[i % 3], shift);
}