I've made a directshow filter to change contrast and brightness of my video. I want to speed it up.
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
...
BYTE *pData; // Pointer to the actual image buffer
pMediaSample->GetPointer(&pData);
int numPixels = cxImage * cyImage;
...
prgb = (RGBTRIPLE*) pData;
for (int iPixel=0; iPixel < numPixels; iPixel++ ) {
RGBTRIPLE *ppixel = prgb + iPixel;
ppixel->rgbtGreen = ppixel->rgbtGreen * _contrastPower + _brightnessPower;
ppixel->rgbtBlue = ppixel->rgbtBlue * _contrastPower + _brightnessPower;
ppixel->rgbtRed = ppixel->rgbtRed * _contrastPower + _brightnessPower;
if(ppixel->rgbtGreen>255) ppixel->rgbtGreen = 255;
if(ppixel->rgbtBlue>255) ppixel->rgbtBlue = 255;
if(ppixel->rgbtRed>255) ppixel->rgbtRed = 255;
}
...
}
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
BYTE *pData; // Pointer to the actual image buffer
long lDataLen; // Holds length of any given sample
int iPixel; // Used to loop through the image pixels
RGBTRIPLE *prgb; // Holds a pointer to the current pixel
AM_MEDIA_TYPE* pType = &m_pInput->CurrentMediaType();
VIDEOINFOHEADER *pvi = (VIDEOINFOHEADER *) pType->pbFormat;
ASSERT(pvi);
CheckPointer(pMediaSample,E_POINTER);
pMediaSample->GetPointer(&pData);
lDataLen = pMediaSample->GetSize();
// Get the image properties from the BITMAPINFOHEADER
int cxImage = pvi->bmiHeader.biWidth;
int cyImage = pvi->bmiHeader.biHeight;
int numPixels = cxImage * cyImage;
prgb = (RGBTRIPLE*) pData;
double dcontrast = 0.7;
__m128d cStore = _mm_set1_pd(dcontrast);
BYTE *pDataOutput = new BYTE[lDataLen];
for (iPixel=0; iPixel < numPixels; iPixel += 4 ) {
//unpack to 32 bits
__m128i current = _mm_unpacklo_epi8( _mm_loadu_si128( (__m128i*)( prgb+iPixel ) ), _mm_setzero_si128());
__m128d image = _mm_cvtepi32_pd(_mm_unpacklo_epi16(current, _mm_setzero_si128()));
//vector operations
__m128d result = _mm_mul_pd(cStore, image);
//pack back to 8 bits
__m128i pack_32 = _mm_cvtpd_epi32 (result);
__m128i pack_16 = _mm_packs_epi32 (pack_32, pack_32);
__m128i pack_8 = _mm_packus_epi16(pack_16, pack_16);
//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
//also tryed to store the result in the original array
//_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8); // blacks out the whole video
}
//assign the original pointer to point at the start of the new data array
pData = pDataOutput;
return NOERROR;
}
This code does nothing to the original stream:
//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
....
pData = pDataOutput;
This code blacks out the whole video:
_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8);
Am I using the SSE instructions correctly ?
How do I assign the modified data to the original media sample pointer ?
Maybe this example will be useful to you:
void Filter(const uint8_t * src, size_t width, size_t height, double contrast, double brightness, uint8_t * dst)
{
const int shift = 8;
size_t size = width*height*3;
__m128i _contrast16 = _mm_set1_epi16(int16_t(contrast*(1 << shift)));
__m128i _brightness16 = _mm_set1_epi16(int16_t(brightness*(1 << shift)));
for(size_t i = 0; i < size; i += sizeof(__m128i))
{
__m128i _src8 = _mm_load_si128((__m128i*)(src + i));
__m128i _srcLo16 = _mm_unpacklo_epi8(_src8, _mm_setzero_si128());
__m128i _srcHi16 = _mm_unpackhi_epi8(_src8, _mm_setzero_si128());
__m128i _dstLo16 = _mm_srai_epi16(_mm_add_epi16(_brightness16, _mm_mullo_epi16(_contrast16, _srcLo16)), shift);
__m128i _dstHi16 = _mm_srai_epi16(_mm_add_epi16(_brightness16, _mm_mullo_epi16(_contrast16, _srcHi16)), shift);
_mm_store_si128((__m128i*)(dst + i), _mm_packus_epi16(_dstLo16, _dstHi16));
}
}
If use the individual coefficients for each channel:
inline void Filter(const uint8_t * src, const __m128i & contrastLo, const __m128i & contrastHi,
const __m128i & brightnessLo, const __m128i & brightnessHi, int shift, uint8_t * dst)
{
__m128i _src8 = _mm_load_si128((__m128i*)src);
__m128i _srcLo16 = _mm_unpacklo_epi8(_src8, _mm_setzero_si128());
__m128i _srcHi16 = _mm_unpackhi_epi8(_src8, _mm_setzero_si128());
__m128i _dstLo16 = _mm_srai_epi16(_mm_add_epi16(brightnessLo, _mm_mullo_epi16(contrastLo, _srcLo16)), shift);
__m128i _dstHi16 = _mm_srai_epi16(_mm_add_epi16(brightnessHi, _mm_mullo_epi16(contrastHi, _srcHi16)), shift);
_mm_store_si128((__m128i*)dst, _mm_packus_epi16(_dstLo16, _dstHi16));
}
void Filter(const uint8_t * src, size_t width, size_t height, double contrast[3], double brightness[3], uint8_t * dst)
{
const int shift = 8;
size_t size = width*height*3;
const int16_t
c0 = int16_t(contrast[0]*(1 << shift)),
c1 = int16_t(contrast[1]*(1 << shift)),
c2 = int16_t(contrast[2]*(1 << shift));
const int16_t
b0 = int16_t(brightness[0]*(1 << shift)),
b1 = int16_t(brightness[1]*(1 << shift)),
b2 = int16_t(brightness[2]*(1 << shift));
__m128i _contrast[3], _brightness[3];
_contrast[0] = _mm_setr_epi16(c0, c1, c2, c0, c1, c2, c0, c1);
_contrast[1] = _mm_setr_epi16(c2, c0, c1, c2, c0, c1, c2, c0);
_contrast[2] = _mm_setr_epi16(c1, c2, c0, c1, c2, c0, c1, c2);
_brightness[0] = _mm_setr_epi16(b0, b1, b2, b0, b1, b2, b0, b1);
_brightness[1] = _mm_setr_epi16(b2, b0, b1, b2, b0, b1, b2, b0);
_brightness[2] = _mm_setr_epi16(b1, b2, b0, b1, b2, b0, b1, b2);
for(size_t i = 0; i < size;)
{
Filter(src + i, _contrast[0], _contrast[1], _brightness[0], _brightness[1], shift, dst + i);
i += sizeof(__m128i);
Filter(src + i, _contrast[2], _contrast[0], _brightness[2], _brightness[0], shift, dst + i);
i += sizeof(__m128i);
Filter(src + i, _contrast[1], _contrast[2], _brightness[1], _brightness[2], shift, dst + i);
i += sizeof(__m128i);
}
}