c#arrays image-processing intrinsics pixelformat

Speed up pixel format conversion - BGR packed to RGB planar

From a SDK I get images that have the pixel format BGR packed, i.e. BGRBGRBGR. For another application, I need to convert this format to RGB planar RRRGGGBBB. I don't want to use an extra library just for this task, so I have to use my own code to convert between the formats.

I am using C# .NET 4.5 32bit and the data is in byte arrays which have the same size.

Right now I am iterating through the array source and assigning the BGR values to their appropriate places in the target array, but that takes too long (250ms for a 1.3 megapixel image). The processor the code runs at is a Intel Atom E680 and has access to MMX, SSE, SSE2, SSE3, SSSE3.

Unfortunately I don't have knowledge of intrinsics and couldn't convert code for a similar problem like Fast method to copy memory with translation - ARGB to BGR to suit my needs.

The code I am currently using to convert between the pixel formats is:

// the array with the BGRBGRBGR pixel data
byte[] source;
// the array with the RRRGGGBBB pixel data
byte[] result;
// the amount of pixels in one channel, width*height
int imageSize;

for (int i = 0; i < source.Length; i += 3)
{
    result[i/3] = source[i + 2]; // R
    result[i/3 + imageSize] = source[i + 1]; // G
    result[i/3 + imageSize * 2] = source[i]; // B
}

I tried splitting the access to the source array into three loops, one for each channel, but it didn't really help. So I'm open to suggestions.

for (int i = 0; i < source.Length; i += 3)
{
    result[i/3] = source[i + 2]; // R
}

for (int i = 0; i < source.Length; i += 3)
{
    result[i/3 + imageSize] = source[i + 1]; // G
}

for (int i = 0; i < source.Length; i += 3)
{
    result[i/3 + imageSize * 2] = source[i]; // B
}

edit: I got it down to 180ms by removing the division and multiplication like this, but is there a way to make it even faster? It still is very slow which I guess is because the memory reads/writes aren't very optimal.

int targetPosition = 0;
int imageSize2 = imageSize * 2;
for (int i = 0; i < source.Length; i += 3)
{
    result[targetPosition] = source[i + 2]; // R
    targetPosition++;
}

targetPosition = 0;

for (int i = 0; i < source.Length; i += 3)
{
    result[targetPosition + imageSize] = source[i + 1]; // G
    targetPosition++;
}

targetPosition = 0;

for (int i = 0; i < source.Length; i += 3)
{
    result[targetPosition + imageSize2] = source[i]; // B
    targetPosition++;
}

Thanks to MBo's answer, I was able to reduce the time from 180ms to 90ms! Here is the code:

Converter.cpp:

#include "stdafx.h"

BOOL __stdcall DllMain(HINSTANCE hInst, DWORD dwReason, LPVOID lpReserved) {
return  TRUE;
}

const unsigned char Mask[] = { 0, 3, 6, 9, 
                           1, 4, 7, 10, 
                           2, 5, 8, 11, 
                           12, 13, 14, 15};

extern "C" __declspec(dllexport) char* __stdcall ConvertPixelFormat(unsigned char* source, unsigned char *target, int imgSize) {

_asm {
    //interleave r1g1b1 r2g2b2 r3g3b3 r4b4g4 r5b5g5 r6... to planar
    //           r1r2r3r4r5..... g1g2g3g4g5... b1b2b3b4b5...
        push edi
        push esi
        mov eax, source      //A address
        mov edx, target      //B address
        mov ecx, imgSize
        movdqu xmm5, Mask    //load shuffling mask
        mov edi, imgSize     //load interleave step
        mov esi, eax
        add esi, edi
        add esi, edi
        add esi, edi
        shr ecx, 2           //divide count by 4
        dec ecx              //exclude last array chunk
        jle Rest

    Cycle:
        movdqu xmm0, [eax]        //load 16 bytes
        pshufb xmm0, xmm5         //shuffle bytes, we are interested in 12 ones
        movd [edx], xmm0          //store 4 bytes of R
        psrldq xmm0, 4            //shift right register, now G is on the end
        movd [edx + edi], xmm0    //store 4 bytes of G to proper place
        psrldq xmm0, 4            //do the same for B
        movd [edx + 2 * edi], xmm0
        add eax, 12               //shift source index to the next portion
        add edx, 4                //shift destination index
        loop Cycle

    Rest:                       //treat the rest of array
        cmp eax, esi
        jae Finish
        mov ecx, [eax]
        mov [edx], cl           //R
        mov [edx + edi], ch     //G
        shr ecx, 16
        mov [edx + 2 * edi], cl //B
        add eax, 3
        add edx, 1
        jmp Rest

    Finish:
        pop esi
        pop edi
    }
}

C# file:

// Code to define the method
[DllImport("Converter.dll")]
unsafe static extern void ConvertPixelFormat(byte* source, byte* target, int imgSize);

// Code to execute the conversion
unsafe
{
    fixed (byte* sourcePointer = &source[0])
    {
        fixed (byte* resultPointer = &result[0])
        {
            ConvertPixelFormat(sourcePointer, resultPointer, imageSize);
        }
    }
}

Solution

I've implemented this interleaving problem in Delphi and examined built-in asm. I have not intrinsics, so used plain assembler.
pshufb is equal to _mm_shuffle_epi8 (SSSE3 intrinsic)

At every cycle step I load 16 bytes (r1g1b1 r2g2b2 r3g3b3 r4b4g4 r5b5g5 r6) to 128-bit XMM register, shuffle them into (r1r2r3r4 g1g2g3g4 b1b2b3b4 xxxx) order, and save r,g,b chunks to destination memory (ignoring last 4 bytes). Next step loads (r5b5g5 r6g6b6 r7g7b7 ...) and so on.

Note that for code simplification I did not treat the very tail of array properly in the first code version. Since you are able to use this code, I've made needed corrections.

First version issue example:
imgSize = 32
array size = 96 bytes
32/4 = 8 cycles
the last cycle starts from 84th byte and read 16 bytes upto 99th byte - so we run out of array range!
I just added guard bytes here: GetMem(A, Size * 3 + 15);, but for real task it could be unapplicable, so it is worth to have special treatment of the last array chunk.

This code takes 967 ms for pascal variant and 140 ms for asm variant to convert two hundred 1.3MP cadrs on i5-4670 machine (processor itself is 6-8x faster for single thread than Atom 680). Speed is about 0.75 GB/Sec (pas) and 5.4 GB/Sec (asm)

const
  Mask: array[0..15] of Byte = ( 0, 3, 6, 9,
                                 1, 4, 7, 10,
                                 2, 5, 8, 11,
                                 12, 13, 14, 15);
var
  A, B: PByteArray;
  i, N, Size: Integer;
  t1, t2: DWord;
begin
  Size := 1280 * 960 * 200;
  GetMem(A, Size * 3);
  GetMem(B, Size * 3);

  for i := 0 to Size - 1 do begin
    A[3 * i] := 1;
    A[3 * i + 1] := 2;
    A[3 * i + 2] := 3;
  end;

  t1 := GetTickCount;
  for i := 0 to Size - 1 do begin
    B[i] := A[3 * i];
    B[i + Size] := A[3 * i + 1];
    B[i + 2 * Size] := A[3 * i + 2];
  end;
  t2:= GetTickCount;

    //interleave r1g1b1 r2g2b2 r3g3b3 r4b4g4 r5b5g5 r6... to planar
    //r1r2r3r4r5..... g1g2g3g4g5... b1b2b3b4b5...
  asm
    push edi
    push esi
    mov eax, A      //A address
    mov edx, B      //B address
    mov ecx, Size
    movdqu xmm5, Mask   //load shuffling mask
    mov edi, Size       //load interleave step
    mov esi, eax
    add esi, edi
    add esi, edi
    add esi, edi
    shr ecx, 2      //divide count by 4
    dec ecx         //exclude last array chunk
    jle @@Rest

  @@Cycle:
    movdqu xmm0, [eax]   //load 16 bytes
    pshufb xmm0, xmm5    //shuffle bytes, we are interested in 12 ones
    movd [edx], xmm0     //store 4 bytes of R
    psrldq xmm0, 4        //shift right register, now G is on the end
    movd [edx + edi], xmm0   //store 4 bytes of G to proper place
    psrldq xmm0, 4            //do the same for B
    movd [edx + 2 * edi], xmm0
    add eax, 12               //shift source index to the next portion
    add edx, 4                //shift destination index
    loop @@Cycle

   @@Rest:       //treat the rest of array
    cmp eax, esi
    jae @@Finish
    mov ecx, [eax]
    mov [edx], cl   //R
    mov [edx + edi], ch  //G
    shr ecx, 16
    mov [edx + 2 * edi], cl //B
    add eax, 3
    add edx, 1
    jmp @@Rest
  @@Finish:

    pop esi
    pop edi
  end;

  Memo1.Lines.Add(Format('pas %d asm %d', [t2-t1, GetTickCount - t2]));
  FreeMem(A);
  FreeMem(B);