From a SDK I get images that have the pixel format BGR packed, i.e. BGRBGRBGR
. For another application, I need to convert this format to RGB planar RRRGGGBBB
. I don't want to use an extra library just for this task, so I have to use my own code to convert between the formats.
I am using C# .NET 4.5 32bit and the data is in byte arrays which have the same size.
Right now I am iterating through the array source and assigning the BGR values to their appropriate places in the target array, but that takes too long (250ms for a 1.3 megapixel image). The processor the code runs at is a Intel Atom E680 and has access to MMX, SSE, SSE2, SSE3, SSSE3.
Unfortunately I don't have knowledge of intrinsics and couldn't convert code for a similar problem like Fast method to copy memory with translation - ARGB to BGR to suit my needs.
The code I am currently using to convert between the pixel formats is:
// the array with the BGRBGRBGR pixel data
byte[] source;
// the array with the RRRGGGBBB pixel data
byte[] result;
// the amount of pixels in one channel, width*height
int imageSize;
for (int i = 0; i < source.Length; i += 3)
{
result[i/3] = source[i + 2]; // R
result[i/3 + imageSize] = source[i + 1]; // G
result[i/3 + imageSize * 2] = source[i]; // B
}
I tried splitting the access to the source array into three loops, one for each channel, but it didn't really help. So I'm open to suggestions.
for (int i = 0; i < source.Length; i += 3)
{
result[i/3] = source[i + 2]; // R
}
for (int i = 0; i < source.Length; i += 3)
{
result[i/3 + imageSize] = source[i + 1]; // G
}
for (int i = 0; i < source.Length; i += 3)
{
result[i/3 + imageSize * 2] = source[i]; // B
}
edit: I got it down to 180ms by removing the division and multiplication like this, but is there a way to make it even faster? It still is very slow which I guess is because the memory reads/writes aren't very optimal.
int targetPosition = 0;
int imageSize2 = imageSize * 2;
for (int i = 0; i < source.Length; i += 3)
{
result[targetPosition] = source[i + 2]; // R
targetPosition++;
}
targetPosition = 0;
for (int i = 0; i < source.Length; i += 3)
{
result[targetPosition + imageSize] = source[i + 1]; // G
targetPosition++;
}
targetPosition = 0;
for (int i = 0; i < source.Length; i += 3)
{
result[targetPosition + imageSize2] = source[i]; // B
targetPosition++;
}
Thanks to MBo's answer, I was able to reduce the time from 180ms to 90ms! Here is the code:
Converter.cpp:
#include "stdafx.h"
BOOL __stdcall DllMain(HINSTANCE hInst, DWORD dwReason, LPVOID lpReserved) {
return TRUE;
}
const unsigned char Mask[] = { 0, 3, 6, 9,
1, 4, 7, 10,
2, 5, 8, 11,
12, 13, 14, 15};
extern "C" __declspec(dllexport) char* __stdcall ConvertPixelFormat(unsigned char* source, unsigned char *target, int imgSize) {
_asm {
//interleave r1g1b1 r2g2b2 r3g3b3 r4b4g4 r5b5g5 r6... to planar
// r1r2r3r4r5..... g1g2g3g4g5... b1b2b3b4b5...
push edi
push esi
mov eax, source //A address
mov edx, target //B address
mov ecx, imgSize
movdqu xmm5, Mask //load shuffling mask
mov edi, imgSize //load interleave step
mov esi, eax
add esi, edi
add esi, edi
add esi, edi
shr ecx, 2 //divide count by 4
dec ecx //exclude last array chunk
jle Rest
Cycle:
movdqu xmm0, [eax] //load 16 bytes
pshufb xmm0, xmm5 //shuffle bytes, we are interested in 12 ones
movd [edx], xmm0 //store 4 bytes of R
psrldq xmm0, 4 //shift right register, now G is on the end
movd [edx + edi], xmm0 //store 4 bytes of G to proper place
psrldq xmm0, 4 //do the same for B
movd [edx + 2 * edi], xmm0
add eax, 12 //shift source index to the next portion
add edx, 4 //shift destination index
loop Cycle
Rest: //treat the rest of array
cmp eax, esi
jae Finish
mov ecx, [eax]
mov [edx], cl //R
mov [edx + edi], ch //G
shr ecx, 16
mov [edx + 2 * edi], cl //B
add eax, 3
add edx, 1
jmp Rest
Finish:
pop esi
pop edi
}
}
C# file:
// Code to define the method
[DllImport("Converter.dll")]
unsafe static extern void ConvertPixelFormat(byte* source, byte* target, int imgSize);
// Code to execute the conversion
unsafe
{
fixed (byte* sourcePointer = &source[0])
{
fixed (byte* resultPointer = &result[0])
{
ConvertPixelFormat(sourcePointer, resultPointer, imageSize);
}
}
}
I've implemented this interleaving problem in Delphi and examined built-in asm. I have not intrinsics, so used plain assembler.
pshufb
is equal to _mm_shuffle_epi8
(SSSE3 intrinsic)
At every cycle step I load 16 bytes (r1g1b1 r2g2b2 r3g3b3 r4b4g4 r5b5g5 r6)
to 128-bit XMM register, shuffle them into (r1r2r3r4 g1g2g3g4 b1b2b3b4 xxxx)
order, and save r,g,b chunks to destination memory (ignoring last 4 bytes). Next step loads (r5b5g5 r6g6b6 r7g7b7 ...)
and so on.
Note that for code simplification I did not treat the very tail of array properly in the first code version. Since you are able to use this code, I've made needed corrections.
First version issue example:
imgSize = 32
array size = 96 bytes
32/4 = 8 cycles
the last cycle starts from 84th byte and read 16 bytes upto 99th byte - so we run out of array range!
I just added guard bytes here: GetMem(A, Size * 3 + 15);
, but for real task it could be unapplicable, so it is worth to have special treatment of the last array chunk.
This code takes 967 ms for pascal variant and 140 ms for asm variant to convert two hundred 1.3MP cadrs on i5-4670 machine (processor itself is 6-8x faster for single thread than Atom 680). Speed is about 0.75 GB/Sec (pas) and 5.4 GB/Sec (asm)
const
Mask: array[0..15] of Byte = ( 0, 3, 6, 9,
1, 4, 7, 10,
2, 5, 8, 11,
12, 13, 14, 15);
var
A, B: PByteArray;
i, N, Size: Integer;
t1, t2: DWord;
begin
Size := 1280 * 960 * 200;
GetMem(A, Size * 3);
GetMem(B, Size * 3);
for i := 0 to Size - 1 do begin
A[3 * i] := 1;
A[3 * i + 1] := 2;
A[3 * i + 2] := 3;
end;
t1 := GetTickCount;
for i := 0 to Size - 1 do begin
B[i] := A[3 * i];
B[i + Size] := A[3 * i + 1];
B[i + 2 * Size] := A[3 * i + 2];
end;
t2:= GetTickCount;
//interleave r1g1b1 r2g2b2 r3g3b3 r4b4g4 r5b5g5 r6... to planar
//r1r2r3r4r5..... g1g2g3g4g5... b1b2b3b4b5...
asm
push edi
push esi
mov eax, A //A address
mov edx, B //B address
mov ecx, Size
movdqu xmm5, Mask //load shuffling mask
mov edi, Size //load interleave step
mov esi, eax
add esi, edi
add esi, edi
add esi, edi
shr ecx, 2 //divide count by 4
dec ecx //exclude last array chunk
jle @@Rest
@@Cycle:
movdqu xmm0, [eax] //load 16 bytes
pshufb xmm0, xmm5 //shuffle bytes, we are interested in 12 ones
movd [edx], xmm0 //store 4 bytes of R
psrldq xmm0, 4 //shift right register, now G is on the end
movd [edx + edi], xmm0 //store 4 bytes of G to proper place
psrldq xmm0, 4 //do the same for B
movd [edx + 2 * edi], xmm0
add eax, 12 //shift source index to the next portion
add edx, 4 //shift destination index
loop @@Cycle
@@Rest: //treat the rest of array
cmp eax, esi
jae @@Finish
mov ecx, [eax]
mov [edx], cl //R
mov [edx + edi], ch //G
shr ecx, 16
mov [edx + 2 * edi], cl //B
add eax, 3
add edx, 1
jmp @@Rest
@@Finish:
pop esi
pop edi
end;
Memo1.Lines.Add(Format('pas %d asm %d', [t2-t1, GetTickCount - t2]));
FreeMem(A);
FreeMem(B);