In C#, I'm storing RGB image data in a byte[] array ([r, g, b, r, g, b, ...]) and am attempting to convert it to grayscale. I'm implementing this grayscale conversion both in C# (using pointers) and in C++ (using SIMD instructions and P/Invoke) to compare performance gains when using C++ in C#.
The C# code works correctly and saves the image without issues, but when I use the C++ version, the saved grayscale image appears as random noise. Here is my main C# code:
static void Main(string[] args)
{
DllLoader.LoadLibrary("ImageProcessingLib.dll");
double totalElapsedMicrosecondsCpp = 0;
double totalElapsedMicrosecondsCS = 0;
// Load your image
Bitmap bitmap = new Bitmap("nature.jpeg");
// Convert the image to byte array
byte[] rgbBytes = ConvertBitmapToByteArray(bitmap);
byte[] rgbBytesCpp = ConvertBitmapToByteArray(bitmap);
int runs = 2;
for (int i = 0; i < runs; i++)
{
Stopwatch sw = Stopwatch.StartNew();
// Call the P/Invoke function for C++ implementation
fixed (byte* ptr = rgbBytesCpp)
{
DllLoader.ConvertRgbToGrayscale(ptr, rgbBytesCpp.Length);
}
sw.Stop();
totalElapsedMicrosecondsCpp += sw.Elapsed.TotalMilliseconds * 1000;
}
for (int i = 0; i < runs; i++)
{
Stopwatch sw = Stopwatch.StartNew();
// C# grayscale function
ConvertRgbToGrayscale(rgbBytes);
sw.Stop();
totalElapsedMicrosecondsCS += sw.Elapsed.TotalMilliseconds * 1000;
}
double averageElapsedMicrosecondsPInvoke = totalElapsedMicrosecondsCpp / runs;
double averageElapsedMicrosecondsCSharp = totalElapsedMicrosecondsCS / runs;
Console.WriteLine("Average P/Invoke Grayscale Time: {0} microseconds", averageElapsedMicrosecondsPInvoke);
Console.WriteLine("Average Native C# Grayscale Time: {0} microseconds", averageElapsedMicrosecondsCSharp);
SaveGrayscaleImage(rgbBytesCpp, bitmap.Width, bitmap.Height, "Cpp.jpg");
SaveGrayscaleImage(rgbBytes, bitmap.Width, bitmap.Height, "C#.jpg");
Console.ReadLine();
}
public unsafe class DllLoader
{
// Static constructor to load the DLL without invoking any functions from it
static DllLoader()
{
LoadLibrary("ImageProcessingLib.dll");
}
[DllImport("kernel32.dll", CharSet = CharSet.Auto)]
public static extern IntPtr LoadLibrary(string lpFileName);
// P/Invoke to call the C++ ConvertRgbToGrayscale function
[DllImport("ImageProcessingLib.dll", CallingConvention = CallingConvention.Cdecl)]
public static extern byte* ConvertRgbToGrayscale(byte* pImage, int length);
}
I used both SIMD and non-SIMD approaches in my C++ function, but the SIMD approach causes memory issues. Here’s the SIMD code:
#include <immintrin.h>
#include <cstdint>
extern "C" __declspec(dllexport) void ConvertRgbToGrayscaleSIMD(uint8_t* rgbArray, size_t length) {
// Ensure the array is aligned to 32-byte boundary (for AVX)
//__m256i* alignedArray = reinterpret_cast<__m256i*>(_aligned_malloc(length, 32));
// Copy data to aligned memory
//memcpy(alignedArray, rgbArray, length);
// Grayscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
const uint8_t coeffR = 77; // 0.3 * 256 ≈ 77
const uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
const uint8_t coeffB = 29; // 0.11 * 256 ≈ 29
// Load the grayscale coefficients into AVX registers (broadcast to 8 elements)
__m256i coeff_r = _mm256_set1_epi8(coeffR);
__m256i coeff_g = _mm256_set1_epi8(coeffG);
__m256i coeff_b = _mm256_set1_epi8(coeffB);
size_t i = 0;
// Process 8 pixels (24 bytes) at once
for (; i + 23 < length; i += 24) { // 8 pixels (24 bytes) per loop
// Load 24 bytes (8 pixels, RGBRGBRGB...)
__m256i rgb1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(rgbArray + i));
// Extract the R, G, B channels
__m256i r = _mm256_and_si256(rgb1, _mm256_set1_epi8(0xFF)); // R channel (bytes 0, 3, 6, 9, 12, 15, 18, 21)
__m256i g = _mm256_and_si256(_mm256_srli_epi32(rgb1, 8), _mm256_set1_epi8(0xFF)); // G channel (bytes 1, 4, 7, 10, 13, 16, 19, 22)
__m256i b = _mm256_and_si256(_mm256_srli_epi32(rgb1, 16), _mm256_set1_epi8(0xFF)); // B channel (bytes 2, 5, 8, 11, 14, 17, 20, 23)
// Calculate grayscale
__m256i gray_r = _mm256_mullo_epi16(r, coeff_r); // R * coeffR
__m256i gray_g = _mm256_mullo_epi16(g, coeff_g); // G * coeffG
__m256i gray_b = _mm256_mullo_epi16(b, coeff_b); // B * coeffB
// Add the values (R * coeffR + G * coeffG + B * coeffB)
__m256i gray = _mm256_add_epi8(
_mm256_add_epi8(gray_r, gray_g),
gray_b
);
// Right shift by 8 to normalize the grayscale values
gray = _mm256_srli_epi16(gray, 8);
// Duplicate grayscale values to R, G, B channels
__m256i gray_rgb = _mm256_packus_epi16(gray, gray);
// Store the resulting grayscale values back into the rgbArray
_mm256_storeu_si256(reinterpret_cast<__m256i*>(rgbArray + i), gray_rgb);
}
// Handle any leftover pixels that don't fit into full 8-pixel chunks
for (; i + 2 < length; i += 3) {
uint8_t r = rgbArray[i];
uint8_t g = rgbArray[i + 1];
uint8_t b = rgbArray[i + 2];
uint8_t gray = static_cast<uint8_t>((coeffR * r + coeffG * g + coeffB * b) >> 8);
rgbArray[i] = gray;
rgbArray[i + 1] = gray;
rgbArray[i + 2] = gray;
}
// Handle any leftover pixels that don't fit into full RGB triplets (i.e., length % 3 != 0)
size_t remainder = length % 3;
if (remainder > 0) {
for (size_t j = length - remainder; j < length; ++j) {
rgbArray[j] = rgbArray[j]; // No change
}
}
//memcpy(rgbArray, alignedArray, length);
//_aligned_free(alignedArray);
}
When I uncomment the aligned memory lines (_aligned_malloc and memcpy), the output image is correct, but it significantly slows down performance. I’d like to avoid this memory alignment while still using SIMD for better performance.
I am on .net framework 4.8 and my current performance results:
4k image RGB to grayscale conversion
C#: 18 ms (Working)
C++ P/Invoke Non SIMD : 13 ms (Working)
C++ P/Invoke SIMD : 7 ms (Random Noise Problem)
Question: Is there a way to perform SIMD grayscale conversion on this byte[] without needing aligned memory? Or, is there another efficient way to handle this that avoids the noise issue while maintaining performance?
Your C++ SIMD implementation is completely wrong.
It’s relatively hard to efficiently process RGB24 pixels because all CPU registers have power of 2 size in bytes, i.e. when loading and storing data from memory, a register contains incomplete count of pixels.
For the same reason, no modern graphics libraries and hardware APIs support 3 bytes/pixel formats, instead they zero-pad each RGB pixel into 4 bytes.
Anyway, try the following version, it should hopefully do what you need. It assumes you’re building your C++ codes with VC++, other compilers don’t provide intrinsics for rep movsb
and rep stosb
instructions.
#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>
namespace
{
static const __m128i s_unpackTriplets = _mm_setr_epi8(
0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 );
// Load 24 bytes from memory, zero extending triplets from RGB into RGBA
// The alpha bytes will be zeros
inline __m256i loadRgb8( const uint8_t* rsi )
{
// Load 24 bytes into 2 SSE vectors, 16 and 8 bytes respectively
const __m128i low = _mm_loadu_si128( ( const __m128i* )rsi );
__m128i high = _mm_loadu_si64( rsi + 16 );
// Make the high vector contain exactly 4 triplets = 12 bytes
high = _mm_alignr_epi8( high, low, 12 );
// Combine into AVX2 vector
__m256i res = _mm256_setr_m128i( low, high );
// Hope the compiler inlines this function, and moves the vbroadcasti128 outside of the loop
const __m256i perm = _mm256_broadcastsi128_si256( s_unpackTriplets );
// Unpack RGB24 into RGB32
return _mm256_shuffle_epi8( res, perm );
}
// Greyscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
constexpr uint8_t coeffR = 77; // 0.3 * 256 ≈ 77
constexpr uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
constexpr uint8_t coeffB = 29; // 0.11 * 256 ≈ 29
// Compute vector of int32 lanes with r*coeffR + g*coeffG + b*coeffB
inline __m256i makeGreyscale( __m256i rgba )
{
const __m256i lowBytesMask = _mm256_set1_epi32( 0x00FF00FF );
__m256i rb = _mm256_and_si256( rgba, lowBytesMask );
__m256i g = _mm256_and_si256( _mm256_srli_epi16( rgba, 8 ), lowBytesMask );
// Scale red and blue channels, then add pairwise into int32 lanes
constexpr int mulRbScalar = ( ( (int)coeffB ) << 16 ) | coeffR;
const __m256i mulRb = _mm256_set1_epi32( mulRbScalar );
rb = _mm256_madd_epi16( rb, mulRb );
// Scale green channel
const __m256i mulGreen = _mm256_set1_epi32( coeffG );
g = _mm256_mullo_epi16( g, mulGreen );
// Compute the result in 32-bit lanes
return _mm256_add_epi32( rb, g );
}
static const __m256i s_packTriplets = _mm256_setr_epi8(
// Low half of the vector: e0 e0 e0 e1 e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0
1, 1, 1, 5, 5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1,
// High half of the vector: e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 e0 e0 e0 e1
5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1, 1, 1, 1, 5 );
// Extract second byte from each int32 lane, triplicate these bytes, and store 24 bytes to memory
inline void storeRgb8( uint8_t* rdi, __m256i gs )
{
// Move bytes within 16 byte lanes
gs = _mm256_shuffle_epi8( gs, s_packTriplets );
// Split vector into halves
__m128i low = _mm256_castsi256_si128( gs );
const __m128i high = _mm256_extracti128_si256( gs, 1 );
// Insert high 4 bytes from high into low
low = _mm_blend_epi32( low, high, 0b1000 );
// Store 24 RGB bytes
_mm_storeu_si128( ( __m128i* )rdi, low );
_mm_storeu_si64( rdi + 16, high );
}
inline void computeGreyscale8( uint8_t* ptr )
{
__m256i v = loadRgb8( ptr );
v = makeGreyscale( v );
storeRgb8( ptr, v );
}
}
void ConvertRgbToGrayscaleSIMD( uint8_t* ptr, size_t length )
{
const size_t rem = length % 24;
uint8_t* const endAligned = ptr + ( length - rem );
for( ; ptr < endAligned; ptr += 24 )
computeGreyscale8( ptr );
if( rem != 0 )
{
// An easy way to handle remainder is using a local buffer of 24 bytes, reusing the implementation
// Unlike memcpy / memset which are function calls and are subject to ABI conventions,
// __movsb / __stosb don't destroy data in vector registers
uint8_t remSpan[ 24 ];
__movsb( remSpan, ptr, rem );
__stosb( &remSpan[ rem ], 0, 24 - rem );
computeGreyscale8( remSpan );
__movsb( ptr, remSpan, rem );
}
}