C++ to C# memory alignment issue

In C#, I'm storing RGB image data in a byte[] array ([r, g, b, r, g, b, ...]) and am attempting to convert it to grayscale. I'm implementing this grayscale conversion both in C# (using pointers) and in C++ (using SIMD instructions and P/Invoke) to compare performance gains when using C++ in C#.

The C# code works correctly and saves the image without issues, but when I use the C++ version, the saved grayscale image appears as random noise. Here is my main C# code:

static void Main(string[] args)
{
    DllLoader.LoadLibrary("ImageProcessingLib.dll");
    double totalElapsedMicrosecondsCpp = 0;
    double totalElapsedMicrosecondsCS = 0;

    // Load your image
    Bitmap bitmap = new Bitmap("nature.jpeg");

    // Convert the image to byte array
    byte[] rgbBytes = ConvertBitmapToByteArray(bitmap);
    byte[] rgbBytesCpp = ConvertBitmapToByteArray(bitmap);

    int runs = 2;
    for (int i = 0; i < runs; i++)
    {
        Stopwatch sw = Stopwatch.StartNew();

        // Call the P/Invoke function for C++ implementation
        fixed (byte* ptr = rgbBytesCpp)
        {
            DllLoader.ConvertRgbToGrayscale(ptr, rgbBytesCpp.Length);
        }

        sw.Stop();
        totalElapsedMicrosecondsCpp += sw.Elapsed.TotalMilliseconds * 1000;
    }

    for (int i = 0; i < runs; i++)
    {
        Stopwatch sw = Stopwatch.StartNew();

        // C# grayscale function
        ConvertRgbToGrayscale(rgbBytes);

        sw.Stop();
        totalElapsedMicrosecondsCS += sw.Elapsed.TotalMilliseconds * 1000;
    }

    double averageElapsedMicrosecondsPInvoke = totalElapsedMicrosecondsCpp / runs;
    double averageElapsedMicrosecondsCSharp = totalElapsedMicrosecondsCS / runs;

    Console.WriteLine("Average P/Invoke Grayscale Time: {0} microseconds", averageElapsedMicrosecondsPInvoke);
    Console.WriteLine("Average Native C# Grayscale Time: {0} microseconds", averageElapsedMicrosecondsCSharp);

    SaveGrayscaleImage(rgbBytesCpp, bitmap.Width, bitmap.Height, "Cpp.jpg");
    SaveGrayscaleImage(rgbBytes, bitmap.Width, bitmap.Height, "C#.jpg");

    Console.ReadLine();
}

public unsafe class DllLoader
{
    // Static constructor to load the DLL without invoking any functions from it
    static DllLoader()
    {
        LoadLibrary("ImageProcessingLib.dll");
    }

    [DllImport("kernel32.dll", CharSet = CharSet.Auto)]
    public static extern IntPtr LoadLibrary(string lpFileName);

    // P/Invoke to call the C++ ConvertRgbToGrayscale function
    [DllImport("ImageProcessingLib.dll", CallingConvention = CallingConvention.Cdecl)]
    public static extern byte* ConvertRgbToGrayscale(byte* pImage, int length);

    
}

I used both SIMD and non-SIMD approaches in my C++ function, but the SIMD approach causes memory issues. Here’s the SIMD code:

#include <immintrin.h>
#include <cstdint>

extern "C" __declspec(dllexport) void ConvertRgbToGrayscaleSIMD(uint8_t* rgbArray, size_t length) {
    
    // Ensure the array is aligned to 32-byte boundary (for AVX)
    //__m256i* alignedArray = reinterpret_cast<__m256i*>(_aligned_malloc(length, 32));

    // Copy data to aligned memory
    //memcpy(alignedArray, rgbArray, length);

    
    // Grayscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
    const uint8_t coeffR = 77;  // 0.3 * 256 ≈ 77
    const uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
    const uint8_t coeffB = 29;  // 0.11 * 256 ≈ 29

    // Load the grayscale coefficients into AVX registers (broadcast to 8 elements)
    __m256i coeff_r = _mm256_set1_epi8(coeffR);
    __m256i coeff_g = _mm256_set1_epi8(coeffG);
    __m256i coeff_b = _mm256_set1_epi8(coeffB);

    size_t i = 0;
    // Process 8 pixels (24 bytes) at once
    for (; i + 23 < length; i += 24) {  // 8 pixels (24 bytes) per loop
        // Load 24 bytes (8 pixels, RGBRGBRGB...)
        __m256i rgb1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(rgbArray + i));

        // Extract the R, G, B channels
        __m256i r = _mm256_and_si256(rgb1, _mm256_set1_epi8(0xFF));               // R channel (bytes 0, 3, 6, 9, 12, 15, 18, 21)
        __m256i g = _mm256_and_si256(_mm256_srli_epi32(rgb1, 8), _mm256_set1_epi8(0xFF)); // G channel (bytes 1, 4, 7, 10, 13, 16, 19, 22)
        __m256i b = _mm256_and_si256(_mm256_srli_epi32(rgb1, 16), _mm256_set1_epi8(0xFF)); // B channel (bytes 2, 5, 8, 11, 14, 17, 20, 23)

        // Calculate grayscale
        __m256i gray_r = _mm256_mullo_epi16(r, coeff_r); // R * coeffR
        __m256i gray_g = _mm256_mullo_epi16(g, coeff_g); // G * coeffG
        __m256i gray_b = _mm256_mullo_epi16(b, coeff_b); // B * coeffB

        // Add the values (R * coeffR + G * coeffG + B * coeffB)
        __m256i gray = _mm256_add_epi8(
            _mm256_add_epi8(gray_r, gray_g),
            gray_b
        );

        // Right shift by 8 to normalize the grayscale values
        gray = _mm256_srli_epi16(gray, 8);

        // Duplicate grayscale values to R, G, B channels
        __m256i gray_rgb = _mm256_packus_epi16(gray, gray);

        // Store the resulting grayscale values back into the rgbArray
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(rgbArray + i), gray_rgb);
    }

    // Handle any leftover pixels that don't fit into full 8-pixel chunks
    for (; i + 2 < length; i += 3) {
        uint8_t r = rgbArray[i];
        uint8_t g = rgbArray[i + 1];
        uint8_t b = rgbArray[i + 2];

        uint8_t gray = static_cast<uint8_t>((coeffR * r + coeffG * g + coeffB * b) >> 8);

        rgbArray[i] = gray;
        rgbArray[i + 1] = gray;
        rgbArray[i + 2] = gray;
    }

    // Handle any leftover pixels that don't fit into full RGB triplets (i.e., length % 3 != 0)
    size_t remainder = length % 3;
    if (remainder > 0) {
        for (size_t j = length - remainder; j < length; ++j) {
            rgbArray[j] = rgbArray[j];  // No change
        }
    }

    //memcpy(rgbArray, alignedArray, length);

    //_aligned_free(alignedArray);



}

When I uncomment the aligned memory lines (_aligned_malloc and memcpy), the output image is correct, but it significantly slows down performance. I’d like to avoid this memory alignment while still using SIMD for better performance.

I am on .net framework 4.8 and my current performance results:

4k image RGB to grayscale conversion

C#: 18 ms (Working)

C++ P/Invoke Non SIMD : 13 ms (Working)

C++ P/Invoke SIMD : 7 ms (Random Noise Problem)

Question: Is there a way to perform SIMD grayscale conversion on this byte[] without needing aligned memory? Or, is there another efficient way to handle this that avoids the noise issue while maintaining performance?

Solution

Your C++ SIMD implementation is completely wrong.

It’s relatively hard to efficiently process RGB24 pixels because all CPU registers have power of 2 size in bytes, i.e. when loading and storing data from memory, a register contains incomplete count of pixels.
For the same reason, no modern graphics libraries and hardware APIs support 3 bytes/pixel formats, instead they zero-pad each RGB pixel into 4 bytes.

Anyway, try the following version, it should hopefully do what you need. It assumes you’re building your C++ codes with VC++, other compilers don’t provide intrinsics for rep movsb and rep stosb instructions.

#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>

namespace
{
    static const __m128i s_unpackTriplets = _mm_setr_epi8(
        0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 );

    // Load 24 bytes from memory, zero extending triplets from RGB into RGBA
    // The alpha bytes will be zeros
    inline __m256i loadRgb8( const uint8_t* rsi )
    {
        // Load 24 bytes into 2 SSE vectors, 16 and 8 bytes respectively
        const __m128i low = _mm_loadu_si128( ( const __m128i* )rsi );
        __m128i high = _mm_loadu_si64( rsi + 16 );
        // Make the high vector contain exactly 4 triplets = 12 bytes
        high = _mm_alignr_epi8( high, low, 12 );
        // Combine into AVX2 vector
        __m256i res = _mm256_setr_m128i( low, high );
        // Hope the compiler inlines this function, and moves the vbroadcasti128 outside of the loop
        const __m256i perm = _mm256_broadcastsi128_si256( s_unpackTriplets );
        // Unpack RGB24 into RGB32
        return _mm256_shuffle_epi8( res, perm );
    }

    // Greyscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
    constexpr uint8_t coeffR = 77;  // 0.3 * 256 ≈ 77
    constexpr uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
    constexpr uint8_t coeffB = 29;  // 0.11 * 256 ≈ 29

    // Compute vector of int32 lanes with r*coeffR + g*coeffG + b*coeffB
    inline __m256i makeGreyscale( __m256i rgba )
    {
        const __m256i lowBytesMask = _mm256_set1_epi32( 0x00FF00FF );
        __m256i rb = _mm256_and_si256( rgba, lowBytesMask );
        __m256i g = _mm256_and_si256( _mm256_srli_epi16( rgba, 8 ), lowBytesMask );

        // Scale red and blue channels, then add pairwise into int32 lanes
        constexpr int mulRbScalar = ( ( (int)coeffB ) << 16 ) | coeffR;
        const __m256i mulRb = _mm256_set1_epi32( mulRbScalar );
        rb = _mm256_madd_epi16( rb, mulRb );

        // Scale green channel
        const __m256i mulGreen = _mm256_set1_epi32( coeffG );
        g = _mm256_mullo_epi16( g, mulGreen );

        // Compute the result in 32-bit lanes
        return _mm256_add_epi32( rb, g );
    }

    static const __m256i s_packTriplets = _mm256_setr_epi8(
        // Low half of the vector: e0 e0 e0 e1 e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 
        1, 1, 1, 5, 5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1,
        // High half of the vector: e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 e0 e0 e0 e1 
        5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1, 1, 1, 1, 5 );

    // Extract second byte from each int32 lane, triplicate these bytes, and store 24 bytes to memory
    inline void storeRgb8( uint8_t* rdi, __m256i gs )
    {
        // Move bytes within 16 byte lanes
        gs = _mm256_shuffle_epi8( gs, s_packTriplets );

        // Split vector into halves
        __m128i low = _mm256_castsi256_si128( gs );
        const __m128i high = _mm256_extracti128_si256( gs, 1 );
        // Insert high 4 bytes from high into low
        low = _mm_blend_epi32( low, high, 0b1000 );

        // Store 24 RGB bytes
        _mm_storeu_si128( ( __m128i* )rdi, low );
        _mm_storeu_si64( rdi + 16, high );
    }

    inline void computeGreyscale8( uint8_t* ptr )
    {
        __m256i v = loadRgb8( ptr );
        v = makeGreyscale( v );
        storeRgb8( ptr, v );
    }
}

void ConvertRgbToGrayscaleSIMD( uint8_t* ptr, size_t length )
{
    const size_t rem = length % 24;

    uint8_t* const endAligned = ptr + ( length - rem );
    for( ; ptr < endAligned; ptr += 24 )
        computeGreyscale8( ptr );

    if( rem != 0 )
    {
        // An easy way to handle remainder is using a local buffer of 24 bytes, reusing the implementation
        // Unlike memcpy / memset which are function calls and are subject to ABI conventions,
        // __movsb / __stosb don't destroy data in vector registers
        uint8_t remSpan[ 24 ];
        __movsb( remSpan, ptr, rem );
        __stosb( &remSpan[ rem ], 0, 24 - rem );

        computeGreyscale8( remSpan );

        __movsb( ptr, remSpan, rem );
    }
}