Search code examples
c#c++memory-managementsimdavx2

C++ to C# memory alignment issue


In C#, I'm storing RGB image data in a byte[] array ([r, g, b, r, g, b, ...]) and am attempting to convert it to grayscale. I'm implementing this grayscale conversion both in C# (using pointers) and in C++ (using SIMD instructions and P/Invoke) to compare performance gains when using C++ in C#.

The C# code works correctly and saves the image without issues, but when I use the C++ version, the saved grayscale image appears as random noise. Here is my main C# code:

static void Main(string[] args)
{
    DllLoader.LoadLibrary("ImageProcessingLib.dll");
    double totalElapsedMicrosecondsCpp = 0;
    double totalElapsedMicrosecondsCS = 0;

    // Load your image
    Bitmap bitmap = new Bitmap("nature.jpeg");

    // Convert the image to byte array
    byte[] rgbBytes = ConvertBitmapToByteArray(bitmap);
    byte[] rgbBytesCpp = ConvertBitmapToByteArray(bitmap);

    int runs = 2;
    for (int i = 0; i < runs; i++)
    {
        Stopwatch sw = Stopwatch.StartNew();

        // Call the P/Invoke function for C++ implementation
        fixed (byte* ptr = rgbBytesCpp)
        {
            DllLoader.ConvertRgbToGrayscale(ptr, rgbBytesCpp.Length);
        }

        sw.Stop();
        totalElapsedMicrosecondsCpp += sw.Elapsed.TotalMilliseconds * 1000;
    }

    for (int i = 0; i < runs; i++)
    {
        Stopwatch sw = Stopwatch.StartNew();

        // C# grayscale function
        ConvertRgbToGrayscale(rgbBytes);

        sw.Stop();
        totalElapsedMicrosecondsCS += sw.Elapsed.TotalMilliseconds * 1000;
    }

    double averageElapsedMicrosecondsPInvoke = totalElapsedMicrosecondsCpp / runs;
    double averageElapsedMicrosecondsCSharp = totalElapsedMicrosecondsCS / runs;

    Console.WriteLine("Average P/Invoke Grayscale Time: {0} microseconds", averageElapsedMicrosecondsPInvoke);
    Console.WriteLine("Average Native C# Grayscale Time: {0} microseconds", averageElapsedMicrosecondsCSharp);

    SaveGrayscaleImage(rgbBytesCpp, bitmap.Width, bitmap.Height, "Cpp.jpg");
    SaveGrayscaleImage(rgbBytes, bitmap.Width, bitmap.Height, "C#.jpg");

    Console.ReadLine();
}

public unsafe class DllLoader
{
    // Static constructor to load the DLL without invoking any functions from it
    static DllLoader()
    {
        LoadLibrary("ImageProcessingLib.dll");
    }

    [DllImport("kernel32.dll", CharSet = CharSet.Auto)]
    public static extern IntPtr LoadLibrary(string lpFileName);

    // P/Invoke to call the C++ ConvertRgbToGrayscale function
    [DllImport("ImageProcessingLib.dll", CallingConvention = CallingConvention.Cdecl)]
    public static extern byte* ConvertRgbToGrayscale(byte* pImage, int length);

    
}

I used both SIMD and non-SIMD approaches in my C++ function, but the SIMD approach causes memory issues. Here’s the SIMD code:

#include <immintrin.h>
#include <cstdint>

extern "C" __declspec(dllexport) void ConvertRgbToGrayscaleSIMD(uint8_t* rgbArray, size_t length) {
    
    // Ensure the array is aligned to 32-byte boundary (for AVX)
    //__m256i* alignedArray = reinterpret_cast<__m256i*>(_aligned_malloc(length, 32));

    // Copy data to aligned memory
    //memcpy(alignedArray, rgbArray, length);

    
    // Grayscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
    const uint8_t coeffR = 77;  // 0.3 * 256 ≈ 77
    const uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
    const uint8_t coeffB = 29;  // 0.11 * 256 ≈ 29

    // Load the grayscale coefficients into AVX registers (broadcast to 8 elements)
    __m256i coeff_r = _mm256_set1_epi8(coeffR);
    __m256i coeff_g = _mm256_set1_epi8(coeffG);
    __m256i coeff_b = _mm256_set1_epi8(coeffB);

    size_t i = 0;
    // Process 8 pixels (24 bytes) at once
    for (; i + 23 < length; i += 24) {  // 8 pixels (24 bytes) per loop
        // Load 24 bytes (8 pixels, RGBRGBRGB...)
        __m256i rgb1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(rgbArray + i));

        // Extract the R, G, B channels
        __m256i r = _mm256_and_si256(rgb1, _mm256_set1_epi8(0xFF));               // R channel (bytes 0, 3, 6, 9, 12, 15, 18, 21)
        __m256i g = _mm256_and_si256(_mm256_srli_epi32(rgb1, 8), _mm256_set1_epi8(0xFF)); // G channel (bytes 1, 4, 7, 10, 13, 16, 19, 22)
        __m256i b = _mm256_and_si256(_mm256_srli_epi32(rgb1, 16), _mm256_set1_epi8(0xFF)); // B channel (bytes 2, 5, 8, 11, 14, 17, 20, 23)

        // Calculate grayscale
        __m256i gray_r = _mm256_mullo_epi16(r, coeff_r); // R * coeffR
        __m256i gray_g = _mm256_mullo_epi16(g, coeff_g); // G * coeffG
        __m256i gray_b = _mm256_mullo_epi16(b, coeff_b); // B * coeffB

        // Add the values (R * coeffR + G * coeffG + B * coeffB)
        __m256i gray = _mm256_add_epi8(
            _mm256_add_epi8(gray_r, gray_g),
            gray_b
        );

        // Right shift by 8 to normalize the grayscale values
        gray = _mm256_srli_epi16(gray, 8);

        // Duplicate grayscale values to R, G, B channels
        __m256i gray_rgb = _mm256_packus_epi16(gray, gray);

        // Store the resulting grayscale values back into the rgbArray
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(rgbArray + i), gray_rgb);
    }

    // Handle any leftover pixels that don't fit into full 8-pixel chunks
    for (; i + 2 < length; i += 3) {
        uint8_t r = rgbArray[i];
        uint8_t g = rgbArray[i + 1];
        uint8_t b = rgbArray[i + 2];

        uint8_t gray = static_cast<uint8_t>((coeffR * r + coeffG * g + coeffB * b) >> 8);

        rgbArray[i] = gray;
        rgbArray[i + 1] = gray;
        rgbArray[i + 2] = gray;
    }

    // Handle any leftover pixels that don't fit into full RGB triplets (i.e., length % 3 != 0)
    size_t remainder = length % 3;
    if (remainder > 0) {
        for (size_t j = length - remainder; j < length; ++j) {
            rgbArray[j] = rgbArray[j];  // No change
        }
    }

    //memcpy(rgbArray, alignedArray, length);

    //_aligned_free(alignedArray);



}

When I uncomment the aligned memory lines (_aligned_malloc and memcpy), the output image is correct, but it significantly slows down performance. I’d like to avoid this memory alignment while still using SIMD for better performance.

I am on .net framework 4.8 and my current performance results:

4k image RGB to grayscale conversion

C#: 18 ms (Working)

C++ P/Invoke Non SIMD : 13 ms (Working)

C++ P/Invoke SIMD : 7 ms (Random Noise Problem)

Question: Is there a way to perform SIMD grayscale conversion on this byte[] without needing aligned memory? Or, is there another efficient way to handle this that avoids the noise issue while maintaining performance?


Solution

  • Your C++ SIMD implementation is completely wrong.

    It’s relatively hard to efficiently process RGB24 pixels because all CPU registers have power of 2 size in bytes, i.e. when loading and storing data from memory, a register contains incomplete count of pixels.
    For the same reason, no modern graphics libraries and hardware APIs support 3 bytes/pixel formats, instead they zero-pad each RGB pixel into 4 bytes.

    Anyway, try the following version, it should hopefully do what you need. It assumes you’re building your C++ codes with VC++, other compilers don’t provide intrinsics for rep movsb and rep stosb instructions.

    #include <stdint.h>
    #include <immintrin.h>
    #include <intrin.h>
    
    namespace
    {
        static const __m128i s_unpackTriplets = _mm_setr_epi8(
            0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 );
    
        // Load 24 bytes from memory, zero extending triplets from RGB into RGBA
        // The alpha bytes will be zeros
        inline __m256i loadRgb8( const uint8_t* rsi )
        {
            // Load 24 bytes into 2 SSE vectors, 16 and 8 bytes respectively
            const __m128i low = _mm_loadu_si128( ( const __m128i* )rsi );
            __m128i high = _mm_loadu_si64( rsi + 16 );
            // Make the high vector contain exactly 4 triplets = 12 bytes
            high = _mm_alignr_epi8( high, low, 12 );
            // Combine into AVX2 vector
            __m256i res = _mm256_setr_m128i( low, high );
            // Hope the compiler inlines this function, and moves the vbroadcasti128 outside of the loop
            const __m256i perm = _mm256_broadcastsi128_si256( s_unpackTriplets );
            // Unpack RGB24 into RGB32
            return _mm256_shuffle_epi8( res, perm );
        }
    
        // Greyscale coefficients approximated to integers: R = 0.3, G = 0.59, B = 0.11
        constexpr uint8_t coeffR = 77;  // 0.3 * 256 ≈ 77
        constexpr uint8_t coeffG = 150; // 0.59 * 256 ≈ 150
        constexpr uint8_t coeffB = 29;  // 0.11 * 256 ≈ 29
    
        // Compute vector of int32 lanes with r*coeffR + g*coeffG + b*coeffB
        inline __m256i makeGreyscale( __m256i rgba )
        {
            const __m256i lowBytesMask = _mm256_set1_epi32( 0x00FF00FF );
            __m256i rb = _mm256_and_si256( rgba, lowBytesMask );
            __m256i g = _mm256_and_si256( _mm256_srli_epi16( rgba, 8 ), lowBytesMask );
    
            // Scale red and blue channels, then add pairwise into int32 lanes
            constexpr int mulRbScalar = ( ( (int)coeffB ) << 16 ) | coeffR;
            const __m256i mulRb = _mm256_set1_epi32( mulRbScalar );
            rb = _mm256_madd_epi16( rb, mulRb );
    
            // Scale green channel
            const __m256i mulGreen = _mm256_set1_epi32( coeffG );
            g = _mm256_mullo_epi16( g, mulGreen );
    
            // Compute the result in 32-bit lanes
            return _mm256_add_epi32( rb, g );
        }
    
        static const __m256i s_packTriplets = _mm256_setr_epi8(
            // Low half of the vector: e0 e0 e0 e1 e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 
            1, 1, 1, 5, 5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1,
            // High half of the vector: e1 e1 e2 e2 e2 e3 e3 e3 0 0 0 0 e0 e0 e0 e1 
            5, 5, 9, 9, 9, 13, 13, 13, -1, -1, -1, -1, 1, 1, 1, 5 );
    
        // Extract second byte from each int32 lane, triplicate these bytes, and store 24 bytes to memory
        inline void storeRgb8( uint8_t* rdi, __m256i gs )
        {
            // Move bytes within 16 byte lanes
            gs = _mm256_shuffle_epi8( gs, s_packTriplets );
    
            // Split vector into halves
            __m128i low = _mm256_castsi256_si128( gs );
            const __m128i high = _mm256_extracti128_si256( gs, 1 );
            // Insert high 4 bytes from high into low
            low = _mm_blend_epi32( low, high, 0b1000 );
    
            // Store 24 RGB bytes
            _mm_storeu_si128( ( __m128i* )rdi, low );
            _mm_storeu_si64( rdi + 16, high );
        }
    
        inline void computeGreyscale8( uint8_t* ptr )
        {
            __m256i v = loadRgb8( ptr );
            v = makeGreyscale( v );
            storeRgb8( ptr, v );
        }
    }
    
    void ConvertRgbToGrayscaleSIMD( uint8_t* ptr, size_t length )
    {
        const size_t rem = length % 24;
    
        uint8_t* const endAligned = ptr + ( length - rem );
        for( ; ptr < endAligned; ptr += 24 )
            computeGreyscale8( ptr );
    
        if( rem != 0 )
        {
            // An easy way to handle remainder is using a local buffer of 24 bytes, reusing the implementation
            // Unlike memcpy / memset which are function calls and are subject to ABI conventions,
            // __movsb / __stosb don't destroy data in vector registers
            uint8_t remSpan[ 24 ];
            __movsb( remSpan, ptr, rem );
            __stosb( &remSpan[ rem ], 0, 24 - rem );
    
            computeGreyscale8( remSpan );
    
            __movsb( ptr, remSpan, rem );
        }
    }