Search code examples
c#f#stackunmanaged-memory

F# NativePtr.stackalloc slower than C# stackalloc - Decompiled Code Included


Continuing my F# performance testing. For some more background see here:

f# NativePtr.stackalloc in Struct Constructor

F# NativePtr.stackalloc Unexpected Stack Overflow

Now I've got stack arrays working in F#. However, for some reason the equivalent C# is approximately 50x faster. I've included the ILSpy decompiled versions below and it appears only 1 line is really different (inside stackAlloc).

What's going on here? Is the unchecked arithmetic really responsible for this big difference? Not sure how I could test this??

https://msdn.microsoft.com/en-us/library/a569z7k8.aspx

F# Code

#nowarn "9"

open Microsoft.FSharp.NativeInterop
open System
open System.Diagnostics    
open System.Runtime.CompilerServices        

/// Benchmark target: reserves `x` bytes on the stack with NativePtr.stackalloc.
/// The resulting pointer is bound to a local but never read, and the function
/// returns unit.
// NoInlining requests that the JIT keep this as a separate method rather than
// inlining it into the caller.
[<MethodImpl(MethodImplOptions.NoInlining)>]
let stackAlloc x =
    // The annotation fixes the element type to byte, so `x` is an element count of bytes.
    let mutable ints:nativeptr<byte> = NativePtr.stackalloc x
    ()   

/// Entry point of the F# benchmark: echoes the command-line arguments, times
/// `reps` calls to `stackAlloc size` with a Stopwatch, prints the elapsed
/// milliseconds, waits for a key press and returns exit code 0.
[<EntryPoint>]
let main argv = 
    printfn "%A" argv

    // Bytes requested per call and number of timed calls.
    let size = 8192            
    let reps = 10000

    // One untimed call before the clock starts, so first-call JIT compilation
    // is not included in the measurement.
    stackAlloc size // JIT
    let clock = Stopwatch()
    clock.Start()
    for i = 1 to reps do            
        stackAlloc size
    clock.Stop()

    // Elapsed time is kept as a float; the {3:#,##0.####} placeholder formats it.
    let elapsed = clock.Elapsed.TotalMilliseconds
    let description = "F# NativePtr.stackalloc"
    Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed)

    // Wait for a key press before returning.
    Console.ReadKey() |> ignore
    0

C# Code

using System;
using System.Diagnostics;

namespace CSharpLanguageFeatures
{
    /// <summary>
    /// C# side of the benchmark: times repeated calls to <see cref="stackAlloc"/>.
    /// </summary>
    class CSharpStackArray
    {
        /// <summary>
        /// Runs one untimed call, then times <c>reps</c> calls to
        /// <see cref="stackAlloc"/>, prints the elapsed milliseconds and waits
        /// for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Bytes requested per call and number of timed calls.
            int size = 8192;
            int reps = 10000;

            // One untimed call before the clock starts, so first-call JIT
            // compilation is not included in the measurement.
            stackAlloc(size); // JIT
            Stopwatch clock = new Stopwatch();
            clock.Start();
            for (int i = 0; i < reps; i++)
            {
                stackAlloc(size);
            }
            clock.Stop();

            // NOTE(review): elapsed is already formatted into a string here, so the
            // "#,##0.####" specifier on placeholder {3} below should have no further
            // effect (string is not IFormattable). The F# version passes the raw number.
            string elapsed = clock.Elapsed.TotalMilliseconds.ToString("#,##0.####");
            string description = "C# stackalloc";
            Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed);
            Console.ReadKey();
        }

        /// <summary>
        /// Benchmark target: reserves <paramref name="arraySize"/> bytes on the
        /// stack with <c>stackalloc</c>. The pointer is assigned to a local that
        /// is never read.
        /// </summary>
        /// <param name="arraySize">Number of bytes to allocate.</param>
        public unsafe static void stackAlloc(int arraySize)
        {
            byte* pArr = stackalloc byte[arraySize];
        }
    }
}

F# Version Decompiled

using Microsoft.FSharp.Core;
using System;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;

[CompilationMapping(SourceConstructFlags.Module)]
public static class FSharpStackArray
{
    [MethodImpl(MethodImplOptions.NoInlining)]
    public unsafe static void stackAlloc(int x)
    {
        IntPtr ints = stackalloc byte[x * sizeof(byte)];
    }

    [EntryPoint]
    public static int main(string[] argv)
    {
        PrintfFormat<FSharpFunc<string[], Unit>, TextWriter, Unit, Unit> format = new PrintfFormat<FSharpFunc<string[], Unit>, TextWriter, Unit, Unit, string[]>("%A");
        PrintfModule.PrintFormatLineToTextWriter<FSharpFunc<string[], Unit>>(Console.Out, format).Invoke(argv);
        FSharpStackArray.stackAlloc(8192);
        Stopwatch clock = new Stopwatch();
        clock.Start();
        for (int i = 1; i < 10001; i++)
        {
            FSharpStackArray.stackAlloc(8192);
        }
        clock.Stop();
        double elapsed = clock.Elapsed.TotalMilliseconds;
        Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", "F# NativePtr.stackalloc", 8192, 10000, elapsed);
        ConsoleKeyInfo consoleKeyInfo = Console.ReadKey();
        return 0;
    }
}

C# Version Decompiled

using System;
using System.Diagnostics;

namespace CSharpLanguageFeatures
{
    internal class CSharpStackArray
    {
        private static void Main(string[] args)
        {
            int size = 8192;
            int reps = 10000;
            CSharpStackArray.stackAlloc(size);
            Stopwatch clock = new Stopwatch();
            clock.Start();
            for (int i = 0; i < reps; i++)
            {
                CSharpStackArray.stackAlloc(size);
            }
            clock.Stop();
            string elapsed = clock.Elapsed.TotalMilliseconds.ToString("#,##0.####");
            string description = "C# stackalloc";
            Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", new object[]
            {
                description,
                size,
                reps,
                elapsed
            });
            Console.ReadKey();
        }

        public unsafe static void stackAlloc(int arraySize)
        {
            IntPtr arg_06_0 = stackalloc byte[checked(unchecked((UIntPtr)arraySize) * 1)];
        }
    }
}

F# Version IL - Byte Allocation

.method public static 
    void stackAlloc (
        int32 x
    ) cil managed noinlining 
{
    // Method begins at RVA 0x2050
    // Code size 13 (0xd)
    .maxstack 4
    .locals init (
        [0] native int ints
    )

    IL_0000: nop
    IL_0001: ldarg.0
    IL_0002: sizeof [mscorlib]System.Byte
    IL_0008: mul
    IL_0009: localloc
    IL_000b: stloc.0
    IL_000c: ret
} // end of method FSharpStackArray::stackAlloc

C# Version IL - Byte Allocation

.method public hidebysig static 
    void stackAlloc (
        int32 arraySize
    ) cil managed 
{
    // Method begins at RVA 0x2094
    // Code size 8 (0x8)
    .maxstack 8

    IL_0000: ldarg.0
    IL_0001: conv.u
    IL_0002: ldc.i4.1
    IL_0003: mul.ovf.un
    IL_0004: localloc
    IL_0006: pop
    IL_0007: ret
} // end of method CSharpStackArray::stackAlloc   

Updated F# IL - IntPtr Allocation

.method public static 
    void stackAlloc (
        int32 x
    ) cil managed noinlining 
{
    // Method begins at RVA 0x2050
    // Code size 13 (0xd)
    .maxstack 4
    .locals init (
        [0] native int ints
    )

    IL_0000: nop
    IL_0001: ldarg.0
    IL_0002: sizeof [mscorlib]System.IntPtr
    IL_0008: mul
    IL_0009: localloc
    IL_000b: stloc.0
    IL_000c: ret
} // end of method FSharpStackArray::stackAlloc

Updated C# IL - IntPtr Allocation

.method public hidebysig static 
    void stackAlloc (
        int32 arraySize
    ) cil managed 
{
    // Method begins at RVA 0x2415
    // Code size 13 (0xd)
    .maxstack 8

    IL_0000: ldarg.0
    IL_0001: conv.u
    IL_0002: sizeof [mscorlib]System.IntPtr
    IL_0008: mul.ovf.un
    IL_0009: localloc
    IL_000b: pop
    IL_000c: ret
} // end of method CSharpStackArray::stackAlloc

Solution

  • Thanks everyone for the help with this.

    The answer was that the C# compiler was not storing the pointer as a local. This was because the allocated memory was never needed. The lack of "sizeof" and the differing "mul" gave the C# version another slight edge.

    F# Assembler - Differences Are Commented

    .method public static 
        void stackAlloc (
            int32 x
        ) cil managed noinlining 
    {
        // Method begins at RVA 0x2050
        // Code size 13 (0xd)
        .maxstack 4
        .locals init ( //***** Not in C# Version *****//
            [0] native int ints
        )
    
        IL_0000: nop
        IL_0001: ldarg.0
        IL_0002: sizeof [mscorlib]System.Byte //***** C# just uses "1" *****//
        IL_0008: mul //***** C# uses "mul.ovf.un" *****//
        IL_0009: localloc
        IL_000b: stloc.0 //***** Not in C# Version *****//
        IL_000c: ret
    } // end of method FSharpStackArray::stackAlloc
    

    C# Assembler - Differences Are Commented

    .method public hidebysig static 
        void stackAlloc (
            int32 arraySize
        ) cil managed 
    {
        // Method begins at RVA 0x2094
        // Code size 8 (0x8)
        .maxstack 8
    
        IL_0000: ldarg.0
        IL_0001: conv.u
        IL_0002: ldc.i4.1 //***** F# uses sizeof [mscorlib]System.Byte *****//
        IL_0003: mul.ovf.un //***** F# uses "mul" *****//
        IL_0004: localloc
        IL_0006: pop
        IL_0007: ret
    } // end of method CSharpStackArray::stackAlloc  
    

    This exercise has taught me a few things:

    1. Compilers perform a lot of optimization. Apparently identical high-level code in different languages can result in quite different sets of machine instructions.
    2. When benchmarking .NET languages you can read the Intermediate Language (IL) to really see what's going on. Use ILSpy for this.
    3. You can modify and compile IL using ilasm.exe.
    4. The C# compiler is doing a better job here of removing unnecessary code. Once you set every byte in the allocated memory, the performance becomes very similar, as initially expected.

    Final F# Code

    #nowarn "9"
    
    open Microsoft.FSharp.NativeInterop
    open System
    open System.Diagnostics    
    open System.Runtime.CompilerServices        
    
    /// Benchmark target: reserves `x` bytes on the stack and then writes to every
    /// one of them (byte i receives the low 8 bits of its own index), so the
    /// allocation is actually used. Returns unit.
    [<MethodImpl(MethodImplOptions.NoInlining)>]
    let stackAlloc x =
        // The pointer itself is never reassigned, so an immutable binding suffices.
        let buffer : nativeptr<byte> = NativePtr.stackalloc x
        for index in 0 .. x - 1 do
            NativePtr.set buffer index (byte index)
    
    /// Entry point of the F# benchmark: echoes the command-line arguments, times
    /// `reps` calls to `stackAlloc size`, prints the elapsed milliseconds, waits
    /// for a key press and returns exit code 0.
    [<EntryPoint>]
    let main argv =
        printfn "%A" argv

        let size = 8192
        let reps = 10000

        // Untimed warm-up call so JIT compilation stays out of the measurement.
        stackAlloc size

        let timer = Stopwatch.StartNew()
        for _ in 1 .. reps do
            stackAlloc size
        timer.Stop()

        Console.WriteLine(
            "{0} ({1} bytes, {2} reps): {3:#,##0.####}ms",
            "F# NativePtr.stackalloc",
            size,
            reps,
            timer.Elapsed.TotalMilliseconds)

        Console.ReadKey() |> ignore
        0
    

    Final C# Code

    using System;
    using System.Diagnostics;
    using System.Runtime.CompilerServices;
    
    namespace CSharpStackArray
    {
        /// <summary>
        /// C# side of the benchmark: times repeated calls to <see cref="stackAlloc"/>,
        /// which allocates a stack buffer and writes to every byte of it.
        /// </summary>
        class Program
        {
            /// <summary>
            /// Runs one untimed warm-up call, then times <c>reps</c> calls to
            /// <see cref="stackAlloc"/>, prints the elapsed milliseconds and waits
            /// for a key press.
            /// </summary>
            /// <param name="args">Command-line arguments; not used.</param>
            static void Main(string[] args)
            {
                int size = 8192;
                int reps = 10000;

                // Untimed warm-up call so JIT compilation stays out of the measurement.
                stackAlloc(size);

                Stopwatch clock = Stopwatch.StartNew();
                for (int i = 0; i < reps; i++)
                {
                    stackAlloc(size);
                }
                clock.Stop();

                // Pass the raw double and let the {3:#,##0.####} placeholder format it.
                // Pre-formatting to a string made that specifier a no-op (strings are
                // not IFormattable) and differed from the F# harness for no reason.
                double elapsed = clock.Elapsed.TotalMilliseconds;
                string description = "C# stackalloc";
                Console.WriteLine("{0} ({1} bytes, {2} reps): {3:#,##0.####}ms", description, size, reps, elapsed);
                Console.ReadKey();
            }

            /// <summary>
            /// Benchmark target: reserves <paramref name="arraySize"/> bytes on the
            /// stack and writes to every one of them (byte <c>i</c> receives the low
            /// 8 bits of its own index), so the allocation cannot be discarded as unused.
            /// </summary>
            /// <param name="arraySize">Number of bytes to allocate and fill.</param>
            /// <remarks>
            /// Marked <see cref="MethodImplOptions.NoInlining"/> to match the F#
            /// <c>stackAlloc</c>, so both harnesses time a real method call per iteration.
            /// </remarks>
            [MethodImpl(MethodImplOptions.NoInlining)]
            public unsafe static void stackAlloc(int arraySize)
            {
                byte* pArr = stackalloc byte[arraySize];
                for (int i = 0; i < arraySize; i++)
                {
                    pArr[i] = (byte)i;
                }
            }
        }
    }