I have a very simple kernel.
"""
    kernel!(u, r, β::Number)

Overwrite `u` in place with the element-wise update `u * β + r`
(an AXPY-style operation) and return the mutated `u`.
"""
function kernel!(u, r, β::Number)
    # Fully fused broadcast: single pass, no temporaries.
    @. u = u * β + r
    return u
end
Clearly, it is embarrassingly parallel. I execute it on both the CPU and the GPU like so. (You might have to run this more than once since test cases are generated randomly, but I have yet to run into an accidental pass on my machine.)
using Test
using CUDA
using LinearAlgebra: norm
# Demonstrate that the same fused broadcast `u .* β .+ r` yields bitwise-different
# results on CPU and GPU, while still agreeing to within rounding error.
@testset "What is my GPU doing?" begin
N = 1024
u = rand(N)
u_cu = CuVector(u)
# Host-to-device copy must round-trip exactly.
@test Vector(u_cu) == u # OK
r = rand(N)
r_cu = CuVector(r)
@test Vector(r_cu) == r # OK
β = 3.0
# Identical broadcast expression evaluated on host and on device.
u .= u .* β .+ r
u_cu .= u_cu .* β .+ r_cu
# Bitwise equality is expected to fail: the device fuses the mul and add into
# an FMA (single rounding), the CPU broadcast uses separate mul and add.
@test_broken u == u_cu # BROKEN
# The discrepancy is only at the level of machine epsilon.
@test norm(u - Vector(u_cu)) < 1.0e-14 # OK
# Element types match, so the difference is not a precision mismatch.
@test eltype(u_cu) === eltype(u) # OK
end
So, the final result seems to have a margin of error around machine epsilon, which leads me to guess that one of them is reordering the operations, since floating-point arithmetic is not associative. I would like to get them to behave exactly the same.
(Also asked on the Julia lang discourse.)
Auxiliary question: is it possible to inspect PTX/SASS for broadcast operations on the device? @device_code_ptx
and friends only seem to work for kernels invoked using @cuda
.
It was pointed out in the comments that FMA could be another reason. In general, using separate mul
and add
instructions results in an intermediate result of the multiplication that gets rounded. This can be avoided using FMA. So, I ran some tests using both FMAs and the old kernels.
# Compare explicit `fma` against separate mul-and-add on both CPU and GPU to
# pin down FMA fusion as the source of the CPU/GPU discrepancy.
@testset "FMA vs mul and add" begin
N = 1024
u = rand(N)
u_cu = CuVector(u)
@test Vector(u_cu) == u
r = rand(N)
r_cu = CuVector(r)
# Separate output buffers so every variant starts from the same inputs.
x = similar(u)
x_fma = similar(x)
x_cu = similar(u_cu)
@test Vector(r_cu) == r
β = 3.0
# CPU: plain broadcast — separate mul and add, intermediate product is rounded.
x .= u .* β .+ r
# CPU: explicit fused multiply-add — single rounding of u*β + r.
x_fma .= fma.(u, β, r)
@test x_fma != x # FMA should differ from mul and add
@test norm(x_fma - x) < 1.0e-14 # but not too different
# GPU: plain broadcast already differs from the CPU's mul-and-add result...
x_cu .= u_cu .* β .+ r_cu
@test Vector(x_cu) != x
@test norm(Vector(x_cu) - x) < 1.0e-14
x_cu_cpy = copy(x_cu)
# ...and an explicit device-side fma reproduces it bit-for-bit, showing the
# GPU broadcast was already emitting an FMA.
x_cu .= fma.(u_cu, β, r_cu)
@test x_cu == x_cu_cpy # GPU FMA same as separate mul and add
end
It is not reordering of flops, it is FMA. CUDA.jl (at least 4.0.1) explicitly fuses muls and adds into FMAs, while in Julia you have to either use something like @fastmath
, or explicitly use Base.fma
.
The difference between an FMA and non-fused mul-add is that the non-fused instructions would result in rounding of the intermediate result of the mul, hence the difference in arithmetic results.
The generated assembly for CPU can be checked using @code_native
. In my case, I find that fma.(u, beta, r)
results in a vfmadd
instruction, while u .= u .* beta .+ r
does not.
The generated sass for the CUDA variant can be checked using CUDA.@device_code_sass
. I am not used to reading sass, but one of the lines was this:
DFMA R6, R6, c[0x0][0x1b8], R8 ;
,
which I guess is a double precision FMA. Changing the CPU version to explicitly use FMA solved this.
# Fixed version of the first test set: using an explicit `fma` on the CPU
# matches the GPU's fused multiply-add, so the results are bitwise identical.
@testset "What is my GPU doing?" begin
N = 1024
u = rand(N)
u_cu = CuVector(u)
@test Vector(u_cu) == u # OK
r = rand(N)
r_cu = CuVector(r)
@test Vector(r_cu) == r # OK
β = 3.0
# Single-rounding FMA on the host, mirroring the device's DFMA instruction.
u .= fma.(u, β, r) # <---------------- changed to fma
u_cu .= u_cu .* β .+ r_cu
# Now the CPU and GPU results agree exactly.
@test u == u_cu # OK
end
Output:
Test Summary: | Pass Total Time
What is my GPU doing? | 3 3 0.0s
Test.DefaultTestSet("What is my GPU doing?", Any[], 3, false, false, true, 1.687446862214095e9, 1.687446862222168e9, false)
julia> @device_code_sass kernel1!(u_cu, r_cu, β)
// PTX CompilerJob of kernel #broadcast_kernel#26(CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(*), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float64, 1}, Tuple{Bool}, Tuple{Int64}}, Float64}}, Base.Broadcast.Extruded{CuDeviceVector{Float64, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64) for sm_75
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)"
.elftype @"ET_EXEC"
//--------------------- .text._Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_ --------------------------
.section .text._Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_,"ax",@progbits
.sectioninfo @"SHI_REGISTERS=18"
.align 128
.global _Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_
.type _Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_,@function
.size _Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_,(.L_x_8 - _Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_)
.other _Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_:
.text._Z27julia_broadcast_kernel_314515CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedI12CuArrayStyleILi1EE5TupleI5OneToI5Int64EE2__S4_IS2_IS3_ILi1EEvS7_S4_I8ExtrudedIS0_IS1_Li1ELi1EES4_I4BoolES4_IS6_EES1_EES8_IS0_IS1_Li1ELi1EES4_IS9_ES4_IS6_EEEES6_:
;
IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
IMAD.MOV.U32 R0, RZ, RZ, c[0x0][0x1f8] ;
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x1fc] ;
;
ISETP.GE.U32.AND P0, PT, R0, 0x1, PT ;
ISETP.GE.AND.EX P0, PT, R2, RZ, PT, P0 ;
;
@!P0 EXIT ;
LDC.U8 R0, c[0x0][0x1a8] ;
S2R R6, SR_TID.X ;
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0xc] ;
ULDC.U8 UR5, c[0x0][0x1e0] ;
;
IMAD.MOV.U32 R4, RZ, RZ, c[0x0][0x1f8] ;
S2R R5, SR_CTAID.X ;
ULOP3.LUT UR5, UR5, 0x1, URZ, 0xc0, !UPT ;
IMAD.MOV.U32 R7, RZ, RZ, RZ ;
LOP3.LUT R0, R0, 0x1, RZ, 0xc0, !PT ;
IADD3 R6, R6, 0x1, RZ ;
ISETP.NE.U32.AND P0, PT, R0, 0x1, PT ;
IMAD R0, R2, c[0x0][0x0], RZ ;
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x1fc] ;
SHF.R.S32.HI R3, RZ, 0x1f, R0 ;
;
@P0 BRA `(.L_x_0) ;
ISETP.NE.AND P0, PT, RZ, UR5, PT ;
@!P0 BRA `(.L_x_1) ;
;
IMAD.WIDE.U32 R6, R5, c[0x0][0x0], R6 ;
IADD3 R9, P1, R6, -0x1, RZ ;
IADD3 R5, P0, P2, R9, 0x1, -R0 ;
IADD3.X R8, R7, -0x1, RZ, P1, !PT ;
;
IADD3 R5, P1, R0, R5, RZ ;
;
IADD3.X R6, R8, RZ, ~R3, P0, P2 ;
;
ISETP.GT.U32.AND P0, PT, R5, c[0x0][0x180], PT ;
IMAD.X R6, R3, 0x1, R6, P1 ;
ISETP.GT.AND.EX P0, PT, R6, c[0x0][0x184], PT, P0 ;
;
@P0 EXIT ;
SHF.L.U64.HI R11, R9.reuse, 0x3, R8 ;
IMAD.SHL.U32 R10, R9, 0x8, RZ ;
SHF.L.U64.HI R14, R0, 0x3, R3 ;
;
IMAD.MOV.U32 R12, RZ, RZ, R6 ;
.L_x_2:
;
ULDC.64 UR4, c[0x0][0x188] ;
ULDC.64 UR6, c[0x0][0x1c0] ;
LDG.E.64.SYS R6, [R10.64+UR4] ;
LDG.E.64.SYS R8, [R10.64+UR6] ;
IADD3 R4, P0, R4, -0x1, RZ ;
ULDC.64 UR4, c[0x0][0x168] ;
;
IADD3 R5, P1, R0, R5, RZ ;
IADD3.X R2, R2, -0x1, RZ, P0, !PT ;
ISETP.NE.U32.AND P0, PT, R4, RZ, PT ;
ISETP.NE.AND.EX P0, PT, R2, RZ, PT, P0 ;
;
DFMA R6, R6, c[0x0][0x1b8], R8 ;
LEA R9, P2, R0, R10, 0x3 ;
;
IMAD.X R8, R3, 0x1, R12, P1 ;
ISETP.GT.U32.AND P1, PT, R5, c[0x0][0x180], PT ;
IMAD.X R12, R11, 0x1, R14, P2 ;
ISETP.GT.AND.EX P1, PT, R8, c[0x0][0x184], PT, P1 ;
;
STG.E.64.SYS [R10.64+UR4], R6 ;
;
@!P0 EXIT ;
;
IMAD.MOV.U32 R11, RZ, RZ, R12 ;
IMAD.MOV.U32 R10, RZ, RZ, R9 ;
IMAD.MOV.U32 R12, RZ, RZ, R8 ;
;
@!P1 BRA `(.L_x_2) ;
EXIT ;
.L_x_1:
;
IMAD.WIDE.U32 R6, R5, c[0x0][0x0], R6 ;
ULDC.64 UR6, c[0x0][0x1e8] ;
ULDC.64 UR4, c[0x0][0x1c0] ;
IADD3 R5, P1, R6, -0x1, RZ ;
ULEA UR4, UP0, UR6, UR4, 0x3 ;
IADD3 R9, P0, P2, R5, 0x1, -R0 ;
ULEA.HI.X UR5, UR6, UR5, UR7, 0x3, UP0 ;
IADD3.X R6, R7, -0x1, RZ, P1, !PT ;
;
IADD3 R9, P1, R0, R9, RZ ;
;
IADD3.X R8, R6, RZ, ~R3, P0, P2 ;
;
ISETP.GT.U32.AND P0, PT, R9, c[0x0][0x180], PT ;
IMAD.X R12, R3, 0x1, R8, P1 ;
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x184], PT, P0 ;
;
@P0 EXIT ;
;
SHF.L.U64.HI R11, R5.reuse, 0x3, R6 ;
IMAD.SHL.U32 R10, R5, 0x8, RZ ;
SHF.L.U64.HI R14, R0, 0x3, R3 ;
;
IMAD.MOV.U32 R5, RZ, RZ, R9 ;
.L_x_3:
;
ULDC.64 UR6, c[0x0][0x188] ;
LDG.E.64.SYS R8, [UR4+-0x8] ;
LDG.E.64.SYS R6, [R10.64+UR6] ;
IADD3 R4, P0, R4, -0x1, RZ ;
IADD3.X R2, R2, -0x1, RZ, P0, !PT ;
;
ISETP.NE.U32.AND P0, PT, R4, RZ, PT ;
ISETP.NE.AND.EX P0, PT, R2, RZ, PT, P0 ;
IADD3 R5, P1, R0.reuse, R5, RZ ;
;
ULDC.64 UR6, c[0x0][0x168] ;
;
DFMA R6, R6, c[0x0][0x1b8], R8 ;
LEA R9, P2, R0, R10, 0x3 ;
;
IMAD.X R8, R3, 0x1, R12, P1 ;
ISETP.GT.U32.AND P1, PT, R5, c[0x0][0x180], PT ;
;
STG.E.64.SYS [R10.64+UR6], R6 ;
IMAD.X R12, R11, 0x1, R14, P2 ;
;
ISETP.GT.AND.EX P1, PT, R8, c[0x0][0x184], PT, P1 ;
;
@!P0 EXIT ;
;
IMAD.MOV.U32 R11, RZ, RZ, R12 ;
IMAD.MOV.U32 R10, RZ, RZ, R9 ;
IMAD.MOV.U32 R12, RZ, RZ, R8 ;
;
@!P1 BRA `(.L_x_3) ;
EXIT ;
.L_x_0:
;
ISETP.NE.AND P0, PT, RZ, UR5, PT ;
ULDC.64 UR8, c[0x0][0x1b0] ;
ULDC.64 UR6, c[0x0][0x188] ;
ULEA UR4, UP0, UR8, UR6, 0x3 ;
ULEA.HI.X UR5, UR8, UR7, UR9, 0x3, UP0 ;
@!P0 BRA `(.L_x_4) ;
;
IMAD.WIDE.U32 R6, R5, c[0x0][0x0], R6 ;
IADD3 R5, P1, R6, -0x1, RZ ;
IADD3 R9, P0, P2, R5, 0x1, -R0 ;
IADD3.X R6, R7, -0x1, RZ, P1, !PT ;
;
IADD3 R9, P1, R0, R9, RZ ;
;
IADD3.X R8, R6, RZ, ~R3, P0, P2 ;
;
ISETP.GT.U32.AND P0, PT, R9, c[0x0][0x180], PT ;
IMAD.X R12, R3, 0x1, R8, P1 ;
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x184], PT, P0 ;
@P0 EXIT ;
;
SHF.L.U64.HI R11, R5.reuse, 0x3, R6 ;
IMAD.SHL.U32 R10, R5, 0x8, RZ ;
SHF.L.U64.HI R14, R0, 0x3, R3 ;
;
IMAD.MOV.U32 R5, RZ, RZ, R9 ;
.L_x_5:
;
ULDC.64 UR6, c[0x0][0x1c0] ;
LDG.E.64.SYS R6, [UR4+-0x8] ;
LDG.E.64.SYS R8, [R10.64+UR6] ;
IADD3 R4, P0, R4, -0x1, RZ ;
ULDC.64 UR6, c[0x0][0x168] ;
;
IADD3 R5, P1, R0, R5, RZ ;
IADD3.X R2, R2, -0x1, RZ, P0, !PT ;
ISETP.NE.U32.AND P0, PT, R4, RZ, PT ;
ISETP.NE.AND.EX P0, PT, R2, RZ, PT, P0 ;
;
DFMA R6, R6, c[0x0][0x1b8], R8 ;
LEA R9, P2, R0, R10, 0x3 ;
;
IMAD.X R8, R3, 0x1, R12, P1 ;
ISETP.GT.U32.AND P1, PT, R5, c[0x0][0x180], PT ;
IMAD.X R12, R11, 0x1, R14, P2 ;
ISETP.GT.AND.EX P1, PT, R8, c[0x0][0x184], PT, P1 ;
;
STG.E.64.SYS [R10.64+UR6], R6 ;
;
@!P0 EXIT ;
;
IMAD.MOV.U32 R11, RZ, RZ, R12 ;
IMAD.MOV.U32 R10, RZ, RZ, R9 ;
IMAD.MOV.U32 R12, RZ, RZ, R8 ;
@!P1 BRA `(.L_x_5) ;
EXIT ;
.L_x_4:
;
IMAD.WIDE.U32 R6, R5, c[0x0][0x0], R6 ;
ULDC.64 UR6, c[0x0][0x1e8] ;
ULDC.64 UR8, c[0x0][0x1c0] ;
IADD3 R11, P2, R6, -0x1, RZ ;
ULEA UR8, UP0, UR6, UR8, 0x3 ;
IADD3 R5, P0, P1, R11, 0x1, -R0 ;
ULEA.HI.X UR7, UR6, UR9, UR7, 0x3, UP0 ;
IADD3.X R6, R7, -0x1, RZ, P2, !PT ;
;
IADD3 R5, P2, R0, R5, RZ ;
;
IADD3.X R8, R6, RZ, ~R3, P0, P1 ;
;
ISETP.GT.U32.AND P0, PT, R5, c[0x0][0x180], PT ;
;
LEA R13, P1, R11, c[0x0][0x168], 0x3 ;
;
IMAD.X R12, R3, 0x1, R8, P2 ;
;
LEA.HI.X R11, R11, c[0x0][0x16c], R6, 0x3, P1 ;
;
ISETP.GT.AND.EX P0, PT, R12, c[0x0][0x184], PT, P0 ;
;
@P0 EXIT ;
;
SHF.L.U64.HI R14, R0, 0x3, R3 ;
.L_x_6:
;
UMOV UR6, UR8 ;
LDG.E.64.SYS R6, [UR4+-0x8] ;
LDG.E.64.SYS R8, [UR6+-0x8] ;
;
IADD3 R15, P1, R4, -0x1, RZ ;
;
IMAD.MOV.U32 R4, RZ, RZ, R13 ;
;
LEA R10, P2, R0, R13, 0x3 ;
IADD3.X R2, R2, -0x1, RZ, P1, !PT ;
ISETP.NE.U32.AND P1, PT, R15, RZ, PT ;
ISETP.NE.AND.EX P1, PT, R2, RZ, PT, P1 ;
;
DFMA R6, R6, c[0x0][0x1b8], R8 ;
;
IADD3 R8, P0, R0, R5, RZ ;
;
IMAD.MOV.U32 R5, RZ, RZ, R11 ;
;
IMAD.X R11, R11, 0x1, R14, P2 ;
IMAD.X R9, R3, 0x1, R12, P0 ;
ISETP.GT.U32.AND P0, PT, R8, c[0x0][0x180], PT ;
;
STG.E.64.SYS [R4], R6 ;
;
ISETP.GT.AND.EX P0, PT, R9, c[0x0][0x184], PT, P0 ;
;
@!P1 EXIT ;
;
IMAD.MOV.U32 R4, RZ, RZ, R15 ;
IMAD.MOV.U32 R13, RZ, RZ, R10 ;
IMAD.MOV.U32 R5, RZ, RZ, R8 ;
IMAD.MOV.U32 R12, RZ, RZ, R9 ;
;
@!P0 BRA `(.L_x_6) ;
EXIT ;
.L_x_7:
BRA `(.L_x_7);
.L_x_8: