Search code examples
ada spark-ada

SPARK Ada: Overlays Without Copying


I am trying to create a view of an array object to better utilise SIMD vectors on the x86_64 platform.

Here's the main idea:

   type Char_Set_Index is range 0 .. 7;
   type Char_Set_Element is mod 2 ** 32;
   
   type Character_Set_Vector is array (Char_Set_Index) of Char_Set_Element
     with Alignment => 32,Component_Size => 32, Object_Size => 256, Size => 256;
   
   type Character_Set is array (Character) of Boolean
     with Alignment => 32, Component_Size => 1, Object_Size => 256, Size => 256;

Essentially, some of the operations in Ada.Strings.Maps can be processed more efficiently using SIMD arithmetic. For instance, the "=" operation, perhaps coded as,

   --  Component-wise equality: True when, for every Character k,
   --  Left (k) and Right (k) hold the same Boolean value.
   function "="
     (Left, Right : in Character_Set)
      return Boolean
   is
     (for all k in Character_Set'Range =>
         (Left(k) = Right(k)));

.. gives us the following output

# Compiler output (AT&T syntax) for the quantified-expression "=".
# The two sets are compared one bit per loop iteration, 256 iterations at most.
# NOTE(review): %rdi/%rsi are presumably the addresses of Left/Right
# (System V AMD64 argument registers) - confirm against the target ABI.
.LFB4:
    .cfi_startproc
    movq    %rdi, %r8           # r8  = first pointer argument
    movq    %rsi, %rdi          # rdi = second pointer argument
    xorl    %esi, %esi          # esi = bit index k, starting at 0
    jmp .L6
    .p2align 4,,10
    .p2align 3
.L10:
    addl    $1, %esi            # k := k + 1
    cmpl    $256, %esi          # all 256 bits visited?
    je  .L9                     # yes: no difference was found
.L6:
    movl    %esi, %edx
    movl    %esi, %ecx
    sarl    $3, %edx            # edx = k / 8  (byte offset into the set)
    andl    $7, %ecx            # ecx = k mod 8 (bit position inside that byte)
    movslq  %edx, %rdx
    movzbl  (%rdi,%rdx), %eax   # byte k/8 of the second operand
    xorb    (%r8,%rdx), %al     # XOR with byte k/8 of the first: 1-bits mark differences
    shrb    %cl, %al            # bring bit k down to bit 0
    testb   $1, %al
    je  .L10                    # bit k is equal: go on to the next bit
    xorl    %eax, %eax          # bit k differs: return 0 (False)
    ret
.L9:
    movl    $1, %eax            # every bit matched: return 1 (True)
    ret
    .cfi_endproc

Critically, it is comparing each bit, and GCC won't vectorise it. However, if we write,

   --  Word-wise equality.  Left and Right are viewed in place as eight
   --  32-bit words through address overlays; the per-word mismatch flags
   --  are summed and the sets are equal when that sum is zero.
   function "="
     (Left, Right : in Character_Set)
      return Boolean
   is
      
      --  Read-only view placed at Left's address (no copy is declared).
      u : aliased constant Character_Set_Vector
        with Import, Address => Left'Address;
      
      --  Read-only view placed at Right's address.
      v : aliased constant Character_Set_Vector
        with Import, Address => Right'Address;
   
      Temp : array (Char_Set_Index) of Integer;  --  1 where words differ, else 0
      Sum  : Integer;                            --  number of differing words
   
   begin
   
      --  No early exit: every word is compared and the outcome stored.
      for j in Temp'Range loop
         pragma Loop_Optimize (Vector);
         Temp(j) := (if u(j) = v(j) then 0 else 1);
      end loop;
   
      Sum := 0;
      for j in Temp'Range loop
         Sum := Sum + Temp(j);
      end loop;
   
      return Sum = 0;  --  equal iff no word differed
   
   end "=";

We get the branch-free SIMD instructions that we kind of expect,

    # Compiler output (AT&T syntax) for the overlay version: one 256-bit
    # compare followed by a horizontal add of the eight per-dword results.
    .cfi_startproc
    vmovdqa (%rdi), %ymm1                  # aligned 32-byte load from the first operand
    vpcmpeqd    (%rsi), %ymm1, %ymm1       # per-dword compare: all-ones where equal, zero where not
    vpandn  .LC0(%rip), %ymm1, %ymm1       # ymm1 = (not ymm1) and .LC0; .LC0 is not shown - presumably all dwords = 1
    vextracti128    $0x1, %ymm1, %xmm0     # xmm0 = upper 128 bits
    vpaddd  %xmm1, %xmm0, %xmm0            # add upper half to lower half (8 lanes -> 4)
    vpsrldq $8, %xmm0, %xmm1               # shift right by 8 bytes
    vpaddd  %xmm1, %xmm0, %xmm0            # 4 lanes -> 2
    vpsrldq $4, %xmm0, %xmm1               # shift right by 4 bytes
    vpaddd  %xmm1, %xmm0, %xmm0            # 2 lanes -> 1: lane 0 holds the total
    vmovd   %xmm0, %eax                    # eax = sum of the per-dword flags
    testl   %eax, %eax
    sete    %al                            # result = (sum = 0)
    vzeroupper
    ret
    .cfi_endproc

Which all works rather well. Now, the problem at hand. If you push this code through SPARK Ada, there are a number of complaints regarding alignment, aliasing, and constants, so you have to end up writing,

   --  Word-wise equality, variant written to satisfy the SPARK checks.
   --  The overlays are placed on local, explicitly aligned constants
   --  initialised from the parameters rather than on the parameters
   --  themselves.
   function "="
     (Left, Right : in Character_Set)
      return Boolean
   is
      
      --  Local constant initialised from Left, with explicit 32-byte alignment.
      Left_Aligned : constant Character_Set := Left
        with Alignment => 32;
      
      --  Local constant initialised from Right, with explicit 32-byte alignment.
      Right_Aligned : constant Character_Set := Right
        with Alignment => 32;
      
      --  Read-only view of Left_Aligned as eight 32-bit words.
      u : aliased constant Character_Set_Vector
        with Import, Alignment => 32, Address => Left_Aligned'Address;
      
      --  Read-only view of Right_Aligned as eight 32-bit words.
      v : aliased constant Character_Set_Vector
        with Import, Alignment => 32, Address => Right_Aligned'Address;
   
      Temp : array (Char_Set_Index) of Integer;  --  1 where words differ, else 0
      Sum  : Integer;                            --  number of differing words
   
   begin
   
      --  No early exit: every word is compared and the outcome stored.
      for j in Temp'Range loop
         pragma Loop_Optimize (Vector);
         Temp(j) := (if u(j) = v(j) then 0 else 1);
      end loop;
   
      Sum := 0;
      for j in Temp'Range loop
         Sum := Sum + Temp(j);
      end loop;
   
      return Sum = 0;  --  equal iff no word differed
   
   end "=";

which gives us an awful lot of precopying, presumably to ensure that everything is aligned OK - even though the declarations already have the correct alignment,

    # Compiler output (AT&T syntax) for the SPARK-compliant version.
    # Both operands are first copied to 32-byte-aligned stack slots and the
    # 256-bit compare then reads those copies instead of the originals.
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    andq    $-32, %rsp                     # round rsp down to a 32-byte boundary
    vmovdqa (%rdi), %xmm2                  # first operand, low 16 bytes
    vmovdqa 16(%rdi), %xmm3                # first operand, high 16 bytes
    vmovdqa (%rsi), %xmm4                  # second operand, low 16 bytes
    vmovdqa 16(%rsi), %xmm5                # second operand, high 16 bytes
    vmovdqa %xmm2, -64(%rsp)               # stack copy of first operand at rsp-64 .. rsp-33
    vmovdqa %xmm3, -48(%rsp)
    vmovdqa -64(%rsp), %ymm6               # reload that copy as one 256-bit vector
    vmovdqa %xmm4, -32(%rsp)               # stack copy of second operand at rsp-32 .. rsp-1
    vmovdqa %xmm5, -16(%rsp)
    vpcmpeqd    -32(%rsp), %ymm6, %ymm1    # per-dword compare of the two copies: all-ones where equal
    vpandn  .LC0(%rip), %ymm1, %ymm1       # ymm1 = (not ymm1) and .LC0; .LC0 is not shown - presumably all dwords = 1
    vextracti128    $0x1, %ymm1, %xmm0     # xmm0 = upper 128 bits
    vpaddd  %xmm1, %xmm0, %xmm0            # 8 lanes -> 4
    vpsrldq $8, %xmm0, %xmm1
    vpaddd  %xmm1, %xmm0, %xmm0            # 4 lanes -> 2
    vpsrldq $4, %xmm0, %xmm1
    vpaddd  %xmm1, %xmm0, %xmm0            # 2 lanes -> 1: lane 0 holds the total
    vmovd   %xmm0, %eax                    # eax = sum of the per-dword flags
    testl   %eax, %eax
    sete    %al                            # result = (sum = 0)
    vzeroupper
    leave                                  # restore rsp and rbp saved in the prologue
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc

The only reason to bother with any of this is better performance; however, the SPARK Ada rules seem too restrictive in this case and end up hurting performance. So, my question is: is there a better way of doing this that doesn't result in excessive data movement which, as far as I can tell, is not required?

Incidentally, Ada.Unchecked_Conversion similarly does a lot of moving data around at the beginning, too.

Also, I realise that I can justify the SPARK Ada checks (false-positive) so I can use the Ada version, but I am hoping that I am missing something, here, and that there is an easier way to do this.

Perhaps there is a way of vectorising arrays of Booleans?

EDIT: I am compiling it using

gnatmake -O3 -mavx2 -gnatn -gnatp -S name-of-package.adb

Solution

  • The question of why the alignment of Left and Right is unknown within the body of the function is interesting. You indeed can neither assert on the alignment attribute nor add a precondition to the function stating a requirement on parameter alignment (at least for GNATprove FSF 11.2.0). There is some comment on the issue in the SPARK source code though (see line 3276 in spark_definition.adb).

    On the other hand, it seems that you can work around the additional copying of the unchecked conversion by applying the conversion in the loop. Below is what I was able to achieve with GNAT FSF 11.3.1:

    character_sets.ads

    --  Packed character sets with a word-wise equality operator.
    package Character_Sets with SPARK_Mode is
       
       --  One Boolean per Character, packed to a single bit each:
       --  256 bits in total, with an alignment of 32.
       type Character_Set is array (Character) of Boolean
         with 
           Alignment      => 32,
           Component_Size => 1, 
           Object_Size    => 256, 
           Size           => 256;
       
       --  Returns True when Left and Right contain exactly the same bits.
       function "=" (Left, Right : in Character_Set) return Boolean;
    
    end Character_Sets;
    

    character_sets.adb

    with Ada.Unchecked_Conversion;
    
    package body Character_Sets with SPARK_Mode is
    
       type Char_Set_Index is range 0 .. 7;      --  eight words per set
       type Char_Set_Element is mod 2 ** 32;     --  one 32-bit word
       
       type Character_Set_Vector is array (Char_Set_Index) of aliased Char_Set_Element
         with 
           Alignment      => 32,
           Component_Size => 32, 
           Object_Size    => 256, 
           Size           => 256;                --  same size and alignment as Character_Set
       
       function To_Vector is new Ada.Unchecked_Conversion
         (Source => Character_Set,
          Target => Character_Set_Vector);       --  reinterprets the 256 bits as eight words
       
       ---------
       -- "=" --
       ---------
    
       function "=" (Left, Right : in Character_Set) return Boolean is
          
          Temp : array (Char_Set_Index) of Integer;  --  1 where words differ, else 0
          Sum  : Integer;                            --  number of differing words
       
       begin
         
          for J in Temp'Range loop
             pragma Loop_Optimize (Vector);
             Temp (J) := (if To_Vector (Left) (J) = To_Vector (Right) (J) then 0 else 1);    --  !!! conversion is applied per element, inside the loop
          end loop;
       
          Sum := 0;
          for J in Temp'Range loop
             Sum := Sum + Temp (J);
          end loop;
       
          return Sum = 0;  --  equal iff no word differed
       
       end "=";
       
    end Character_Sets;
    

    default.gpr

    --  Library-style project: sources in "src", build artefacts in "obj",
    --  no main program.
    project Default is
    
       for Source_Dirs use ("src");
       for Object_Dir use "obj";
       for Main use ();
    
       package Compiler is
          --  Same switches as the gnatmake command line used in the question.
          for Switches ("ada") use ("-O3", "-mavx2", "-gnatn", "-gnatp");
       end Compiler;
    
    end Default;
    

    output (objdump)

    $ objdump -d -M intel ./obj/character_sets.o 
    
    ./obj/character_sets.o:     file format elf64-x86-64
    
    
    Disassembly of section .text:
    
    0000000000000000 <character_sets__Tcharacter_setBIP>:
       0:   c3                      ret    
       1:   90                      nop
       2:   66 66 2e 0f 1f 84 00    data16 cs nop WORD PTR [rax+rax*1+0x0]
       9:   00 00 00 00 
       d:   0f 1f 00                nop    DWORD PTR [rax]
    
    0000000000000010 <character_sets__Tcharacter_set_vectorBIP>:
      10:   c3                      ret    
      11:   90                      nop
      12:   66 66 2e 0f 1f 84 00    data16 cs nop WORD PTR [rax+rax*1+0x0]
      19:   00 00 00 00 
      1d:   0f 1f 00                nop    DWORD PTR [rax]
    
    0000000000000020 <character_sets__Oeq>:
      20:   c5 fd 6f 0f             vmovdqa ymm1,YMMWORD PTR [rdi]
      24:   c5 f5 76 0e             vpcmpeqd ymm1,ymm1,YMMWORD PTR [rsi]
      28:   c5 f5 df 0d 00 00 00    vpandn ymm1,ymm1,YMMWORD PTR [rip+0x0]        # 30 <character_sets__Oeq+0x10>
      2f:   00 
      30:   c4 e3 7d 39 c8 01       vextracti128 xmm0,ymm1,0x1
      36:   c5 f9 fe c1             vpaddd xmm0,xmm0,xmm1
      3a:   c5 f1 73 d8 08          vpsrldq xmm1,xmm0,0x8
      3f:   c5 f9 fe c1             vpaddd xmm0,xmm0,xmm1
      43:   c5 f1 73 d8 04          vpsrldq xmm1,xmm0,0x4
      48:   c5 f9 fe c1             vpaddd xmm0,xmm0,xmm1
      4c:   c5 f9 7e c0             vmovd  eax,xmm0
      50:   85 c0                   test   eax,eax
      52:   0f 94 c0                sete   al
      55:   c5 f8 77                vzeroupper 
      58:   c3                      ret
    

    output (gnatprove)

    $ gnatprove -P ./default.gpr -f
    Phase 1 of 2: generation of Global contracts ...
    Phase 2 of 2: flow analysis and proof ...
    
    character_sets.adb:31:10: warning: pragma "Loop_Optimize" ignored (not yet supported)
       31 |         pragma Loop_Optimize (Vector);
          |         ^ here
    Summary logged in /home/deedee/72423385-spark-ada-overlays-without-copying/obj/gnatprove/gnatprove.out