Search code examples
cclangcompiler-optimizationstrict-aliasingrestrict-qualifier

Does Clang misunderstand the 'const' pointer specifier?


In the code below I saw that clang fails to perform better optimisation without implicit restrict pointer specifier:

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

typedef struct {
    uint32_t        event_type;
    uintptr_t       param;
} event_t;

typedef struct
{
    event_t                     *queue;
    size_t                      size;
    uint16_t                    num_of_items;
    uint8_t                     rd_idx;
    uint8_t                     wr_idx;
} queue_t;

static bool queue_is_full(const queue_t *const queue_ptr)
{
    return queue_ptr->num_of_items == queue_ptr->size;
}

static size_t queue_get_size_mask(const queue_t *const queue_ptr)
{
    return queue_ptr->size - 1;
}

int queue_enqueue(queue_t *const queue_ptr, const event_t *const event_ptr)
{
    if(queue_is_full(queue_ptr))
    {
        return 1;
    }

    queue_ptr->queue[queue_ptr->wr_idx++] = *event_ptr;
    queue_ptr->num_of_items++;
    queue_ptr->wr_idx &= queue_get_size_mask(queue_ptr);

    return 0;
}

I compiled this code with clang version 11.0.0 (clang-1100.0.32.5)

clang -O2 -arch armv7m -S test.c -o test.s

In the disassembled file I saw that the generated code re-reads the memory:

_queue_enqueue:
        .cfi_startproc
@ %bb.0:
        ldrh    r2, [r0, #8]            ---> reads the queue_ptr->num_of_items
        ldr     r3, [r0, #4]            ---> reads the queue_ptr->size
        cmp     r3, r2
        itt     eq
        moveq   r0, #1
        bxeq    lr
        ldrb    r2, [r0, #11]           ---> reads the queue_ptr->wr_idx
        adds    r3, r2, #1
        strb    r3, [r0, #11]           ---> stores the queue_ptr->wr_idx + 1
        ldr.w   r12, [r1]
        ldr     r3, [r0]
        ldr     r1, [r1, #4]
        str.w   r12, [r3, r2, lsl #3]
        add.w   r2, r3, r2, lsl #3
        str     r1, [r2, #4]
        ldrh    r1, [r0, #8]            ---> !!! re-reads the queue_ptr->num_of_items
        adds    r1, #1
        strh    r1, [r0, #8]
        ldrb    r1, [r0, #4]            ---> !!! re-reads the queue_ptr->size (only the first byte)
        ldrb    r2, [r0, #11]           ---> !!! re-reads the queue_ptr->wr_idx
        subs    r1, #1
        ands    r1, r2
        strb    r1, [r0, #11]           ---> !!! stores the updated queue_ptr->wr_idx once again after applying the mask
        movs    r0, #0
        bx      lr
        .cfi_endproc
                                        @ -- End function

After adding the restrict keyword to the pointers, these unneeded re-reads just vanished:

int queue_enqueue(queue_t * restrict const queue_ptr, const event_t * restrict const event_ptr)

I know that in clang, by default strict aliasing is disabled. But in this case, event_ptr pointer is defined as const so its object's content cannot be modified by this pointer, thus it cannot affect the content to which queue_ptr points (assuming the case when the objects overlap in the memory), right?

So is this a compiler optimisation bug or there is indeed some weird case when the object pointed by queue_ptr can be affected by event_ptr assuming this declaration:

int queue_enqueue(queue_t *const queue_ptr, const event_t *const event_ptr)

By the way, I tried to compile the same code for x86 target and inspected similar optimisation issue.


The generated assembly with the restrict keyword, doesn't contain the re-reads:

_queue_enqueue:
        .cfi_startproc
@ %bb.0:
        ldr     r3, [r0, #4]
        ldrh    r2, [r0, #8]
        cmp     r3, r2
        itt     eq
        moveq   r0, #1
        bxeq    lr
        push    {r4, r6, r7, lr}
        .cfi_def_cfa_offset 16
        .cfi_offset lr, -4
        .cfi_offset r7, -8
        .cfi_offset r6, -12
        .cfi_offset r4, -16
        add     r7, sp, #8
        .cfi_def_cfa r7, 8
        ldr.w   r12, [r1]
        ldr.w   lr, [r1, #4]
        ldrb    r1, [r0, #11]
        ldr     r4, [r0]
        subs    r3, #1
        str.w   r12, [r4, r1, lsl #3]
        add.w   r4, r4, r1, lsl #3
        adds    r1, #1
        ands    r1, r3
        str.w   lr, [r4, #4]
        strb    r1, [r0, #11]
        adds    r1, r2, #1
        strh    r1, [r0, #8]
        movs    r0, #0
        pop     {r4, r6, r7, pc}
        .cfi_endproc
                                        @ -- End function

Addition:

After some discussion with Lundin in the comments to his answer, I got the impression that the re-reads could be caused because the compiler would assume that queue_ptr->queue might potentially point to *queue_ptr itself. So I changed the queue_t struct to contain array instead of the pointer:

typedef struct
{
    event_t                     queue[256]; // changed from pointer to array with max size
    size_t                      size;
    uint16_t                    num_of_items;
    uint8_t                     rd_idx;
    uint8_t                     wr_idx;
} queue_t;

However the re-reads remained as previously. I still can't understand what could make the compiler think that queue_t fields may be modified and thus require re-reads... The following declaration eliminates the re-reads:

int queue_enqueue(queue_t * restrict const queue_ptr, const event_t *const event_ptr)

But why queue_ptr has to be declared as restrict pointer to prevent the re-reads I don't understand (unless it is a compiler optimization "bug").

P.S.

I also couldn't find any link to file/report an issue on clang that doesn't cause the compiler to crash...


Solution

  • [talking about the original program]

    This is caused by deficiency in the TBAA metadata, generated by Clang.

    If you emit LLVM IR with -S -emit-llvm you'll see (snipped for brevity):

    ...
    
      %9 = load i8, i8* %wr_idx, align 1, !tbaa !12
      %10 = trunc i32 %8 to i8
      %11 = add i8 %10, -1
      %conv4 = and i8 %11, %9
      store i8 %conv4, i8* %wr_idx, align 1, !tbaa !12
      br label %return
    
    ...
    
    !0 = !{i32 1, !"wchar_size", i32 4}
    !1 = !{i32 1, !"min_enum_size", i32 4}
    !2 = !{!"clang version 10.0.0 (/home/chill/src/llvm-project 07da145039e1a6a688fb2ac2035b7c062cc9d47d)"}
    !3 = !{!4, !9, i64 8}
    !4 = !{!"queue", !5, i64 0, !8, i64 4, !9, i64 8, !6, i64 10, !6, i64 11}
    !5 = !{!"any pointer", !6, i64 0}
    !6 = !{!"omnipotent char", !7, i64 0}
    !7 = !{!"Simple C/C++ TBAA"}
    !8 = !{!"int", !6, i64 0}
    !9 = !{!"short", !6, i64 0}
    !10 = !{!4, !8, i64 4}
    !11 = !{!4, !5, i64 0}
    !12 = !{!4, !6, i64 11}
    
    

    See the TBAA metadata !4: this is the type descriptor for queue_t (btw, I added names to structs, e.g. typedef struct queue ...) you may see empty string there). Each element in the description corresponds to struct fields and look at !5, which is the event_t *queue field: it's "any pointer"! At this point we've lost all the information about the actual type of the pointer, which tells me the compiler would assume writes through this pointer can modify any memory object.

    That said, there's a new form for TBAA metadata, which is more precise (still has deficiencies, but for that later ...)

    Compile the original program with -Xclang -new-struct-path-tbaa. My exact command line was (and I've included stddef.h instead of stdlib.h since that development build with no libc):

    ./bin/clang -I lib/clang/10.0.0/include/ -target armv7m-eabi -O2 -Xclang -new-struct-path-tbaa  -S queue.c
    

    The resulting assembly is (again, some fluff snipped):

    queue_enqueue:
        push    {r4, r5, r6, r7, lr}
        add r7, sp, #12
        str r11, [sp, #-4]!
        ldrh    r3, [r0, #8]
        ldr.w   r12, [r0, #4]
        cmp r12, r3
        bne .LBB0_2
        movs    r0, #1
        ldr r11, [sp], #4
        pop {r4, r5, r6, r7, pc}
    .LBB0_2:
        ldrb    r2, [r0, #11]                   ; load `wr_idx`
        ldr.w   lr, [r0]                        ; load `queue` member
        ldrd    r6, r1, [r1]                    ; load data pointed to by `event_ptr`
        add.w   r5, lr, r2, lsl #3              ; compute address to store the event
        str r1, [r5, #4]                        ; store `param`
        adds    r1, r3, #1                      ; increment `num_of_items`
        adds    r4, r2, #1                      ; increment `wr_idx`
        str.w   r6, [lr, r2, lsl #3]            ; store `event_type`
        strh    r1, [r0, #8]                    ; store new value for `num_of_items`
        sub.w   r1, r12, #1                     ; compute size mask
        ands    r1, r4                          ; bitwise and size mask with `wr_idx`
        strb    r1, [r0, #11]                   ; store new value for `wr_idx`
        movs    r0, #0
        ldr r11, [sp], #4
        pop {r4, r5, r6, r7, pc}
    

    Looks good, isn't it! :D

    I mentioned earlier there are deficiencies with the "new struct path", but for that: on the mailing list.

    PS. I'm afraid there's no general lesson to be learned in this case. In principle, the more information one is able to give the compiler - the better: things like restrict, strong typing (not gratuitous casts, type punning, etc), relevant function and variable attributes ... but not in this case, the original program already contained all the necessary information. It is just a compiler deficiency and the best way to tackle those is to raise awareness: ask on the mailing list and/or file bug reports.