assembly x86-64 inline-assembly att avx512

How is writemask k1 used in AVX-512 VORPS?

I am studying AVX-512. I have a question about VORPS.

The documentation says:

EVEX.512.0F.W0 56 /r VORPS zmm1 {k1}{z}, zmm2, zmm3/m512/m32bcst

Return the bitwise logical OR of packed single-precision floating-point values in zmm2 and zmm3/m512/m32bcst subject to writemask k1.

EVEX encoded versions: The first source operand is a ZMM/YMM/XMM register. The second source operand can be a ZMM/YMM/XMM register, a 512/256/128-bit memory location, or a 512/256/128-bit vector broadcasted from a 32-bit memory location. The destination operand is a ZMM/YMM/XMM register conditionally updated with writemask k1.

Ref: https://www.felixcloutier.com/x86/orps

What does "subject to writemask k1" mean?

Can anyone give a concrete example of how k1 affects the result of this instruction?

I wrote this code to experiment with VORPS: https://godbolt.org/z/fMcqoa

Code

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Experiment: OR two 64-byte vectors with an unmasked vorps and dump the
 * two inputs and the result as hex.
 *
 * All three 64-byte areas live in stack space carved out by the first asm
 * statement itself; st_data (pinned to rbx) receives the base address.
 *
 * NOTE(review): the first asm statement changes %rsp and never restores it,
 * and the program leaves via a raw exit syscall rather than returning from
 * main -- presumably because of that; confirm before reusing this pattern.
 */
int main()
{
  // Pin st_data to rbx so the asm below can hand the buffer address back.
  register uint8_t *st_data asm("rbx");
  asm volatile(
    // Fix stack alignment
    // (round %rsp down to a multiple of 64 for the aligned vmovdqa64 accesses)
    "andq   $~0x3f, %%rsp\n\t"

    // Allocate stack
    // (0x100 bytes; 0x00, 0x40 and 0x80 are used as three 64-byte areas)
    "subq   $0x100, %%rsp\n\t"

    // Take stack pointer, save it to st_data
    "movq   %%rsp, %[st_data]\n\t"

    // Fill 64 bytes top of stack with 0x01
    // (rep stosb stores %al to (%rdi), %ecx times)
    "movq   %%rsp, %%rdi\n\t"
    "movl   $0x40, %%ecx\n\t"
    "movl   $0x1, %%eax\n\t"
    "rep    stosb\n\t"

    // Fill 64 bytes next with 0x02
    // (%eax goes from 1 to 2; %rdi and %ecx are reloaded for the second area)
    "incl   %%eax\n\t"
    "leaq   0x40(%%rsp), %%rdi\n\t"
    "movl   $0x40, %%ecx\n\t"
    "rep    stosb\n\t"

    // Take 0x1 and 0x2 to ZMM register
    "vmovdqa64  (%%rsp), %%zmm0\n\t"
    "vmovdqa64  0x40(%%rsp), %%zmm1\n\t"

    // Set write mask
    // (the same value is loaded into k0, k1 and k2)
    "movq   $0x123456, %%rax\n\t"
    "kmovq  %%rax, %%k0\n\t"
    "kmovq  %%rax, %%k1\n\t"
    "kmovq  %%rax, %%k2\n\t"

    // Execute vorps, store the result to ZMM2
    // (no {%k} operand is written here, so none of the mask registers set
    //  above is referenced by this instruction)
    "vorps  %%zmm0, %%zmm1, %%zmm2\n\t"

    // Plug back the result to memory
    "vmovdqa64  %%zmm2, 0x80(%%rsp)\n\t"
    "vzeroupper"
    : [st_data]"=r"(st_data)
    :
    // NOTE(review): k0, k1 and k2 are written above but are not listed here.
    : "rax", "rcx", "rdi", "zmm0", "zmm1",
      "zmm2", "memory", "cc"
  );

  // Section titles, in the order the areas sit in the buffer.
  static const char *x[] = {
    "Data 1:", "Data 2:", "Result:"
  };

  // Dump 3 areas x 8 rows x 8 bytes = 192 bytes, advancing st_data as we go.
  for (size_t i = 0; i < 3; i++) {
    printf("%s\n", x[i]);
    for (size_t j = 0; j < 8; j++) {
      for (size_t k = 0; k < 8; k ++) {
        printf("%02x ", *st_data++);
      }
      printf("\n");
    }
    printf("\n");
  }

  // The raw syscall below bypasses the normal exit path, so flush stdout
  // explicitly first.
  fflush(stdout);

  // Basic asm (no operand lists), hence the single '%' on register names.
  // NOTE(review): eax and edi are overwritten without the compiler being told.
  asm volatile(
    // sys_exit
    "movl   $0x3c, %eax\n\t"
    "xorl   %edi, %edi\n\t"
    "syscall"
  );
}

I tried changing the values of k0, k1, and k2, but the result is always the same.

Result:
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03

Solution

The mask register did not affect the result because I did not encode it in the destination operand of vorps.

In AT&T syntax, the usage is something like:

# Without z-bit (merge-masking)
vorps %zmm0, %zmm1, %zmm2 {%k1}

# With z-bit (zero-masking)
vorps %zmm0, %zmm1, %zmm2 {%k1}{z}

In GCC extended inline asm, the braces have to be escaped as %{ and %}, like this:

# Without z-bit
vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}

# With z-bit
vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}

With the z-bit set, destination elements whose mask bit is 0 are cleared to zero; without it, those elements keep their previous value.

With z-bit

For example, if before vorps operation the value of zmm2 is:

ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff

and the values of zmm0 and zmm1 are the same as in the question above.

After these instructions:

    // Set write mask
    "movq   $0b11111111, %%rax\n\t"
    "kmovq  %%rax, %%k1\n\t"

    // Execute vorps, store the result to ZMM2
    "vorps  %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"

    // Plug back the result to memory
    "vmovdqa64  %%zmm2, 0x80(%[buf])\n\t"

Then the result will be:

03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
00 00 00 00 00 00 00 00 
00 00 00 00 00 00 00 00 
00 00 00 00 00 00 00 00 
00 00 00 00 00 00 00 00

Without the z-bit, the result will be:

03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff

Code example

Godbolt link: https://godbolt.org/z/4rq5M8

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <stdalign.h>

/*
 * Demonstrates AVX-512 write-masking with vorps.
 *
 * ZMM0 is filled with 0x01 bytes, ZMM1 with 0x02 bytes, and ZMM2 is
 * pre-filled with 0xff bytes so that destination elements left untouched by
 * the masked vorps are visible. The two inputs and the result are stored to
 * a 64-byte-aligned buffer and printed as three 8x8 hex dumps.
 *
 * Returns 0.
 */
int main(void)
{
  /* vmovdqa64 is an aligned store, so each 0x40-byte slot must be 64-byte aligned. */
  alignas(64) uint8_t buf[0x100];

  /*
   * __asm__ instead of asm: this file needs C11 (alignas), and the plain
   * `asm` keyword is a GNU extension that is disabled under strict
   * -std=c11, whereas __asm__ is always available.
   */
  __asm__(
    // Fill ZMM2 with 0xff garbage.
    "vpternlogd $0xff, %%zmm2, %%zmm2, %%zmm2\n\t"

    // Fill ZMM0 with 0x01
    "movl   $0x01010101, %%eax\n\t"
    "vpbroadcastd %%eax, %%zmm0\n\t"

    // Fill ZMM1 with 0x02
    "movl   $0x02020202, %%eax\n\t"
    "vpbroadcastd %%eax, %%zmm1\n\t"

    // Plug ZMM0 and ZMM1 value to memory to print later
    "vmovdqa64  %%zmm0, %[buf_0x00]\n\t"
    "vmovdqa64  %%zmm1, %[buf_0x40]\n\t"

    // Set write mask: bits 0-7 set, so only the low 8 of the 16 dword
    // elements (the first 32 bytes) are written by vorps.
    // movl zero-extends into RAX, so the upper mask bits are 0.
    "movl   $0b11111111, %%eax\n\t"
    "kmovq  %%rax, %%k1\n\t"

    // vorps without z-bit (merge into ZMM2): masked-off elements keep 0xff
    "vorps  %%zmm0, %%zmm1, %%zmm2 %{%%k1%}\n\t"

    // // vorps with z-bit (zero-mask, overwrite ZMM2)
    // "vorps   %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"

    // Plug the result to memory
    "vmovdqa64  %%zmm2, %[buf_0x80]\n\t"

#ifndef __AVX__
    /*
     * Note:
     * If we pass -mavx or -mavx2 or -mavx512* and then we clobber
     * AVX register(s) with inline assembly, then the compiler will
     * yield "vzeroupper" after the inline assembly.
     *
     * So we should only put vzeroupper when there is no AVX flag
     * to prevent duplicate vzeroupper.
     */
    "vzeroupper"
#endif

    : [buf_0x00]"=m"(*(uint8_t (*)[0x40])(buf + 0x00)),
      [buf_0x40]"=m"(*(uint8_t (*)[0x40])(buf + 0x40)),
      [buf_0x80]"=m"(*(uint8_t (*)[0x40])(buf + 0x80))
      /*
       * Yes, it is all `*(uint8_t (*)[0x40])`, meaning we
       * are going to write 0x40 bytes for each constraint.
       */
    :
    : "rax", "zmm0", "zmm1", "zmm2", "k1"
  );

  /* Section titles, in the order the slots sit in buf. */
  static const char *const titles[] = {
    "Data 1:", "Data 2:", "Result:"
  };

  /* Walk the first 3 * 0x40 bytes of buf: 8 rows of 8 bytes per section. */
  const uint8_t *cursor = buf;

  for (size_t section = 0; section < 3; section++) {
    printf("%s\n", titles[section]);
    for (size_t row = 0; row < 8; row++) {
      for (size_t col = 0; col < 8; col++) {
        printf("%02x ", *cursor++);
      }
      printf("\n");
    }
    printf("\n");
  }
  return 0;
}