Search code examples

The usage of writemask k1 in AVX-512 VORPS?

I am studying AVX-512. I have a question about VORPS.

The documentation says:

EVEX.512.0F.W0 56 /r VORPS zmm1 {k1}{z}, zmm2, zmm3/m512/m32bcst

Return the bitwise logical OR of packed single-precision floating-point values in zmm2 and zmm3/m512/m32bcst subject to writemask k1.

EVEX encoded versions: The first source operand is a ZMM/YMM/XMM register. The second source operand can be a ZMM/YMM/XMM register, a 512/256/128-bit memory location, or a 512/256/128-bit vector broadcasted from a 32-bit memory location. The destination operand is a ZMM/YMM/XMM register conditionally updated with writemask k1.


What does "subject to writemask k1" mean?

Can anyone give a concrete example of how k1 contributes to this instruction?

I wrote this code to experiment with VORPS:


#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Experiment: OR two 64-byte vectors with VORPS and dump both operands and
 * the result as rows of hex bytes.
 *
 * The VORPS below deliberately names no mask register in its destination
 * operand, so the values loaded into k0/k1/k2 do not influence the result.
 *
 * NOTE: the first asm statement realigns and lowers %rsp and never restores
 * it, so this function must not return normally; it ends with a raw exit
 * system call instead. Linux x86-64 with AVX-512 support is required.
 */
int main(void)
{
  /*
   * Pinned to rbx by the original author -- presumably because rbx is
   * call-preserved and therefore survives the printf() calls below.
   */
  register uint8_t *st_data __asm__("rbx");

  __asm__ volatile(
    // Fix stack alignment
    "andq   $~0x3f, %%rsp\n\t"

    // Allocate stack
    "subq   $0x100, %%rsp\n\t"

    // Take stack pointer, save it to st_data
    "movq   %%rsp, %[st_data]\n\t"

    // Fill 64 bytes top of stack with 0x01
    "movq   %%rsp, %%rdi\n\t"
    "movl   $0x40, %%ecx\n\t"
    "movl   $0x1, %%eax\n\t"
    "rep    stosb\n\t"

    // Fill 64 bytes next with 0x02
    "incl   %%eax\n\t"
    "leaq   0x40(%%rsp), %%rdi\n\t"
    "movl   $0x40, %%ecx\n\t"
    "rep    stosb\n\t"

    // Take 0x1 and 0x2 to ZMM register
    "vmovdqa64  (%%rsp), %%zmm0\n\t"
    "vmovdqa64  0x40(%%rsp), %%zmm1\n\t"

    // Set write mask
    "movq   $0x123456, %%rax\n\t"
    "kmovq  %%rax, %%k0\n\t"
    "kmovq  %%rax, %%k1\n\t"
    "kmovq  %%rax, %%k2\n\t"

    // Execute vorps, store the result to ZMM2 (no {%k} on the destination,
    // hence no masking is applied)
    "vorps  %%zmm0, %%zmm1, %%zmm2\n\t"

    // Plug back the result to memory
    "vmovdqa64  %%zmm2, 0x80(%%rsp)\n\t"
    : [st_data]"=r"(st_data)
    : /* no inputs */
    : "rax", "rcx", "rdi", "zmm0", "zmm1",
      "zmm2", "k0", "k1", "k2", "memory", "cc"
  );

  static const char *x[] = {
    "Data 1:", "Data 2:", "Result:"
  };

  /* Three 64-byte regions, each printed as 8 rows of 8 bytes. */
  for (size_t i = 0; i < 3; i++) {
    printf("%s\n", x[i]);
    for (size_t j = 0; j < 8; j++) {
      for (size_t k = 0; k < 8; k ++) {
        printf("%02x ", *st_data++);
      }
      printf("\n");
    }
    printf("\n");
  }

  /* The raw exit below bypasses stdio teardown, so flush explicitly. */
  fflush(stdout);

  __asm__ volatile(
    // sys_exit
    "movl   $0x3c, %eax\n\t"
    "xorl   %edi, %edi\n\t"
    "syscall"
  );
}

I tried changing the values of k0, k1, and k2, but the result is always the same:

03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03


  • The mask register did not affect the result because I did not encode it in the destination operand of vorps.

    In AT&T syntax, the usage is something like:

    # Without z-bit (merge-masking)
    vorps %zmm0, %zmm1, %zmm2 {%k1}
    # With z-bit (zero-masking)
    vorps %zmm0, %zmm1, %zmm2 {%k1}{z}

    In GCC inline asm, the {} have to be escaped like this:

    # Without z-bit
    vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}
    # With z-bit
    vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}

    The z-bit decides what happens to destination elements whose mask bit is 0: with the z-bit they are cleared to zero; without it they keep their previous value.

    With z-bit

    For example, if before vorps operation the value of zmm2 is:

    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 

    and zmm0 and zmm1 hold the same values as in the question.

    After these instructions:

        // Set write mask
        "movq   $0b11111111, %%rax\n\t"
        "kmovq  %%rax, %%k1\n\t"
        // Execute vorps, store the result to ZMM2
        "vorps  %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"
        // Plug back the result to memory
        "vmovdqa64  %%zmm2, 0x80(%[buf])\n\t"

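    Then the result will be (the 8 set mask bits select the first 8 of the 16 single-precision elements, i.e. the first 32 bytes):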

    03 03 03 03 03 03 03 03 
    03 03 03 03 03 03 03 03 
    03 03 03 03 03 03 03 03 
    03 03 03 03 03 03 03 03 
    00 00 00 00 00 00 00 00 
    00 00 00 00 00 00 00 00 
    00 00 00 00 00 00 00 00 
    00 00 00 00 00 00 00 00 

    Without the z-bit, the result will be:

    03 03 03 03 03 03 03 03 
    03 03 03 03 03 03 03 03 
    03 03 03 03 03 03 03 03 
    03 03 03 03 03 03 03 03 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 
    ff ff ff ff ff ff ff ff 

    Code example

    Godbolt link:

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdalign.h>
    /*
     * Demonstrates write-masking on VORPS: zmm2 is pre-filled with 0xff,
     * then zmm0 | zmm1 is written into it under mask k1 = 0b11111111.
     * The two operands and the result are stored to buf and printed as
     * rows of hex bytes. Needs a CPU with AVX-512 support to run.
     */
    int main(void)
    {
      alignas(64) uint8_t buf[0x100];
      uint8_t *st_data = buf;

      __asm__ volatile(
        // Fill ZMM2 with 0xff garbage.
        "vpternlogd $0xff, %%zmm2, %%zmm2, %%zmm2\n\t"
        // Fill ZMM0 with 0x01
        "movl   $0x01010101, %%eax\n\t"
        "vpbroadcastd %%eax, %%zmm0\n\t"
        // Fill ZMM1 with 0x02
        "movl   $0x02020202, %%eax\n\t"
        "vpbroadcastd %%eax, %%zmm1\n\t"
        // Plug ZMM0 and ZMM1 value to memory to print later
        "vmovdqa64  %%zmm0, %[buf_0x00]\n\t"
        "vmovdqa64  %%zmm1, %[buf_0x40]\n\t"
        // Set write mask
        "movl   $0b11111111, %%eax\n\t"
        "kmovq  %%rax, %%k1\n\t"
        // vorps without z-bit (merge into ZMM2)
        "vorps  %%zmm0, %%zmm1, %%zmm2 %{%%k1%}\n\t"
        // // vorps with z-bit (zero-mask, overwrite ZMM2)
        // "vorps   %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"
        // Plug the result to memory
        "vmovdqa64  %%zmm2, %[buf_0x80]\n\t"
    #ifndef __AVX__
        /*
         * Note:
         * If we pass -mavx or -mavx2 or -mavx512* and then we clobber
         * AVX register(s) with inline assembly, then the compiler will
         * yield "vzeroupper" after the inline assembly.
         * So we should only put vzeroupper when there is no AVX flag
         * to prevent duplicate vzeroupper.
         */
        "vzeroupper"
    #endif
        : [buf_0x00]"=m"(*(uint8_t (*)[0x40])(buf + 0x00)),
          [buf_0x40]"=m"(*(uint8_t (*)[0x40])(buf + 0x40)),
          [buf_0x80]"=m"(*(uint8_t (*)[0x40])(buf + 0x80))
          /*
           * Yes, it is all `*(uint8_t (*)[0x40])`, meaning we
           * are going to write 0x40 bytes for each constraint.
           */
        : /* no inputs */
        : "rax", "zmm0", "zmm1", "zmm2", "k1"
      );

      static const char *x[] = {
        "Data 1:", "Data 2:", "Result:"
      };

      /* Three 64-byte regions, each printed as 8 rows of 8 bytes. */
      for (size_t i = 0; i < 3; i++) {
        printf("%s\n", x[i]);
        for (size_t j = 0; j < 8; j++) {
          for (size_t k = 0; k < 8; k ++) {
            printf("%02x ", *st_data++);
          }
          printf("\n");
        }
        printf("\n");
      }
      return 0;
    }