I am studying AVX-512. I have a question about VORPS.
The documentation says:
EVEX.512.0F.W0 56 /r VORPS zmm1 {k1}{z}, zmm2, zmm3/m512/m32bcst
Return the bitwise logical OR of packed single-precision floating-point values in zmm2 and zmm3/m512/m32bcst subject to writemask k1.
EVEX encoded versions: The first source operand is a ZMM/YMM/XMM register. The second source operand can be a ZMM/YMM/XMM register, a 512/256/128-bit memory location, or a 512/256/128-bit vector broadcasted from a 32-bit memory location. The destination operand is a ZMM/YMM/XMM register conditionally updated with writemask k1.
Ref: https://www.felixcloutier.com/x86/orps
What does "subject to writemask k1" mean?
Can anyone give a concrete example of how k1 contributes to this instruction?
I wrote this code to experiment with VORPS: https://godbolt.org/z/fMcqoa
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
int main()
{
register uint8_t *st_data asm("rbx");
asm volatile(
// Fix stack alignment
"andq $~0x3f, %%rsp\n\t"
// Allocate stack
"subq $0x100, %%rsp\n\t"
// Take stack pointer, save it to st_data
"movq %%rsp, %[st_data]\n\t"
// Fill 64 bytes top of stack with 0x01
"movq %%rsp, %%rdi\n\t"
"movl $0x40, %%ecx\n\t"
"movl $0x1, %%eax\n\t"
"rep stosb\n\t"
// Fill 64 bytes next with 0x02
"incl %%eax\n\t"
"leaq 0x40(%%rsp), %%rdi\n\t"
"movl $0x40, %%ecx\n\t"
"rep stosb\n\t"
// Take 0x1 and 0x2 to ZMM register
"vmovdqa64 (%%rsp), %%zmm0\n\t"
"vmovdqa64 0x40(%%rsp), %%zmm1\n\t"
// Set write mask
"movq $0x123456, %%rax\n\t"
"kmovq %%rax, %%k0\n\t"
"kmovq %%rax, %%k1\n\t"
"kmovq %%rax, %%k2\n\t"
// Execute vorps, store the result to ZMM2
"vorps %%zmm0, %%zmm1, %%zmm2\n\t"
// Plug back the result to memory
"vmovdqa64 %%zmm2, 0x80(%%rsp)\n\t"
"vzeroupper"
: [st_data]"=r"(st_data)
:
: "rax", "rcx", "rdi", "zmm0", "zmm1",
"zmm2", "memory", "cc"
);
static const char *x[] = {
"Data 1:", "Data 2:", "Result:"
};
for (size_t i = 0; i < 3; i++) {
printf("%s\n", x[i]);
for (size_t j = 0; j < 8; j++) {
for (size_t k = 0; k < 8; k ++) {
printf("%02x ", *st_data++);
}
printf("\n");
}
printf("\n");
}
fflush(stdout);
asm volatile(
// sys_exit
"movl $0x3c, %eax\n\t"
"xorl %edi, %edi\n\t"
"syscall"
);
}
I tried changing the values of k0, k1, and k2, but the result is always the same.
Result:
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
"Subject to writemask k1" means that each bit of k1 controls one 32-bit element of the destination: if the bit is 1, that element receives the OR result; if it is 0, the element is either left unchanged (merge-masking) or zeroed (zero-masking, selected by the {z} suffix).
The reason the mask registers did not affect my result is that I never attached a mask register to the destination operand of vorps. With no mask encoded, the instruction behaves as unmasked (and k0 can never be used as a writemask), so changing k0, k1, or k2 made no difference.
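To make that concrete, here is a scalar reference model of my own (not from the Intel documentation) of what a masked 512-bit vorps computes, one mask bit per 32-bit lane:
#include <stdint.h>
#include <stdbool.h>

/* VORPS is a bitwise OR of the 32-bit lane patterns, so uint32_t lanes are
   enough to model it; no floating-point math is involved. */
void vorps_zmm_masked_model(uint32_t dst[16], const uint32_t src1[16],
                            const uint32_t src2[16], uint16_t k, bool zero_mask)
{
    for (int i = 0; i < 16; i++) {
        if (k & (1u << i))
            dst[i] = src1[i] | src2[i];  /* mask bit 1: element gets the result */
        else if (zero_mask)
            dst[i] = 0;                  /* {z} set: element is zeroed          */
        /* else: merge-masking, dst[i] is left unchanged */
    }
}
With k = 0xff and zero_mask = true this reproduces the "03 ... 00" result shown further down; with zero_mask = false it reproduces the "03 ... ff" one.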
In AT&T syntax, the usage is something like:
# Without z-bit (merge-masking)
vorps %zmm0, %zmm1, %zmm2 {%k1}
# With z-bit (zero-masking)
vorps %zmm0, %zmm1, %zmm2 {%k1}{z}
In GCC inline asm, the { and } have to be escaped like this:
# Without z-bit
vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}
# With z-bit
vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}
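To put that escaped syntax into a complete statement, here is a minimal self-contained sketch of my own (not from the original code), assuming the file is compiled with AVX-512 enabled (e.g. -march=skylake-avx512) and runs on a CPU that supports it:
#include <stdint.h>

/* OR sixteen dwords of a and b into dst, but only the elements whose mask
   bit is 1 are overwritten; the other elements of dst are left as they were
   (merge-masking). */
static void masked_or_merge(uint32_t dst[16], const uint32_t a[16],
                            const uint32_t b[16], unsigned int mask)
{
    asm("vmovdqu32 %[dst], %%zmm2\n\t"      /* old dst -> zmm2 (merge source) */
        "vmovdqu32 %[a], %%zmm0\n\t"
        "vmovdqu32 %[b], %%zmm1\n\t"
        "kmovw     %[mask], %%k1\n\t"       /* low 16 bits -> k1 */
        /* merge-masking: only the dwords whose mask bit is 1 are replaced */
        "vorps     %%zmm0, %%zmm1, %%zmm2 %{%%k1%}\n\t"
        "vmovdqu32 %%zmm2, %[dst]\n\t"
        : [dst]  "+m" (*(uint32_t (*)[16])dst)
        : [a]    "m" (*(const uint32_t (*)[16])a),
          [b]    "m" (*(const uint32_t (*)[16])b),
          [mask] "r" (mask)
        : "zmm0", "zmm1", "zmm2", "k1");
}
Calling masked_or_merge(dst, a, b, 0x00ff) overwrites only the low 8 dwords of dst with a[i] | b[i] and leaves the upper 8 dwords untouched.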
When a mask register is attached, the z-bit can additionally be used to zero the masked-out elements of the destination instead of leaving them unchanged.
For example, suppose that before the vorps operation the value of zmm2 is:
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
and the values of zmm0 and zmm1 are the same as in the question above.
After these instructions:
// Set write mask
"movq $0b11111111, %%rax\n\t"
"kmovq %%rax, %%k1\n\t"
// Execute vorps, store the result to ZMM2
"vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"
// Plug back the result to memory
"vmovdqa64 %%zmm2, 0x80(%[buf])\n\t"
Then, because the mask 0b11111111 selects only the low 8 of the 16 dword elements, the result (with the z-bit, so the unselected upper elements are zeroed) will be:
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
Without the z-bit (merge-masking), the unselected elements keep the old zmm2 value instead, so the result will be:

03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
03 03 03 03 03 03 03 03
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
ff ff ff ff ff ff ff ff
Godbolt link: https://godbolt.org/z/4rq5M8
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <stdalign.h>
int main()
{
alignas(64) uint8_t buf[0x100];
uint8_t *st_data = buf;
asm(
// Fill ZMM2 with 0xff garbage.
"vpternlogd $0xff, %%zmm2, %%zmm2, %%zmm2\n\t"
// Fill ZMM0 with 0x01
"movl $0x01010101, %%eax\n\t"
"vpbroadcastd %%eax, %%zmm0\n\t"
// Fill ZMM1 with 0x02
"movl $0x02020202, %%eax\n\t"
"vpbroadcastd %%eax, %%zmm1\n\t"
// Plug ZMM0 and ZMM1 value to memory to print later
"vmovdqa64 %%zmm0, %[buf_0x00]\n\t"
"vmovdqa64 %%zmm1, %[buf_0x40]\n\t"
// Set write mask
"movl $0b11111111, %%eax\n\t"
"kmovq %%rax, %%k1\n\t"
// vorps without z-bit (merge into ZMM2)
"vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}\n\t"
// // vorps with z-bit (zero-mask, overwrite ZMM2)
// "vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"
// Plug the result to memory
"vmovdqa64 %%zmm2, %[buf_0x80]\n\t"
#ifndef __AVX__
/*
* Note:
* If we compile with -mavx, -mavx2 or -mavx512* and clobber
* AVX register(s) in inline assembly, the compiler will emit
* "vzeroupper" itself after the inline assembly.
*
* So we only add vzeroupper ourselves when no AVX flag is
* passed, to avoid a duplicate vzeroupper.
*/
"vzeroupper"
#endif
: [buf_0x00]"=m"(*(uint8_t (*)[0x40])(buf + 0x00)),
[buf_0x40]"=m"(*(uint8_t (*)[0x40])(buf + 0x40)),
[buf_0x80]"=m"(*(uint8_t (*)[0x40])(buf + 0x80))
/*
* Yes, it is all `*(uint8_t (*)[0x40])`, meaning we
* are going to write 0x40 bytes for each constraint.
*/
:
: "rax", "zmm0", "zmm1", "zmm2", "k1"
);
static const char *x[] = {
"Data 1:", "Data 2:", "Result:"
};
for (size_t i = 0; i < 3; i++) {
printf("%s\n", x[i]);
for (size_t j = 0; j < 8; j++) {
for (size_t k = 0; k < 8; k ++) {
printf("%02x ", *st_data++);
}
printf("\n");
}
printf("\n");
}
return 0;
}
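For completeness, the same merge- versus zero-masking behavior can also be reproduced with intrinsics instead of inline asm. The sketch below is my own addition (not part of the Godbolt links above); _mm512_mask_or_ps and _mm512_maskz_or_ps map to the masked forms of vorps and require AVX512DQ, so compile with e.g. -march=skylake-avx512:
#include <stdio.h>
#include <stdint.h>
#include <stdalign.h>
#include <immintrin.h>

static void dump(const char *name, __m512 v)
{
    alignas(64) uint8_t out[64];
    _mm512_store_ps((float *)out, v);
    printf("%s\n", name);
    for (int i = 0; i < 64; i++)
        printf("%02x%c", out[i], (i % 8 == 7) ? '\n' : ' ');
    printf("\n");
}

int main(void)
{
    __m512 a   = _mm512_castsi512_ps(_mm512_set1_epi32(0x01010101));
    __m512 b   = _mm512_castsi512_ps(_mm512_set1_epi32(0x02020202));
    __m512 old = _mm512_castsi512_ps(_mm512_set1_epi32(-1)); /* all 0xff */
    __mmask16 k = 0xff;  /* 0b11111111: write only the low 8 of 16 dwords */

    /* vorps %zmm0, %zmm1, %zmm2 {%k1}    -> masked-out dwords keep `old` */
    dump("Merge-masking:", _mm512_mask_or_ps(old, k, a, b));
    /* vorps %zmm0, %zmm1, %zmm2 {%k1}{z} -> masked-out dwords become 0   */
    dump("Zero-masking:", _mm512_maskz_or_ps(k, a, b));
    return 0;
}
The merge-masked dump prints 03 for the first 32 bytes and ff for the rest; the zero-masked dump prints 03 then 00, matching the inline-asm results above.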