I want to multiply two NxN matrices using SIMD. I want to do matrix multiplication for 64-bit integers, and multiply one element of a matrix with another element with the same index. For example:
c[1][1] = a[1][1] * b[1][1]
An error occurs when multiplying with the _mm256_mullo_epi64
operation. I can't figure out why this is happening. Can I write the resulting value into a 256-bit register?
#include <iostream>
#include <immintrin.h>
using namespace std;
int avx_mult(__int64** A, __int64** B, __int64** C, int N) {
cout << "AVX mult:" << endl;
if (N < 4 || N % 4 != 0) return 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j += 4) {
// filling the resulting AMX register with zeros
__m256i c_line = _mm256_setzero_si256();
// load 4 long long elements from array A into AMX register
__m256i a_line = _mm256_loadu_si256((__m256i*) & A[i][j]);
// load 4 long long elements from array B into AMX register
__m256i b_line = _mm256_loadu_si256((__m256i*) & B[i][j]);
// multiplying two AVX registers
c_line = _mm256_mullo_epi64(a_line, b_line);
}
}
}
int main() {
const unsigned int N = 4; // array size
__int64** A = new __int64* [N];
__int64** B = new __int64* [N];
__int64** C = new __int64* [N];
for (int i = 0; i < N; i++) {
A[i] = new __int64[N];
B[i] = new __int64[N];
C[i] = new __int64[N];
}
for (int i = 0; i < N; i++) { // filling arrays
for (int j = 0; j < N; j++) {
A[i][j] = __int64(rand() % 10);
B[i][j] = __int64(rand() % 10);
C[i][j] = __int64(0);
}
}
avx_mult(A, B, C, N);
for (int i = 0; i < N; i++) {
delete[] A[i];
delete[] B[i];
delete[] C[i];
}
delete[] A;
delete[] B;
delete[] C;
}
The code compiles, but the program stops on this line:
c_line = _mm256_mullo_epi64(a_line, b_line);
... with exit code 0xC000001D
: Illegal Instruction exception.
The Intel Intrinsics Documentation for _mm256_mullo_epi64
says:
Synopsis
__m256i _mm256_mullo_epi64 (__m256i a, __m256i b) #include <immintrin.h> Instruction: vpmullq ymm, ymm, ymm CPUID Flags: AVX512DQ + AVX512VL
Description
Multiply the packed 64-bit integers in
a
andb
, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers indst
.
My function arguments fit the description. Or is there some mistake?
Not every x86_64 processor supports every instruction. Namely, _mm256_mullo_epi64
requires AVX-512 extensions, and if the rest of your code works, but running this intrinsic result in executing an illegal instruction, then you're most likely running this code on a processor without AVX-512.
You can implement packed 64-bit multiplication with just AVX2 as well:
__m256i mul64_haswell (__m256i a, __m256i b) {
__m256i bswap = _mm256_shuffle_epi32(b,0xB1);
__m256i prodlh = _mm256_mullo_epi32(a,bswap);
__m256i prodlh2 = _mm256_srli_epi64(prodlh, 32);
__m256i prodlh3 = _mm256_add_epi32(prodlh2, prodlh);
__m256i prodlh4 = _mm256_and_si256(prodlh3, _mm256_set1_epi64x(0x00000000FFFFFFFF));
__m256i prodll = _mm256_mul_epu32(a,b);
__m256i prod = _mm256_add_epi64(prodll,prodlh4);
return prod;
}
This code is taken from @PeterCordes' answer to Fastest way to multiply an array of int64_t?