I am working on implementing the Fused Attention fprop graph pattern. At the moment I am only combining the two matrix multiplications, so g3 and g4 are empty. I believe I have matched all the requirements for this graph, yet none of the engine configurations returned by the engine heuristic works when passed to an execution plan: finalizing the execution plan with any of them returns CUDNN_STATUS_NOT_SUPPORTED.
I have pasted the implementation I am using below, along with the shapes and strides of all tensors involved.
Why is CUDNN_STATUS_NOT_SUPPORTED returned for every engine configuration produced by the heuristic? What changes do I have to make so that the execution plan finalizes with CUDNN_STATUS_SUCCESS for at least one engine configuration?
qshape: 4: 1 1 10 64
qstride: 4: 640 640 64 1
kshape: 4: 1 1 64 10
kstride: 4: 640 640 1 64
sshape: 4: 1 1 10 10
sstride: 4: 100 100 10 1
vshape: 4: 1 1 10 64
vstride: 4: 640 640 64 1
oshape: 4: 1 1 10 64
ostride: 4: 640 640 64 1
#include <cudnn.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#define CUDNN_CHECK(expr)                                               \
  {                                                                     \
    cudnnStatus_t _status = (expr); /* evaluate the call exactly once */\
    if (_status != CUDNN_STATUS_SUCCESS) {                              \
      fprintf(stderr, "cuDNN error: %s:%d: %s\n", __FILE__, __LINE__,   \
              cudnnGetErrorString(_status));                            \
      std::exit(EXIT_FAILURE);                                          \
    }                                                                   \
  }
void print_vector(const std::vector<int64_t> &v, std::string name) {
std::cout << name << ": " << v.size() << ": ";
for (int64_t i : v) {
std::cout << i << " ";
}
std::cout << std::endl;
}
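// Packed, fully contiguous strides for a 4-D row-major shape.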
std::vector<int64_t> standard_4d_strides(const std::vector<int64_t> &shape) {
return {shape[1] * shape[2] * shape[3], shape[2] * shape[3], shape[3], 1};
}
cudnnBackendDescriptor_t
tensor_descriptor(const std::vector<int64_t> &shape,
const std::vector<int64_t> &strides, int64_t id,
cudnnDataType_t data_type, int64_t byte_alignment,
bool is_virtual, bool reordering_fp16x16 = false) {
cudnnBackendDescriptor_t desc;
CUDNN_CHECK(
cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &desc));
CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
CUDNN_TYPE_INT64, 1, &id));
CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_DATA_TYPE,
CUDNN_TYPE_DATA_TYPE, 1, &data_type));
CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
CUDNN_TYPE_INT64, 1, &byte_alignment));
CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
CUDNN_TYPE_INT64,
(int64_t)shape.size(), shape.data()));
CUDNN_CHECK(cudnnBackendSetAttribute(
desc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64,
(int64_t)strides.size(), strides.data()));
CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_IS_VIRTUAL,
CUDNN_TYPE_BOOLEAN, 1, &is_virtual));
if (reordering_fp16x16) {
cudnnBackendTensorReordering_t reorder = CUDNN_TENSOR_REORDERING_F16x16;
CUDNN_CHECK(cudnnBackendSetAttribute(
desc, CUDNN_ATTR_TENSOR_REORDERING_MODE,
CUDNN_TYPE_TENSOR_REORDERING_MODE, 1, &reorder));
}
CUDNN_CHECK(cudnnBackendFinalize(desc));
return desc;
}
int main() {
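// Q, V and O are 1 x 1 x 10 x 64 half-precision tensors; K is used as a
// transposed view of the same layout (last two dims and strides swapped below).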
std::vector<int64_t> shape_query = {1, 1, 10, 64};
std::vector<int64_t> strides_query = standard_4d_strides(shape_query);
std::vector<int64_t> shape_key = {1, 1, 10, 64};
std::vector<int64_t> strides_key = standard_4d_strides(shape_key);
std::swap(shape_key[2], shape_key[3]);
std::swap(strides_key[2], strides_key[3]);
std::vector<int64_t> shape_value = {1, 1, 10, 64};
std::vector<int64_t> strides_value = standard_4d_strides(shape_value);
std::vector<int64_t> shape_scores = {shape_query[0], shape_query[1],
shape_query[2], shape_key[3]};
std::vector<int64_t> strides_scores = standard_4d_strides(shape_scores);
std::vector<int64_t> shape_output = {shape_query[0], shape_query[1],
shape_query[2], shape_value[3]};
std::vector<int64_t> strides_output = standard_4d_strides(shape_output);
cudnnHandle_t handle;
CUDNN_CHECK(cudnnCreate(&handle));
cudnnDataType_t comp_type = CUDNN_DATA_FLOAT;
cudnnDataType_t data_type = CUDNN_DATA_HALF;
int64_t data_type_byte_alignment = 2;
cudnnBackendDescriptor_t query_desc =
tensor_descriptor(shape_query, strides_query, 'q', data_type,
data_type_byte_alignment, false);
cudnnBackendDescriptor_t key_desc =
tensor_descriptor(shape_key, strides_key, 'k', data_type,
data_type_byte_alignment, false);
cudnnBackendDescriptor_t value_desc =
tensor_descriptor(shape_value, strides_value, 'v', data_type,
data_type_byte_alignment, false);
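// S (scores) is a virtual intermediate tensor with F16x16 reordering enabled.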
cudnnBackendDescriptor_t scores_desc =
tensor_descriptor(shape_scores, strides_scores, 's', data_type,
data_type_byte_alignment, true, true);
cudnnBackendDescriptor_t output_desc =
tensor_descriptor(shape_output, strides_output, 'o', data_type,
data_type_byte_alignment, false);
print_vector(shape_query, "qshape");
print_vector(strides_query, "qstride");
print_vector(shape_key, "kshape");
print_vector(strides_key, "kstride");
print_vector(shape_scores, "sshape");
print_vector(strides_scores, "sstride");
print_vector(shape_value, "vshape");
print_vector(strides_value, "vstride");
print_vector(shape_output, "oshape");
print_vector(strides_output, "ostride");
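// First matmul node: S = Q x K (10x64 times 64x10 -> 10x10), accumulated in fp32.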
cudnnBackendDescriptor_t matmul_desc;
CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR,
&matmul_desc));
CUDNN_CHECK(cudnnBackendSetAttribute(matmul_desc,
CUDNN_ATTR_MATMUL_COMP_TYPE,
CUDNN_TYPE_DATA_TYPE, 1, &comp_type));
CUDNN_CHECK(cudnnBackendFinalize(matmul_desc));
cudnnBackendDescriptor_t op_matmul;
CUDNN_CHECK(cudnnBackendCreateDescriptor(
CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &op_matmul));
CUDNN_CHECK(cudnnBackendSetAttribute(
op_matmul, CUDNN_ATTR_OPERATION_MATMUL_DESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &matmul_desc));
CUDNN_CHECK(cudnnBackendSetAttribute(
op_matmul, CUDNN_ATTR_OPERATION_MATMUL_ADESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &query_desc));
CUDNN_CHECK(
cudnnBackendSetAttribute(op_matmul, CUDNN_ATTR_OPERATION_MATMUL_BDESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &key_desc));
CUDNN_CHECK(cudnnBackendSetAttribute(
op_matmul, CUDNN_ATTR_OPERATION_MATMUL_CDESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &scores_desc));
CUDNN_CHECK(cudnnBackendFinalize(op_matmul));
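// Second matmul node: O = S x V (10x10 times 10x64 -> 10x64).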
cudnnBackendDescriptor_t out_matmul_desc;
CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR,
&out_matmul_desc));
CUDNN_CHECK(cudnnBackendSetAttribute(out_matmul_desc,
CUDNN_ATTR_MATMUL_COMP_TYPE,
CUDNN_TYPE_DATA_TYPE, 1, &comp_type));
CUDNN_CHECK(cudnnBackendFinalize(out_matmul_desc));
cudnnBackendDescriptor_t op_matmul_output;
CUDNN_CHECK(cudnnBackendCreateDescriptor(
CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &op_matmul_output));
CUDNN_CHECK(cudnnBackendSetAttribute(
op_matmul_output, CUDNN_ATTR_OPERATION_MATMUL_DESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &out_matmul_desc));
CUDNN_CHECK(cudnnBackendSetAttribute(
op_matmul_output, CUDNN_ATTR_OPERATION_MATMUL_ADESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &scores_desc));
CUDNN_CHECK(cudnnBackendSetAttribute(
op_matmul_output, CUDNN_ATTR_OPERATION_MATMUL_BDESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &value_desc));
CUDNN_CHECK(cudnnBackendSetAttribute(
op_matmul_output, CUDNN_ATTR_OPERATION_MATMUL_CDESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &output_desc));
CUDNN_CHECK(cudnnBackendFinalize(op_matmul_output));
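// Operation graph containing both matmul nodes.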
cudnnBackendDescriptor_t op_graph;
cudnnBackendDescriptor_t ops[] = {op_matmul, op_matmul_output};
CUDNN_CHECK(cudnnBackendCreateDescriptor(
CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph));
CUDNN_CHECK(cudnnBackendSetAttribute(
op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
sizeof(ops) / sizeof(ops[0]), ops));
CUDNN_CHECK(cudnnBackendSetAttribute(op_graph,
CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
CUDNN_TYPE_HANDLE, 1, &handle));
CUDNN_CHECK(cudnnBackendFinalize(op_graph));
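// Query the engine heuristics for configs that claim to support this graph.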
cudnnBackendDescriptor_t heur_desc;
CUDNN_CHECK(cudnnBackendCreateDescriptor(
CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, &heur_desc));
CUDNN_CHECK(cudnnBackendSetAttribute(
heur_desc, CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph));
cudnnBackendHeurMode_t heur_mode = CUDNN_HEUR_MODE_FALLBACK;
CUDNN_CHECK(cudnnBackendSetAttribute(heur_desc, CUDNN_ATTR_ENGINEHEUR_MODE,
CUDNN_TYPE_HEUR_MODE, 1, &heur_mode));
CUDNN_CHECK(cudnnBackendFinalize(heur_desc));
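// Retrieve all engine configs and try to finalize an execution plan from each one.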
int64_t count = 0;
CUDNN_CHECK(cudnnBackendGetAttribute(
heur_desc, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
0, &count, NULL));
std::vector<cudnnBackendDescriptor_t> eng_cfgs(count);
for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
CUDNN_CHECK(cudnnBackendCreateDescriptor(
CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &cfg));
}
CUDNN_CHECK(cudnnBackendGetAttribute(
heur_desc, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
count, nullptr, eng_cfgs.data()));
for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
cudnnBackendDescriptor_t exec_plan;
CUDNN_CHECK(cudnnBackendCreateDescriptor(
CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &exec_plan));
CUDNN_CHECK(cudnnBackendSetAttribute(
exec_plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &cfg));
CUDNN_CHECK(cudnnBackendSetAttribute(exec_plan,
CUDNN_ATTR_EXECUTION_PLAN_HANDLE,
CUDNN_TYPE_HANDLE, 1, &handle));
cudnnStatus_t status = cudnnBackendFinalize(exec_plan);
std::cout << cudnnGetErrorString(status) << "\n";
if (status == CUDNN_STATUS_SUCCESS) {
std::cout << "success\n";
}
}
// To be filled in
return 0;
}
This issue has been resolved on the NVIDIA developer forums. The root cause is that the Fused Attention fprop graph pattern is currently only supported on Hopper GPUs. With cuDNN logging enabled, the execution plans were shown to fail finalization with errors indicating that the graph is not supported on the current hardware. To fix this implementation, I believe the operation graph must be split into one graph per matrix multiplication.
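As a rough sketch of that direction (not the exact solution from the forum thread), the helper below builds a single-matmul operation graph; calling it once for S = Q x K and once for O = S x V gives two graphs that ordinary GEMM engines can handle. It assumes scores_desc is recreated as a non-virtual tensor without the F16x16 reordering, so the first graph can write it and the second can read it, and it reuses CUDNN_CHECK and the descriptors from the listing above.

// Sketch: one operation graph per matmul, so each graph contains a single
// supported node instead of the full fused attention pattern.
// Assumes scores_desc is non-virtual with no reordering (hypothetical change
// to the listing above).
cudnnBackendDescriptor_t make_single_matmul_graph(cudnnHandle_t handle,
                                                  cudnnBackendDescriptor_t a_desc,
                                                  cudnnBackendDescriptor_t b_desc,
                                                  cudnnBackendDescriptor_t c_desc,
                                                  cudnnDataType_t comp_type) {
  // Matmul math descriptor (fp32 accumulation).
  cudnnBackendDescriptor_t matmul;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR, &matmul));
  CUDNN_CHECK(cudnnBackendSetAttribute(matmul, CUDNN_ATTR_MATMUL_COMP_TYPE,
                                       CUDNN_TYPE_DATA_TYPE, 1, &comp_type));
  CUDNN_CHECK(cudnnBackendFinalize(matmul));

  // Single matmul operation node: C = A x B.
  cudnnBackendDescriptor_t op;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &op));
  CUDNN_CHECK(cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_MATMUL_DESC,
                                       CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &matmul));
  CUDNN_CHECK(cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_MATMUL_ADESC,
                                       CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &a_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_MATMUL_BDESC,
                                       CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &b_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_MATMUL_CDESC,
                                       CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &c_desc));
  CUDNN_CHECK(cudnnBackendFinalize(op));

  // Operation graph holding just this one node.
  cudnnBackendDescriptor_t graph;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &graph));
  CUDNN_CHECK(cudnnBackendSetAttribute(graph, CUDNN_ATTR_OPERATIONGRAPH_OPS,
                                       CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op));
  CUDNN_CHECK(cudnnBackendSetAttribute(graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
                                       CUDNN_TYPE_HANDLE, 1, &handle));
  CUDNN_CHECK(cudnnBackendFinalize(graph));
  return graph;
}

// Usage: build one graph per matmul, then run heuristics and finalize an
// execution plan for each graph separately, exactly as in the listing above.
// cudnnBackendDescriptor_t g1 = make_single_matmul_graph(handle, query_desc, key_desc, scores_desc, comp_type);
// cudnnBackendDescriptor_t g2 = make_single_matmul_graph(handle, scores_desc, value_desc, output_desc, comp_type);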