Search code examples
c, ethernet, infiniband, mellanox

How can I receive Ethernet frames with ibverbs?


I want to write a simple test program to receive Ethernet frames using the ibverbs API.

The code below compiles and runs but never receives any packets. I'm using Mellanox ConnectX-3 hardware on Ubuntu 18.

Questions:

  1. If I ping the InfiniBand interface from another machine while this RX program is running, the ping still receives responses. I would not expect that, because the ping requests should be grabbed by the RX program, so the Linux IP stack should never see them and therefore should not respond. What should happen?

  2. Is there anything obvious wrong with my code?

  3. Do I need a steering rule? If I remove the call to ibv_create_flow(), should I just receive all packets the interface sees?

#include <infiniband/verbs.h>

#include <errno.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


// Port of the device to receive on; used for both the QP and the steering rule.
#define PORT_NUM 1

#define MAX_MSG_SIZE 1500 // The maximum size of each received packet.
#define RQ_NUM_DESC 512 // Max packets that can be received without processing.

// The MAC of the interface we are listening on.
#define DEST_MAC { 0x00, 0x0d, 0x3a, 0x47, 0x1c, 0x2e }

// Print a printf-style message (prefixed with "ERROR: ") to stderr and
// terminate the process. Wrapped in do { } while (0) so that the expansion is
// a single statement and can be used safely in an unbraced if/else.
#define FATAL_ERROR(msg, ...) \
    do { \
        fprintf(stderr, "ERROR: " msg "\n", ##__VA_ARGS__); \
        exit(-1); \
    } while (0)


int main() {
    // Get the list of devices.
    int num_devices = 0;
    struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
    if (!dev_list)
        FATAL_ERROR("Failed to get IB devices list.");

    // Choose the first device.
    struct ibv_device *ib_dev = dev_list[0];
    if (!ib_dev)
        FATAL_ERROR("IB device not found.");
    printf("Found %i Infiniband device(s).\n", num_devices);
    printf("Using device '%s'.\n", ibv_get_device_name(ib_dev));

    // Get the device context.
    struct ibv_context *context = ibv_open_device(ib_dev);
    if (!context)
        FATAL_ERROR("Couldn't get context for device.");

    // Allocate a protection domain (PD) that will group memory
    // regions (MR) and rings.
    struct ibv_pd *pd = ibv_alloc_pd(context);
    if (!pd)
        FATAL_ERROR("Couldn't allocate protection domain.");

    // Create Complition Queue (CQ).
    struct ibv_cq *cq = ibv_create_cq(context, RQ_NUM_DESC, NULL, NULL, 0);
    if (!cq)
        FATAL_ERROR("Couldn't create completion queue. errno = %d.", errno);

    // Create Queue Pair (QP).
    struct ibv_qp_init_attr qp_init_attr = {
        .qp_context = NULL,
        .send_cq = cq, // Report receive completion to CQ.
        .recv_cq = cq,

        .cap = {
            .max_send_wr = 0, // No send ring.
            .max_recv_wr = RQ_NUM_DESC, // Max num packets in ring.
            .max_recv_sge = 1, // Only one pointer per descriptor.
         },
        .qp_type = IBV_QPT_RAW_PACKET, // Use Ethernet packets.
    };
    struct ibv_qp *qp = ibv_create_qp(pd, &qp_init_attr);
    if (!qp)
        FATAL_ERROR("Couldn't create queue pair.");

    // Initialize the QP (receive ring) and assign a port.
    struct ibv_qp_attr qp_attr = { 0 };
    qp_attr.qp_state = IBV_QPS_INIT;
    qp_attr.port_num = PORT_NUM;
    int qp_flags = IBV_QP_STATE | IBV_QP_PORT;
    if (ibv_modify_qp(qp, &qp_attr, qp_flags) < 0)
        FATAL_ERROR("Failed to initialize queue pair.");

    // Move ring state to ready-to-receive. This is needed in
    // order to be able to receive packets.
    memset(&qp_attr, 0, sizeof(qp_attr));
    qp_flags = IBV_QP_STATE;
    qp_attr.qp_state = IBV_QPS_RTR;
    if (ibv_modify_qp(qp, &qp_attr, qp_flags) < 0)
        FATAL_ERROR("Failed to put queue pair into ready-to-receive state.");

    // Allocate memory for packet buffer.
    int buf_size = MAX_MSG_SIZE * RQ_NUM_DESC; // Maximum size of data to be accessed by hardware.
    void *buf = malloc(buf_size);
    if (!buf)
        FATAL_ERROR("Couldn't allocate memory.");

    // Register the user memory so it can be accessed by the HW directly.
    struct ibv_mr *mr = ibv_reg_mr(pd, buf, buf_size, IBV_ACCESS_LOCAL_WRITE);
    if (!mr)
        FATAL_ERROR("Couldn't register memory region.");

    // Create a scatter/gather entry.
    struct ibv_sge sg_entry;
    sg_entry.length = MAX_MSG_SIZE;
    sg_entry.lkey = mr->lkey;

    // Create a receive work request.
    struct ibv_recv_wr wr;
    wr.num_sge = 1;
    wr.sg_list = &sg_entry;
    wr.next = NULL;

    // Post a load of receive work requests onto the receive queue.
    struct ibv_recv_wr *bad_wr;
    for (int n = 0; n < RQ_NUM_DESC; n++) {
        // Each descriptor points to max MTU size buffer.
        sg_entry.addr = (uint64_t)buf + MAX_MSG_SIZE * n;

        // When a packet is received, a work completion will be created
        // corresponding to this work request. It will contain this field.
        wr.wr_id = n;

        // Post the receive buffer to the ring.
        int rv = ibv_post_recv(qp, &wr, &bad_wr);
        if (rv != 0) {
            FATAL_ERROR("Posting recv failed with error code %i.", rv);
        }
    }

    // Create steering rule.
    struct raw_eth_flow_attr {
        struct ibv_flow_attr attr;
        struct ibv_flow_spec_eth spec_eth;
    } __attribute__((packed)) flow_attr = {
        .attr = {
            .comp_mask = 0,
            .type = IBV_FLOW_ATTR_NORMAL,
            .size = sizeof(flow_attr),
            .priority = 0,
            .num_of_specs = 1,
            .port = PORT_NUM,
            .flags = 0,
        },
        .spec_eth = {
            .type = IBV_FLOW_SPEC_ETH,
            .size = sizeof(struct ibv_flow_spec_eth),
            .val = {
                .dst_mac = DEST_MAC,
                .src_mac = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
                .ether_type = 0,
                .vlan_tag = 0,
            },
            .mask = {
                .dst_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                .src_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                .ether_type = 0,
                .vlan_tag = 0,
            }
        }
    };

    // Register steering rule to intercept packet to DEST_MAC and place packet in
    // ring pointed by qp.
    struct ibv_flow *eth_flow = ibv_create_flow(qp, &flow_attr.attr);
    if (!eth_flow)
        FATAL_ERROR("Couldn't attach steering flow. Does DEST_MAC match that of the local NIC?");

    printf("Receiving.\n");
    while (1) {
        // Wait for CQ event upon message received, and print a message
        struct ibv_wc wc;
        int msgs_completed = ibv_poll_cq(cq, 1, &wc);
        if (msgs_completed > 0) {
            printf("Message %ld received size %d\n", wc.wr_id, wc.byte_len);
            sg_entry.addr = (uint64_t)buf + wc.wr_id * MAX_MSG_SIZE;
            wr.wr_id = wc.wr_id;

            // After processed need to post back the buffer.
            int rv = ibv_post_recv(qp, &wr, &bad_wr);
            if (rv != 0) {
                FATAL_ERROR("Re-posting recv failed with error code %i.", rv);
            }
        }
        else if (msgs_completed < 0) {
            FATAL_ERROR("Polling error.");
        }
    }
}

Solution

  • Take a look at this example from Nvidia: https://enterprise-support.nvidia.com/s/article/raw-ethernet-programming--basic-introduction---code-example

    To receive everything the interface sees, you can use the experimental API (#include <infiniband/verbs_exp.h>): when creating the steering rule, use ibv_exp_flow_attr and set its type to IBV_EXP_FLOW_ATTR_SNIFFER.