C++ RDMA原子操作中的错误

C++ RDMA原子操作中的错误,c++,infiniband,rdma,C++,Infiniband,Rdma,我在执行RDMA原子操作(FETCH_ADD和CMP_AND_SWAP)时遇到问题。当我尝试提交原子RDMA请求时,ibv_post_send()函数失败,errno被设置为“Invalid argument”。而RDMA读/写操作则没有这个问题。 我按如下方式注册内存地址: local_buffer = new uint64_t[1]; // so the memory region is byte-aligned local_mr = ibv_reg_mr(pd, local_buffer, size

我在执行RDMA原子操作(FETCH_ADD和CMP_AND_SWAP)时遇到问题。当我尝试提交原子RDMA请求时,ibv_post_send()函数失败,errno被设置为“Invalid argument”。而RDMA读/写操作则没有这个问题。

我按如下方式注册内存地址:

// Fragment from the question: it combines three steps -- registering the
// local buffer, creating the queue pair, and posting an atomic
// fetch-and-add work request. The names pd, s_ctx, id, qp_attr, conn,
// TEST_NZ and die are declared outside this fragment.

// Step 1: allocate one uint64_t and register it as a memory region with
// local-write, remote-read and remote-atomic access.
local_buffer = new uint64_t[1];   // a single uint64_t, so the buffer is aligned for a 64-bit value
// NOTE(review): the call below ends in "))" but only one "(" is opened for
// ibv_reg_mr -- presumably a wrapping macro was trimmed from the snippet;
// as written the parentheses do not balance.
local_mr = ibv_reg_mr(pd, local_buffer, sizeof(uint64_t),
    IBV_ACCESS_LOCAL_WRITE
    | IBV_ACCESS_REMOTE_READ
    | IBV_ACCESS_REMOTE_ATOMIC));
// Step 2: build the queue-pair init attributes from a zeroed struct.
memset(qp_attr, 0, sizeof(*qp_attr));
// The same completion queue is used for both send and receive completions.
qp_attr->send_cq = s_ctx->cq;
qp_attr->recv_cq = s_ctx->cq;
qp_attr->qp_type = IBV_QPT_RC;
// Capacity: up to 10 outstanding work requests per queue, 1 SGE per request.
qp_attr->cap.max_send_wr = 10;
qp_attr->cap.max_recv_wr = 10;
qp_attr->cap.max_send_sge = 1;
qp_attr->cap.max_recv_sge = 1;
// TEST_NZ is not shown here; presumably it aborts when the call returns nonzero.
TEST_NZ(rdma_create_qp(id, s_ctx->pd, qp_attr));
// Step 3: describe the local buffer with one scatter/gather entry and post
// the atomic work request.
struct ibv_send_wr wr, *bad_wr = NULL;
struct ibv_sge sge;
memset(&sge, 0, sizeof(sge));
sge.addr        = (uintptr_t)conn->local_buffer;
sge.length      = 8;   // 8 bytes, the size of the uint64_t registered above
sge.lkey        = conn->local_mr->lkey;
memset(&wr, 0, sizeof(wr));
wr.wr_id                    = 0;
wr.opcode                   = IBV_WR_ATOMIC_FETCH_AND_ADD;
wr.sg_list                  = &sge;
wr.num_sge                  = 1;
wr.send_flags               = IBV_SEND_SIGNALED;
// Remote target address and key come from the peer's memory-region info.
wr.wr.atomic.remote_addr    = (uintptr_t)conn->peer_mr.addr;
wr.wr.atomic.rkey           = conn->peer_mr.rkey;
wr.wr.atomic.compare_add    = 1ULL; /* value to be added to the remote address content */
// A nonzero return from ibv_post_send is treated as failure: an error is
// printed and die("") is called. This is the call the question reports
// as failing with "Invalid argument".
if (ibv_post_send(conn->qp, &wr, &bad_wr)) {
    fprintf(stderr, "Error, ibv_post_send() failed\n");
    die("");
}   
我按如下方式构建队列对:

// Fragment from the question: it combines three steps -- registering the
// local buffer, creating the queue pair, and posting an atomic
// fetch-and-add work request. The names pd, s_ctx, id, qp_attr, conn,
// TEST_NZ and die are declared outside this fragment.

// Step 1: allocate one uint64_t and register it as a memory region with
// local-write, remote-read and remote-atomic access.
local_buffer = new uint64_t[1];   // a single uint64_t, so the buffer is aligned for a 64-bit value
// NOTE(review): the call below ends in "))" but only one "(" is opened for
// ibv_reg_mr -- presumably a wrapping macro was trimmed from the snippet;
// as written the parentheses do not balance.
local_mr = ibv_reg_mr(pd, local_buffer, sizeof(uint64_t),
    IBV_ACCESS_LOCAL_WRITE
    | IBV_ACCESS_REMOTE_READ
    | IBV_ACCESS_REMOTE_ATOMIC));
// Step 2: build the queue-pair init attributes from a zeroed struct.
memset(qp_attr, 0, sizeof(*qp_attr));
// The same completion queue is used for both send and receive completions.
qp_attr->send_cq = s_ctx->cq;
qp_attr->recv_cq = s_ctx->cq;
qp_attr->qp_type = IBV_QPT_RC;
// Capacity: up to 10 outstanding work requests per queue, 1 SGE per request.
qp_attr->cap.max_send_wr = 10;
qp_attr->cap.max_recv_wr = 10;
qp_attr->cap.max_send_sge = 1;
qp_attr->cap.max_recv_sge = 1;
// TEST_NZ is not shown here; presumably it aborts when the call returns nonzero.
TEST_NZ(rdma_create_qp(id, s_ctx->pd, qp_attr));
// Step 3: describe the local buffer with one scatter/gather entry and post
// the atomic work request.
struct ibv_send_wr wr, *bad_wr = NULL;
struct ibv_sge sge;
memset(&sge, 0, sizeof(sge));
sge.addr        = (uintptr_t)conn->local_buffer;
sge.length      = 8;   // 8 bytes, the size of the uint64_t registered above
sge.lkey        = conn->local_mr->lkey;
memset(&wr, 0, sizeof(wr));
wr.wr_id                    = 0;
wr.opcode                   = IBV_WR_ATOMIC_FETCH_AND_ADD;
wr.sg_list                  = &sge;
wr.num_sge                  = 1;
wr.send_flags               = IBV_SEND_SIGNALED;
// Remote target address and key come from the peer's memory-region info.
wr.wr.atomic.remote_addr    = (uintptr_t)conn->peer_mr.addr;
wr.wr.atomic.rkey           = conn->peer_mr.rkey;
wr.wr.atomic.compare_add    = 1ULL; /* value to be added to the remote address content */
// A nonzero return from ibv_post_send is treated as failure: an error is
// printed and die("") is called. This is the call the question reports
// as failing with "Invalid argument".
if (ibv_post_send(conn->qp, &wr, &bad_wr)) {
    fprintf(stderr, "Error, ibv_post_send() failed\n");
    die("");
}   
最后提交带有原子操作码的RDMA操作,如下所示:

// Fragment from the question: it combines three steps -- registering the
// local buffer, creating the queue pair, and posting an atomic
// fetch-and-add work request. The names pd, s_ctx, id, qp_attr, conn,
// TEST_NZ and die are declared outside this fragment.

// Step 1: allocate one uint64_t and register it as a memory region with
// local-write, remote-read and remote-atomic access.
local_buffer = new uint64_t[1];   // a single uint64_t, so the buffer is aligned for a 64-bit value
// NOTE(review): the call below ends in "))" but only one "(" is opened for
// ibv_reg_mr -- presumably a wrapping macro was trimmed from the snippet;
// as written the parentheses do not balance.
local_mr = ibv_reg_mr(pd, local_buffer, sizeof(uint64_t),
    IBV_ACCESS_LOCAL_WRITE
    | IBV_ACCESS_REMOTE_READ
    | IBV_ACCESS_REMOTE_ATOMIC));
// Step 2: build the queue-pair init attributes from a zeroed struct.
memset(qp_attr, 0, sizeof(*qp_attr));
// The same completion queue is used for both send and receive completions.
qp_attr->send_cq = s_ctx->cq;
qp_attr->recv_cq = s_ctx->cq;
qp_attr->qp_type = IBV_QPT_RC;
// Capacity: up to 10 outstanding work requests per queue, 1 SGE per request.
qp_attr->cap.max_send_wr = 10;
qp_attr->cap.max_recv_wr = 10;
qp_attr->cap.max_send_sge = 1;
qp_attr->cap.max_recv_sge = 1;
// TEST_NZ is not shown here; presumably it aborts when the call returns nonzero.
TEST_NZ(rdma_create_qp(id, s_ctx->pd, qp_attr));
// Step 3: describe the local buffer with one scatter/gather entry and post
// the atomic work request.
struct ibv_send_wr wr, *bad_wr = NULL;
struct ibv_sge sge;
memset(&sge, 0, sizeof(sge));
sge.addr        = (uintptr_t)conn->local_buffer;
sge.length      = 8;   // 8 bytes, the size of the uint64_t registered above
sge.lkey        = conn->local_mr->lkey;
memset(&wr, 0, sizeof(wr));
wr.wr_id                    = 0;
wr.opcode                   = IBV_WR_ATOMIC_FETCH_AND_ADD;
wr.sg_list                  = &sge;
wr.num_sge                  = 1;
wr.send_flags               = IBV_SEND_SIGNALED;
// Remote target address and key come from the peer's memory-region info.
wr.wr.atomic.remote_addr    = (uintptr_t)conn->peer_mr.addr;
wr.wr.atomic.rkey           = conn->peer_mr.rkey;
wr.wr.atomic.compare_add    = 1ULL; /* value to be added to the remote address content */
// A nonzero return from ibv_post_send is treated as failure: an error is
// printed and die("") is called. This is the call the question reports
// as failing with "Invalid argument".
if (ibv_post_send(conn->qp, &wr, &bad_wr)) {
    fprintf(stderr, "Error, ibv_post_send() failed\n");
    die("");
}   

另外,由于我使用的是librdmacm,队列对在INIT、RTR和RTS状态之间的转换是自动完成的,因此我无法使用ibv_modify_qp()手动设置qp_attr->qp_access_flags和qp_attr->max_rd_atomic。不过,我也用libibcm写了一小段使用原子操作的代码,并在手动转换队列对状态时设置了这些属性,但仍然没有成功。

您使用的是哪种类型的IB适配器?您的驱动程序库是libmlx4、libmlx5、…?@Roland运行ibv_devinfo显示供应商的part_id是4113。这是否意味着我的IB适配器是Connect-IB?安装的驱动程序包是MLNX_OFED_LINUX-2.3-1.0.0-ubuntu14.04-x86_64。我不确定这是否是您要找的信息。这并不是我要问的,但已经足够让我知道您正在使用libmlx5。您安装的libmlx5是什么版本?查看libmlx5源代码,我发现mlx5_post_send()返回EINVAL的唯一途径是使用了无效的操作码。那么您确定您的MLNX_OFED版本支持Connect-IB上的原子操作吗?(您可以向您的Mellanox支持人员咨询)@Roland我会向Mellanox咨询,但我认为原子操作是受支持的,因为我能够毫无错误地运行用于原子操作的Perftest基准测试工具。我可以请你帮个忙吗?您是否碰巧拥有或知道使用原子操作的小型自包含代码?这可能会使发现bug变得更容易。@Roland它的代码库相当大。我查看了一下,但是迷路了。这就是为什么我要找更小的