📄 verbs.c
字号:
case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: /* Add room for mw_binds+unbinds - overkill! */ ep->rep_attr.cap.max_send_wr++; ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) return -EINVAL; break; default: break; } ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; ep->rep_attr.qp_type = IB_QPT_RC; ep->rep_attr.port_num = ~0; dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, ep->rep_attr.cap.max_send_wr, ep->rep_attr.cap.max_recv_wr, ep->rep_attr.cap.max_send_sge, ep->rep_attr.cap.max_recv_sge); /* set trigger for requesting send completion */ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; switch (ia->ri_memreg_strategy) { case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: ep->rep_cqinit -= RPCRDMA_MAX_SEGS; break; default: break; } if (ep->rep_cqinit <= 2) ep->rep_cqinit = 0; INIT_CQCOUNT(ep); ep->rep_ia = ia; init_waitqueue_head(&ep->rep_connect_wait); /* * Create a single cq for receive dto and mw_bind (only ever * care about unbind, really). Send completions are suppressed. * Use single threaded tasklet upcalls to maintain ordering. */ ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, rpcrdma_cq_async_error_upcall, NULL, ep->rep_attr.cap.max_recv_wr + ep->rep_attr.cap.max_send_wr + 1, 0); if (IS_ERR(ep->rep_cq)) { rc = PTR_ERR(ep->rep_cq); dprintk("RPC: %s: ib_create_cq failed: %i\n", __func__, rc); goto out1; } rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); if (rc) { dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); goto out2; } ep->rep_attr.send_cq = ep->rep_cq; ep->rep_attr.recv_cq = ep->rep_cq; /* Initialize cma parameters */ /* RPC/RDMA does not use private data */ ep->rep_remote_cma.private_data = NULL; ep->rep_remote_cma.private_data_len = 0; /* Client offers RDMA Read but does not initiate */ switch (ia->ri_memreg_strategy) { case RPCRDMA_BOUNCEBUFFERS: ep->rep_remote_cma.responder_resources = 0; break; case RPCRDMA_MTHCAFMR: case RPCRDMA_REGISTER: ep->rep_remote_cma.responder_resources = cdata->max_requests * (RPCRDMA_MAX_DATA_SEGS / 8); break; case RPCRDMA_MEMWINDOWS: case RPCRDMA_MEMWINDOWS_ASYNC:#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL:#endif ep->rep_remote_cma.responder_resources = cdata->max_requests * (RPCRDMA_MAX_DATA_SEGS / 2); break; default: break; } if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom) ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; ep->rep_remote_cma.rnr_retry_count = 0; return 0;out2: if (ib_destroy_cq(ep->rep_cq)) ;out1: return rc;}/* * rpcrdma_ep_destroy * * Disconnect and destroy endpoint. After this, the only * valid operations on the ep are to free it (if dynamically * allocated) or re-create it. * * The caller's error handling must be sure to not leak the endpoint * if this function fails. */intrpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia){ int rc; dprintk("RPC: %s: entering, connected is %d\n", __func__, ep->rep_connected); if (ia->ri_id->qp) { rc = rpcrdma_ep_disconnect(ep, ia); if (rc) dprintk("RPC: %s: rpcrdma_ep_disconnect" " returned %i\n", __func__, rc); } ep->rep_func = NULL; /* padding - could be done in rpcrdma_buffer_destroy... */ if (ep->rep_pad_mr) { rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); ep->rep_pad_mr = NULL; } if (ia->ri_id->qp) { rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } rpcrdma_clean_cq(ep->rep_cq); rc = ib_destroy_cq(ep->rep_cq); if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); return rc;}/* * Connect unconnected endpoint. */intrpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia){ struct rdma_cm_id *id; int rc = 0; int retry_count = 0; int reconnect = (ep->rep_connected != 0); if (reconnect) { struct rpcrdma_xprt *xprt;retry: rc = rpcrdma_ep_disconnect(ep, ia); if (rc && rc != -ENOTCONN) dprintk("RPC: %s: rpcrdma_ep_disconnect" " status %i\n", __func__, rc); rpcrdma_clean_cq(ep->rep_cq); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { rc = PTR_ERR(id); goto out; } /* TEMP TEMP TEMP - fail if new device: * Deregister/remarshal *all* requests! * Close and recreate adapter, pd, etc! * Re-determine all attributes still sane! * More stuff I haven't thought of! * Rrrgh! */ if (ia->ri_id->device != id->device) { printk("RPC: %s: can't reconnect on " "different device!\n", __func__); rdma_destroy_id(id); rc = -ENETDOWN; goto out; } /* END TEMP */ rdma_destroy_id(ia->ri_id); ia->ri_id = id; } rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); if (rc) { dprintk("RPC: %s: rdma_create_qp failed %i\n", __func__, rc); goto out; }/* XXX Tavor device performs badly with 2K MTU! */if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && (pcid->vendor == PCI_VENDOR_ID_MELLANOX || pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { struct ib_qp_attr attr = { .path_mtu = IB_MTU_1024 }; rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); }} /* Theoretically a client initiator_depth > 0 is not needed, * but many peers fail to complete the connection unless they * == responder_resources! */ if (ep->rep_remote_cma.initiator_depth != ep->rep_remote_cma.responder_resources) ep->rep_remote_cma.initiator_depth = ep->rep_remote_cma.responder_resources; ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); if (rc) { dprintk("RPC: %s: rdma_connect() failed with %i\n", __func__, rc); goto out; } if (reconnect) return 0; wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); /* * Check state. A non-peer reject indicates no listener * (ECONNREFUSED), which may be a transient state. All * others indicate a transport condition which has already * undergone a best-effort. */ if (ep->rep_connected == -ECONNREFUSED && ++retry_count <= RDMA_CONNECT_RETRY_MAX) { dprintk("RPC: %s: non-peer_reject, retry\n", __func__); goto retry; } if (ep->rep_connected <= 0) { /* Sometimes, the only way to reliably connect to remote * CMs is to use same nonzero values for ORD and IRD. */ ep->rep_remote_cma.initiator_depth = ep->rep_remote_cma.responder_resources; if (ep->rep_remote_cma.initiator_depth == 0) ++ep->rep_remote_cma.initiator_depth; if (ep->rep_remote_cma.responder_resources == 0) ++ep->rep_remote_cma.responder_resources; if (retry_count++ == 0) goto retry; rc = ep->rep_connected; } else { dprintk("RPC: %s: connected\n", __func__); }out: if (rc) ep->rep_connected = rc; return rc;}/* * rpcrdma_ep_disconnect * * This is separate from destroy to facilitate the ability * to reconnect without recreating the endpoint. * * This call is not reentrant, and must not be made in parallel * on the same endpoint. */intrpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia){ int rc; rpcrdma_clean_cq(ep->rep_cq); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 1); dprintk("RPC: %s: after wait, %sconnected\n", __func__, (ep->rep_connected == 1) ? "still " : "dis"); } else { dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); ep->rep_connected = rc; } return rc;}/* * Initialize buffer memory */intrpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata){ char *p; size_t len; int i, rc; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); atomic_set(&buf->rb_credits, 1); /* Need to allocate: * 1. arrays for send and recv pointers * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any * 5. mw's, if any * Send/recv buffers in req/rep need to be registered */ len = buf->rb_max_requests * (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; switch (ia->ri_memreg_strategy) { case RPCRDMA_MTHCAFMR: /* TBD we are perhaps overallocating here */ len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * sizeof(struct rpcrdma_mw); break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * sizeof(struct rpcrdma_mw); break; default: break; } /* allocate 1, 4 and 5 in one shot */ p = kzalloc(len, GFP_KERNEL); if (p == NULL) { dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", __func__, len); rc = -ENOMEM; goto out; } buf->rb_pool = p; /* for freeing it later */ buf->rb_send_bufs = (struct rpcrdma_req **) p; p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; buf->rb_recv_bufs = (struct rpcrdma_rep **) p; p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; /* * Register the zeroed pad buffer, if any. */ if (cdata->padding) { rc = rpcrdma_register_internal(ia, p, cdata->padding, &ep->rep_pad_mr, &ep->rep_pad); if (rc) goto out; } p += cdata->padding; /* * Allocate the fmr's, or mw's for mw_bind chunk registration. * We "cycle" the mw's in order to minimize rkey reuse, * and also reduce unbind-to-bind collision. */ INIT_LIST_HEAD(&buf->rb_mws); switch (ia->ri_memreg_strategy) { case RPCRDMA_MTHCAFMR: { struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; struct ib_fmr_attr fa = { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; /* TBD we are perhaps overallocating here */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { r->r.fmr = ib_alloc_fmr(ia->ri_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, &fa); if (IS_ERR(r->r.fmr)) { rc = PTR_ERR(r->r.fmr); dprintk("RPC: %s: ib_alloc_fmr" " failed %i\n", __func__, rc); goto out; } list_add(&r->mw_list, &buf->rb_mws); ++r; } } break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: { struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; /* Allocate one extra request's worth, for full cycling */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { r->r.mw = ib_alloc_mw(ia->ri_pd); if (IS_ERR(r->r.mw)) { rc = PTR_ERR(r->r.mw); dprintk("RPC: %s: ib_alloc_mw" " failed %i\n", __func__, rc); goto out; } list_add(&r->mw_list, &buf->rb_mws); ++r; } } break; default: break; } /* * Allocate/init the request/reply buffers. Doing this * using kmalloc for now -- one for each buf. */ for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; struct rpcrdma_rep *rep; len = cdata->inline_wsize + sizeof(struct rpcrdma_req); /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */ /* Typical ~2400b, so rounding up saves work later */ if (len < 4096) len = 4096; req = kmalloc(len, GFP_KERNEL); if (req == NULL) { dprintk("RPC: %s: request buffer %d alloc" " failed\n", __func__, i); rc = -ENOMEM; goto out; } memset(req, 0, sizeof(struct rpcrdma_req)); buf->rb_send_bufs[i] = req; buf->rb_send_bufs[i]->rl_buffer = buf; rc = rpcrdma_register_internal(ia, req->rl_base, len - offsetof(struct rpcrdma_req, rl_base), &buf->rb_send_bufs[i]->rl_handle, &buf->rb_send_bufs[i]->rl_iov); if (rc) goto out; buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); rep = kmalloc(len, GFP_KERNEL); if (rep == NULL) { dprintk("RPC: %s: reply buffer %d alloc failed\n", __func__, i); rc = -ENOMEM; goto out; } memset(rep, 0, sizeof(struct rpcrdma_rep)); buf->rb_recv_bufs[i] = rep; buf->rb_recv_bufs[i]->rr_buffer = buf; init_waitqueue_head(&rep->rr_unbind); rc = rpcrdma_register_internal(ia, rep->rr_base, len - offsetof(struct rpcrdma_rep, rr_base), &buf->rb_recv_bufs[i]->rr_handle, &buf->rb_recv_bufs[i]->rr_iov); if (rc) goto out; } dprintk("RPC: %s: max_requests %d\n", __func__, buf->rb_max_requests); /* done */ return 0;out: rpcrdma_buffer_destroy(buf); return rc;}/* * Unregister and destroy buffer memory. Need to deal with * partial initialization, so it's callable from failed create. * Must be called before destroying endpoint, as registrations * reference it. */voidrpcrdma_buffer_destroy(struct rpcrdma_buffer *buf){ int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) * 1a. bind mw memory * 2. send mr memory (mr free, then kfree) * 3. padding (if any) [moved to rpcrdma_ep_destroy] * 4. arrays */ dprintk("RPC: %s: entering\n", __func__); for (i = 0; i < buf->rb_max_requests; i++) { if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { rpcrdma_deregister_internal(ia, buf->rb_recv_bufs[i]->rr_handle, &buf->rb_recv_bufs[i]->rr_iov); kfree(buf->rb_recv_bufs[i]); } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { while (!list_empty(&buf->rb_mws)) { struct rpcrdma_mw *r; r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); switch (ia->ri_memreg_strategy) { case RPCRDMA_MTHCAFMR: rc = ib_dealloc_fmr(r->r.fmr); if (rc) dprintk("RPC: %s:" " ib_dealloc_fmr" " failed %i\n", __func__, rc); break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: rc = ib_dealloc_mw(r->r.mw); if (rc) dprintk("RPC: %s:" " ib_dealloc_mw" " failed %i\n", __func__, rc);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -