/* verbs.c */
/* * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the BSD-type * license below: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * Neither the name of the Network Appliance, Inc. nor the names of * its contributors may be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/pci.h>	/* for Tavor hack below */

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

/* Protects rpcrdma_tasklets_g; taken from both upcall and tasklet context. */
static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

/*
 * Tasklet body: drain the global reply list. Each reply's rr_func is
 * invoked with the lock dropped; a reply with no rr_func is returned
 * to the receive buffer pool instead.
 */
static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	void (*func)(struct rpcrdma_rep *);
	unsigned long flags;

	data = data;	/* self-assignment silences unused-parameter warning */
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		func = rep->rr_func;
		rep->rr_func = NULL;
		/* drop the lock while the callback runs */
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

/*
 * Queue a reply on the global list and schedule the tasklet to
 * process it. Safe to call from completion (interrupt) context.
 */
static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}

/*
 * QP asynchronous-error upcall: if the endpoint was connected, mark it
 * failed (-EIO), notify via rep_func and wake any connect waiters.
 */
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

/*
 * CQ asynchronous-error upcall: same recovery action as the QP error
 * upcall above — fail the connection and wake waiters.
 */
static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

/*
 * Process a single work completion. The wr_id carries the rpcrdma_rep
 * pointer (NULL for completions nobody is interested in). Failed
 * completions mark the reply length invalid (~0U) before scheduling it;
 * successful receives update the server credit count from the inline
 * RPC/RDMA header when at least 16 bytes arrived.
 */
static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: %s WC status %X, connection lost\n",
			__func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
			wc->status);
		rep->rr_len = ~0U;	/* flag the reply as invalid */
		rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after check validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);

			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}

/*
 * Poll a completion queue to exhaustion, dispatching each completion
 * through rpcrdma_event_process(). Returns 0, or a negative ib_poll_cq
 * error.
 */
static inline int
rpcrdma_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wc;
	int rc;

	for (;;) {
		rc = ib_poll_cq(cq, 1, &wc);
		if (rc < 0) {
			dprintk("RPC: %s: ib_poll_cq failed %i\n",
				__func__, rc);
			return rc;
		}
		if (rc == 0)
			break;
		rpcrdma_event_process(&wc);
	}

	return 0;
}

/*
 * rpcrdma_cq_event_upcall
 *
 * This upcall handles recv, send, bind and unbind events.
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	/* re-arm the CQ, then poll again to close the race with events
	 * that arrived between the first poll and the re-arm */
	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_cq_poll(cq);
}

#ifdef RPC_DEBUG
/* Human-readable names indexed by RDMA_CM_EVENT_* value (0..11). */
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

/*
 * RDMA CM event handler. Address/route resolution results are reported
 * through ia->ri_async_rc and ia->ri_done (see rpcrdma_create_id below).
 * Connection state changes funnel through the "connected" label, which
 * records the new state in ep->rep_connected (1 on success, negative
 * errno on failure), resets the credit count to 1, notifies via
 * ep->rep_func and wakes connect waiters.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
			" (ep 0x%p event 0x%x)\n",
			__func__,
			/* conn[] is only indexed when the event fits it */
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			NIPQUAD(addr->sin_addr.s_addr),
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		ia->ri_async_rc = -EINVAL;
		dprintk("RPC: %s: unexpected CM event %X\n",
			__func__, event->event);
		complete(&ia->ri_done);
		break;
	}

	return 0;
}

/*
 * Create a CM id for the transport and synchronously resolve the remote
 * address and route. Each resolve step posts asynchronously and waits on
 * ia->ri_done; the upcall above stores the outcome in ia->ri_async_rc.
 * Returns the new id, or an ERR_PTR (the id is destroyed on failure).
 */
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = 0;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion(&ia->ri_done);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = 0;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion(&ia->ri_done);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	/* discard completions; only the count and the last opcode are kept
	 * for the debug message below */
	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 *
 * Creates and resolves the CM id, allocates the protection domain, and —
 * for strategies above RPCRDMA_REGISTER — attempts an all-memory DMA MR
 * (falling back to RPCRDMA_REGISTER if that fails). The strategy finally
 * in effect is recorded in ia->ri_memreg_strategy.
 * Returns 0 or a negative errno.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	init_completion(&ia->ri_done);

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	if (memreg > RPCRDMA_REGISTER) {
		int mem_priv = IB_ACCESS_LOCAL_WRITE;
		switch (memreg) {
#if RPCRDMA_PERSISTENT_REGISTRATION
		case RPCRDMA_ALLPHYSICAL:
			mem_priv |= IB_ACCESS_REMOTE_WRITE;
			mem_priv |= IB_ACCESS_REMOTE_READ;
			break;
#endif
		case RPCRDMA_MEMWINDOWS_ASYNC:
		case RPCRDMA_MEMWINDOWS:
			mem_priv |= IB_ACCESS_MW_BIND;
			break;
		default:
			break;
		}
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			/* fall back: register/deregister per chunk instead */
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
	}

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 *
 * Teardown order is the reverse of rpcrdma_ia_open: MR, then QP, then PD,
 * then the CM id. Each field is guarded so close is safe after a partial
 * open.
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
		rdma_destroy_qp(ia->ri_id);
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
		rdma_destroy_id(ia->ri_id);
}

/*
 * Create unconnected endpoint.
*/intrpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata){ struct ib_device_attr devattr; int rc; rc = ib_query_device(ia->ri_id->device, &devattr); if (rc) { dprintk("RPC: %s: ib_query_device failed %d\n", __func__, rc); return rc; } /* check provider's send/recv wr limits */ if (cdata->max_requests > devattr.max_qp_wr) cdata->max_requests = devattr.max_qp_wr; ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; /* send_cq and recv_cq initialized below */ ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -