ipoib_cm.c
/*
 * Copyright (c) 2006 Mellanox Technologies. All rights reserved
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id$
 */

#include <rdma/ib_cm.h>
#include <rdma/ib_cache.h>
#include <net/dst.h>
#include <net/icmp.h>
#include <linux/icmpv6.h>
#include <linux/delay.h>

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
MODULE_PARM_DESC(cm_data_debug_level,
		 "Enable data path debug tracing for connected mode if > 0");
#endif

#include "ipoib.h"

#define IPOIB_CM_IETF_ID 0x1000000000000000ULL

#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
#define IPOIB_CM_RX_UPDATE_MASK (0x3)

static struct ib_qp_attr ipoib_cm_err_attr = {
	.qp_state = IB_QPS_ERR
};

#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff

static struct ib_send_wr ipoib_cm_rx_drain_wr = {
	.wr_id  = IPOIB_CM_RX_DRAIN_WRID,
	.opcode = IB_WR_SEND,
};

static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event);

static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
				  u64 mapping[IPOIB_CM_RX_SG])
{
	int i;

	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);

	for (i = 0; i < frags; ++i)
		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
}

static int ipoib_cm_post_receive(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_recv_wr *bad_wr;
	int i, ret;

	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;

	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];

	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
				      priv->cm.srq_ring[id].mapping);
		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
		priv->cm.srq_ring[id].skb = NULL;
	}

	return ret;
}
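/*
 * Each SRQ ring entry posted above pairs a linear head of
 * IPOIB_CM_HEAD_SIZE bytes (mapping[0]) with up to IPOIB_CM_RX_SG - 1
 * page-sized fragments (mapping[1..]).  The wr_id encodes the ring
 * index ORed with IPOIB_OP_CM | IPOIB_OP_RECV; the completion handler
 * masks those flags off again to recover the index.
 */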
static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
					     u64 mapping[IPOIB_CM_RX_SG])
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;
	int i;

	skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
	if (unlikely(!skb))
		return NULL;

	/*
	 * IPoIB adds a 4 byte header.  So we need 12 more bytes to align the
	 * IP header to a multiple of 16.
	 */
	skb_reserve(skb, 12);

	mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
				       DMA_FROM_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
		dev_kfree_skb_any(skb);
		return NULL;
	}

	for (i = 0; i < frags; i++) {
		struct page *page = alloc_page(GFP_ATOMIC);

		if (!page)
			goto partial_error;
		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);

		mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page,
						 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
			goto partial_error;
	}

	priv->cm.srq_ring[id].skb = skb;
	return skb;

partial_error:

	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);

	for (; i > 0; --i)
		ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);

	dev_kfree_skb_any(skb);
	return NULL;
}

static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
{
	struct ib_send_wr *bad_wr;
	struct ipoib_cm_rx *p;

	/* We only reserved 1 extra slot in the CQ for drain WRs, so
	 * make sure we have at most 1 outstanding WR. */
	if (list_empty(&priv->cm.rx_flush_list) ||
	    !list_empty(&priv->cm.rx_drain_list))
		return;

	/*
	 * QPs on the flush list are in the error state.  This way, a "flush
	 * error" WC will be immediately generated for each WR we post.
	 */
	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
		ipoib_warn(priv, "failed to post drain wr\n");

	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
}

static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
{
	struct ipoib_cm_rx *p = ctx;
	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
	unsigned long flags;

	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
		return;

	spin_lock_irqsave(&priv->lock, flags);
	list_move(&p->list, &priv->cm.rx_flush_list);
	p->state = IPOIB_CM_RX_FLUSH;
	ipoib_cm_start_rx_drain(priv);
	spin_unlock_irqrestore(&priv->lock, flags);
}

static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
					   struct ipoib_cm_rx *p)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_init_attr attr = {
		.event_handler = ipoib_cm_rx_event_handler,
		.send_cq = priv->cq, /* For drain WR */
		.recv_cq = priv->cq,
		.srq = priv->cm.srq,
		.cap.max_send_wr = 1, /* For drain WR */
		.cap.max_send_sge = 1, /* FIXME: 0 seems not to work */
		.sq_sig_type = IB_SIGNAL_ALL_WR,
		.qp_type = IB_QPT_RC,
		.qp_context = p,
	};

	return ib_create_qp(priv->pd, &attr);
}
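/*
 * Bring the passive-side RC QP up through the standard INIT -> RTR -> RTS
 * sequence; ib_cm_init_qp_attr() supplies the attributes negotiated by
 * the CM, and rq_psn is overridden with the locally chosen PSN.
 */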
static int ipoib_cm_modify_rx_qp(struct net_device *dev,
				 struct ib_cm_id *cm_id, struct ib_qp *qp,
				 unsigned psn)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;

	qp_attr.qp_state = IB_QPS_INIT;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}
	qp_attr.rq_psn = psn;
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	/*
	 * Current Mellanox HCA firmware won't generate completions
	 * with error for drain WRs unless the QP has been moved to
	 * RTS first.  This work-around leaves a window where a QP has
	 * moved to error asynchronously, but this will eventually get
	 * fixed in firmware, so let's not error out if modify QP
	 * fails.
	 */
	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return 0;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return 0;
	}

	return 0;
}

static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
			     struct ib_qp *qp,
			     struct ib_cm_req_event_param *req,
			     unsigned psn)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_data data = {};
	struct ib_cm_rep_param rep = {};

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);

	rep.private_data = &data;
	rep.private_data_len = sizeof data;
	rep.flow_control = 0;
	rep.rnr_retry_count = req->rnr_retry_count;
	rep.srq = 1;
	rep.qp_num = qp->qp_num;
	rep.starting_psn = psn;
	return ib_send_cm_rep(cm_id, &rep);
}
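/*
 * Passive side of connection setup: an incoming REQ gets a dedicated
 * RC QP, brought up with a random starting PSN, and is answered with a
 * REP advertising our QPN and IPOIB_CM_BUF_SIZE as private data (see
 * ipoib_cm_send_rep() above).
 */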
static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	struct net_device *dev = cm_id->context;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_rx *p;
	unsigned psn;
	int ret;

	ipoib_dbg(priv, "REQ arrived\n");
	p = kzalloc(sizeof *p, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->dev = dev;
	p->id = cm_id;
	cm_id->context = p;
	p->state = IPOIB_CM_RX_LIVE;
	p->jiffies = jiffies;
	INIT_LIST_HEAD(&p->list);

	p->qp = ipoib_cm_create_rx_qp(dev, p);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		goto err_qp;
	}

	psn = random32() & 0xffffff;
	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
	if (ret)
		goto err_modify;

	spin_lock_irq(&priv->lock);
	queue_delayed_work(ipoib_workqueue,
			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	/* Add this entry to passive ids list head, but do not re-add it
	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
	p->jiffies = jiffies;
	if (p->state == IPOIB_CM_RX_LIVE)
		list_move(&p->list, &priv->cm.passive_ids);
	spin_unlock_irq(&priv->lock);

	ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
	if (ret) {
		ipoib_warn(priv, "failed to send REP: %d\n", ret);
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
	}
	return 0;

err_modify:
	ib_destroy_qp(p->qp);
err_qp:
	kfree(p);
	return ret;
}

static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event)
{
	struct ipoib_cm_rx *p;
	struct ipoib_dev_priv *priv;

	switch (event->event) {
	case IB_CM_REQ_RECEIVED:
		return ipoib_cm_req_handler(cm_id, event);
	case IB_CM_DREQ_RECEIVED:
		p = cm_id->context;
		ib_send_cm_drep(cm_id, NULL, 0);
		/* Fall through */
	case IB_CM_REJ_RECEIVED:
		p = cm_id->context;
		priv = netdev_priv(p->dev);
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
		/* Fall through */
	default:
		return 0;
	}
}

/* Adjust length of skb with fragments to match received data */
static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
			  unsigned int length, struct sk_buff *toskb)
{
	int i, num_frags;
	unsigned int size;

	/* put header into skb */
	size = min(length, hdr_space);
	skb->tail += size;
	skb->len += size;
	length -= size;

	num_frags = skb_shinfo(skb)->nr_frags;
	for (i = 0; i < num_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		if (length == 0) {
			/* don't need this page */
			skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE);
			--skb_shinfo(skb)->nr_frags;
		} else {
			size = min(length, (unsigned) PAGE_SIZE);

			frag->size = size;
			skb->data_len += size;
			skb->truesize += size;
			skb->len += size;
			length -= size;
		}
	}
}

void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
	struct sk_buff *skb, *newskb;
	struct ipoib_cm_rx *p;
	unsigned long flags;
	u64 mapping[IPOIB_CM_RX_SG];
	int frags;

	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
			spin_lock_irqsave(&priv->lock, flags);
			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
			ipoib_cm_start_rx_drain(priv);
			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
			spin_unlock_irqrestore(&priv->lock, flags);
		} else
			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
				   wr_id, ipoib_recvq_size);
		return;
	}

	skb = priv->cm.srq_ring[wr_id].skb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		ipoib_dbg(priv, "cm recv error "
			  "(status=%d, wrid=%d vend_err %x)\n",
			  wc->status, wr_id, wc->vendor_err);
		++dev->stats.rx_dropped;
		goto repost;
	}

	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
		p = wc->qp->qp_context;
		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
			spin_lock_irqsave(&priv->lock, flags);
			p->jiffies = jiffies;
			/* Move this entry to list head, but do not re-add it
			 * if it has been moved out of list. */
			if (p->state == IPOIB_CM_RX_LIVE)
				list_move(&p->list, &priv->cm.passive_ids);
			spin_unlock_irqrestore(&priv->lock, flags);
		}
	}

	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
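	/*
	 * Number of page fragments actually consumed: everything past the
	 * linear head, rounded up to whole pages.  With 4 KiB pages,
	 * byte_len = IPOIB_CM_HEAD_SIZE + 5000 gives PAGE_ALIGN(5000) /
	 * PAGE_SIZE = 2; frames no longer than the head give 0.
	 */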