📄 ipoib_cm.c
字号:
/*
 * NOTE(review): the statements below are the tail of a receive-completion
 * handler whose beginning lies outside the visible source; they are kept
 * as-is and intentionally not documented further.
 */
	newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
	if (unlikely(!newskb)) {
		/*
		 * If we can't allocate a new RX buffer, dump
		 * this packet and reuse the old buffer.
		 */
		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
		++dev->stats.rx_dropped;
		goto repost;
	}

	ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
	memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);

	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
	skb_reset_mac_header(skb);
	skb_pull(skb, IPOIB_ENCAP_LEN);

	dev->last_rx = jiffies;
	++dev->stats.rx_packets;
	dev->stats.rx_bytes += skb->len;

	skb->dev = dev;
	/* XXX get correct PACKET_ type here */
	skb->pkt_type = PACKET_HOST;
	netif_receive_skb(skb);

repost:
	if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
		ipoib_warn(priv, "ipoib_cm_post_receive failed "
			   "for buf %d\n", wr_id);
}

/*
 * Post one send work request on a connected-mode TX queue pair.
 *
 * Writes the buffer address and length into the per-device priv->tx_sge,
 * stores wr_id tagged with IPOIB_OP_CM into priv->tx_wr, and posts
 * priv->tx_wr on tx->qp.
 *
 * NOTE(review): priv->tx_wr presumably references priv->tx_sge as its
 * scatter/gather list; that wiring is not visible here - confirm.
 *
 * Returns whatever ib_post_send() returns.
 */
static inline int post_send(struct ipoib_dev_priv *priv,
			    struct ipoib_cm_tx *tx,
			    unsigned int wr_id,
			    u64 addr, int len)
{
	struct ib_send_wr *bad_wr;

	priv->tx_sge.addr = addr;
	priv->tx_sge.length = len;

	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;

	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
}

/*
 * Transmit one skb over the connected-mode connection described by tx.
 *
 * @dev: net device the packet is sent on
 * @skb: packet to send
 * @tx:  connected-mode TX context (QP, ring, negotiated MTU)
 *
 * An skb longer than tx->mtu is counted as dropped and as an error and is
 * handed to ipoib_cm_skb_too_long(). Otherwise the skb is recorded in the
 * TX ring slot selected by tx->tx_head, DMA-mapped and posted. tx->tx_head
 * and priv->tx_outstanding only advance when the post succeeds; on a
 * mapping or post failure the skb is freed and tx_errors is incremented.
 */
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_tx_buf *tx_req;
	u64 addr;

	if (unlikely(skb->len > tx->mtu)) {
		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
			   skb->len, tx->mtu);
		++dev->stats.tx_dropped;
		++dev->stats.tx_errors;
		ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
		return;
	}

	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
		       tx->tx_head, skb->len, tx->qp->qp_num);

	/*
	 * We put the skb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	/*
	 * The slot index is tx_head masked with (ipoib_sendq_size - 1);
	 * this presumably relies on ipoib_sendq_size being a power of two -
	 * TODO confirm where the parameter is validated.
	 */
	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
	tx_req->skb = skb;
	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
		/*
		 * NOTE(review): tx_req->skb still points at the skb freed
		 * below; tx_head is not advanced on this path, so the slot
		 * is simply overwritten by the next send.
		 */
		++dev->stats.tx_errors;
		dev_kfree_skb_any(skb);
		return;
	}

	tx_req->mapping = addr;

	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
			       addr, skb->len))) {
		/* Post failed: undo the mapping and drop the packet. */
		ipoib_warn(priv, "post_send failed\n");
		++dev->stats.tx_errors;
		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
		dev_kfree_skb_any(skb);
	} else {
		dev->trans_start = jiffies;
		++tx->tx_head;

		/* Stop the net queue once every send slot is in flight. */
		if (++priv->tx_outstanding == ipoib_sendq_size) {
			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
				  tx->qp->qp_num);
			netif_stop_queue(dev);
		}
	}
}

/*
 * Handle a send completion for a connected-mode work request.
 *
 * @dev: net device the completion belongs to
 * @wc:  work completion; wc->qp->qp_context is the ipoib_cm_tx that posted
 *       the request and wc->wr_id carries the ring index tagged with
 *       IPOIB_OP_CM
 *
 * Unmaps and frees the completed skb, updates TX statistics, advances
 * tx->tx_tail and decrements priv->tx_outstanding under priv->tx_lock, and
 * wakes the net queue when the outstanding count drops to half the ring.
 * For any status other than IB_WC_SUCCESS or IB_WC_WR_FLUSH_ERR, the
 * connection is detached from its neighbour and queued for reaping.
 */
void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_tx *tx = wc->qp->qp_context;
	/* Strip the IPOIB_OP_CM tag that post_send() OR-ed into the wr_id. */
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
	struct ipoib_tx_buf *tx_req;
	unsigned long flags;

	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	/*
	 * NOTE(review): the check is ">=" while the message text prints ">".
	 */
	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &tx->tx_ring[wr_id];

	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);

	/* FIXME: is this right? Shouldn't we only increment on success? */
	++dev->stats.tx_packets;
	dev->stats.tx_bytes += tx_req->skb->len;

	dev_kfree_skb_any(tx_req->skb);

	spin_lock_irqsave(&priv->tx_lock, flags);
	++tx->tx_tail;
	/*
	 * Wake the queue only at the moment the outstanding count reaches
	 * half the ring size, and only if the queue is currently stopped
	 * and the interface is administratively up.
	 */
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    netif_queue_stopped(dev) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		netif_wake_queue(dev);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR) {
		struct ipoib_neigh *neigh;

		ipoib_dbg(priv, "failed cm send event "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);

		/* priv->lock is taken nested inside priv->tx_lock here. */
		spin_lock(&priv->lock);
		neigh = tx->neigh;

		if (neigh) {
			/* Break the neigh <-> tx link and release the neigh. */
			neigh->cm = NULL;
			list_del(&neigh->list);
			if (neigh->ah)
				ipoib_put_ah(neigh->ah);
			ipoib_neigh_free(dev, neigh);

			tx->neigh = NULL;
		}

		/*
		 * Only the caller that clears IPOIB_FLAG_INITIALIZED moves
		 * the tx onto the reap list and schedules the reap work.
		 */
		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(ipoib_workqueue, &priv->cm.reap_task);
		}

		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);

		spin_unlock(&priv->lock);
	}

	spin_unlock_irqrestore(&priv->tx_lock, flags);
}

/*
 * Start connected-mode operation on a device: create a CM ID and listen
 * on the service ID formed from IPOIB_CM_IETF_ID and the device QP number.
 *
 * Returns 0 on success, or immediately 0 when IPOIB_CM_SUPPORTED() is
 * false for the device address. On failure returns the error from
 * ib_create_cm_id() or ib_cm_listen() and leaves priv->cm.id set to NULL.
 */
int ipoib_cm_dev_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int ret;

	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
		return 0;

	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
	if (IS_ERR(priv->cm.id)) {
		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
		ret = PTR_ERR(priv->cm.id);
		goto err_cm;
	}

	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
			   0, NULL);
	if (ret) {
		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
		goto err_listen;
	}

	return 0;

err_listen:
	ib_destroy_cm_id(priv->cm.id);
err_cm:
	/* Never leave an ERR_PTR or a destroyed ID behind in priv->cm.id. */
	priv->cm.id = NULL;
	return ret;
}

/*
 * Stop connected-mode operation on a device.
 *
 * Destroys the listening CM ID, moves the QP of every entry on
 * priv->cm.passive_ids to the error state, waits (up to 5 * HZ jiffies)
 * for the RX error/flush/drain lists to empty, then destroys the CM ID
 * and QP of, and frees, every entry collected for disposal. Finally
 * cancels priv->cm.stale_task.
 *
 * Does nothing when connected mode is not supported for the device
 * address or when no CM ID is present.
 */
void ipoib_cm_dev_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_rx *p, *n;
	unsigned long begin;
	LIST_HEAD(list);
	int ret;

	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
		return;

	ib_destroy_cm_id(priv->cm.id);
	priv->cm.id = NULL;

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		/*
		 * The lock is dropped around ib_modify_qp() (presumably
		 * because it may sleep - confirm); the loop re-reads the
		 * list head after re-taking it.
		 */
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	/* Wait for all RX to be drained */
	begin = jiffies;

	while (!list_empty(&priv->cm.rx_error_list) ||
	       !list_empty(&priv->cm.rx_flush_list) ||
	       !list_empty(&priv->cm.rx_drain_list)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "RX drain timing out\n");

			/*
			 * assume the HW is wedged and just free up everything.
			 */
			list_splice_init(&priv->cm.rx_flush_list, &list);
			list_splice_init(&priv->cm.rx_error_list, &list);
			list_splice_init(&priv->cm.rx_drain_list, &list);
			break;
		}
		/* Poll: sleep 1 ms unlocked, drain the CQ, then re-check. */
		spin_unlock_irq(&priv->lock);
		msleep(1);
		ipoib_drain_cq(dev);
		spin_lock_irq(&priv->lock);
	}

	list_splice_init(&priv->cm.rx_reap_list, &list);

	spin_unlock_irq(&priv->lock);

	/* Tear down everything collected on the private list, unlocked. */
	list_for_each_entry_safe(p, n, &list, list) {
		ib_destroy_cm_id(p->id);
		ib_destroy_qp(p->qp);
		kfree(p);
	}

	cancel_delayed_work(&priv->cm.stale_task);
}

/*
 * CM event handler step for an outgoing connection (a REP, judging by the
 * function name - the dispatching caller is not visible here).
 *
 * @cm_id: CM ID whose context is the ipoib_cm_tx being connected
 * @event: CM event; its private data is read as struct ipoib_cm_data
 *
 * Takes the peer's MTU from the private data, moves the TX QP to RTR and
 * then RTS, marks the connection IPOIB_FLAG_OPER_UP, re-submits any
 * packets queued on the neighbour via dev_queue_xmit(), and sends the RTU.
 *
 * Returns 0 on success, -EINVAL if the advertised MTU is not larger than
 * IPOIB_ENCAP_LEN, or the error from the failing verbs/CM call.
 */
static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	struct ipoib_cm_tx *p = cm_id->context;
	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
	struct ipoib_cm_data *data = event->private_data;
	struct sk_buff_head skqueue;
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;
	struct sk_buff *skb;

	p->mtu = be32_to_cpu(data->mtu);

	if (p->mtu <= IPOIB_ENCAP_LEN) {
		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
			   p->mtu, IPOIB_ENCAP_LEN);
		return -EINVAL;
	}

	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}

	qp_attr.rq_psn = 0 /* FIXME */;
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return ret;
	}

	skb_queue_head_init(&skqueue);

	/*
	 * Under priv->lock: mark the connection up and move the packets
	 * waiting on the neighbour's queue to a local list, so they can be
	 * re-submitted after the lock is released.
	 */
	spin_lock_irq(&priv->lock);
	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
	if (p->neigh)
		while ((skb = __skb_dequeue(&p->neigh->queue)))
			__skb_queue_tail(&skqueue, skb);
	spin_unlock_irq(&priv->lock);

	while ((skb = __skb_dequeue(&skqueue))) {
		skb->dev = p->dev;
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}

	ret = ib_send_cm_rtu(cm_id, NULL, 0);
	if (ret) {
		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
		return ret;
	}
	return 0;
}

/*
 * Create the RC queue pair used for sending on a connected-mode
 * connection.
 *
 * The QP uses priv->cq for both send and receive completions, is attached
 * to priv->cm.srq, allows ipoib_sendq_size send work requests with one
 * SGE each, signals every work request, and stores tx as its qp_context.
 *
 * Returns the result of ib_create_qp() unchanged; callers in this file
 * test it with IS_ERR().
 */
static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_init_attr attr = {
		.send_cq		= priv->cq,
		.recv_cq		= priv->cq,
		.srq			= priv->cm.srq,
		.cap.max_send_wr	= ipoib_sendq_size,
		.cap.max_send_sge	= 1,
		.sq_sig_type		= IB_SIGNAL_ALL_WR,
		.qp_type		= IB_QPT_RC,
		.qp_context		= tx
	};

	return ib_create_qp(priv->pd, &attr);
}

/*
 * Build and send the CM connection request for an outgoing connection.
 *
 * @dev:     net device initiating the connection
 * @id:      CM ID to send the request on
 * @qp:      local TX QP whose number and type are advertised
 * @qpn:     QP number combined with IPOIB_CM_IETF_ID to form the
 *           service ID being connected to
 * @pathrec: primary path record for the connection
 *
 * The private data carries the device QP number (priv->qp) and
 * IPOIB_CM_BUF_SIZE as the MTU, both in big-endian form.
 *
 * Returns whatever ib_send_cm_req() returns.
 */
static int ipoib_cm_send_req(struct net_device *dev,
			     struct ib_cm_id *id, struct ib_qp *qp,
			     u32 qpn,
			     struct ib_sa_path_rec *pathrec)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_cm_data data = {};
	struct ib_cm_req_param req = {};

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);

	req.primary_path = pathrec;
	req.alternate_path = NULL;
	req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
	req.qp_num = qp->qp_num;
	req.qp_type = qp->qp_type;
	req.private_data = &data;
	req.private_data_len = sizeof data;
	req.flow_control = 0;

	req.starting_psn = 0; /* FIXME */

	/*
	 * Pick some arbitrary defaults here; we could make these
	 * module parameters if anyone cared about setting them.
	 */
	req.responder_resources = 4;
	req.remote_cm_response_timeout = 20;
	req.local_cm_response_timeout = 20;
	req.retry_count = 0; /* RFC draft warns against retries */
	req.rnr_retry_count = 0; /* RFC draft warns against retries */
	req.max_cm_retries = 15;
	req.srq = 1;
	return ib_send_cm_req(id, &req);
}

/*
 * Move a freshly created TX QP to the INIT state.
 *
 * Looks up the P_Key index for priv->pkey on priv->port, then sets the QP
 * state, access flags (local write), P_Key index and port in one
 * ib_modify_qp() call. Only the qp_attr fields named in qp_attr_mask are
 * initialized. The cm_id parameter is not used in this function body.
 *
 * Returns 0 on success or the error from ib_find_cached_pkey() /
 * ib_modify_qp().
 */
static int ipoib_cm_modify_tx_init(struct net_device *dev,
				  struct ib_cm_id *cm_id, struct ib_qp *qp)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;
	ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
	if (ret) {
		ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
	qp_attr.port_num = priv->port;
	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;

	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
		return ret;
	}
	return 0;
}

/*
 * NOTE(review): this function continues past the end of the visible
 * source (its error labels and return are not shown), so it is kept as-is
 * and not documented in full.
 */
static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
			    struct ib_sa_path_rec *pathrec)
{
	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
	int ret;

	p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring,
				GFP_KERNEL);
	if (!p->tx_ring) {
		ipoib_warn(priv, "failed to allocate tx ring\n");
		ret = -ENOMEM;
		goto err_tx;
	}

	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
		goto err_qp;
	}

	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
	if (IS_ERR(p->id)) {
		ret = PTR_ERR(p->id);
		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
		goto err_id;
	}

	ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp);
	if (ret) {
		/*
		 * NOTE(review): the message says "rtr", but
		 * ipoib_cm_modify_tx_init() moves the QP to INIT.
		 */
		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
		goto err_modify;
	}

	ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
	if (ret) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -