From: AMEET M. PARANJAPE <aparanja@redhat.com> Date: Mon, 27 Oct 2008 14:19:58 -0500 Subject: [openib] race in ipoib_cm_post_receive_nonsrq Message-id: 4906145E.6030706@REDHAT.COM O-Subject: Re: [Fwd: Re: [PATCH RHEL5.3 BZ463485] Race in ipoib_cm_post_receive_nonsrq()] Bugzilla: 463485 RH-Acked-by: Doug Ledford <dledford@redhat.com> RH-Acked-by: David Howells <dhowells@redhat.com> RHBZ# ====== https://bugzilla.redhat.com/show_bug.cgi?id=463485 Description: =========== If a customer is running with ehca and has enabled Connected Mode (CM), they will see random panics in the kernel because of a race condition. This patch eliminates the race over the shared receive Work Request/Scatter Gather List (WR/SGL) between ipoib_cm_post_receive_nonsrq() and ipoib_cm_nonsrq_init_rx(). RHEL Version Found: ================ RHEL 5.2 kABI Status: ============ No symbols were harmed. Brew: ===== Built on all platforms. http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1500347 Upstream Status: ================ Here is the community posting: http://lists.openfabrics.org/pipermail/general/2008-June/052233.html Test Status: ============ This patch was tested on two separate systems running multiple netperfs with ib_ipoib set to Connected mode. The test ran for 24 hours without error. Without the patch the test fails in less than 2 hours every time. 
=============================================================== diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 5eba8af..5f87d20 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -145,18 +145,20 @@ static int ipoib_cm_post_receive_srq(struct net_device *dev, int id, int pi) } static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, - struct ipoib_cm_rx *rx, int id) + struct ipoib_cm_rx *rx, + struct ib_recv_wr *wr, + struct ib_sge *sge, int id) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_recv_wr *bad_wr; int i, ret; - priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; for (i = 0; i < IPOIB_CM_RX_SG; ++i) - priv->cm.rx_sge[i].addr = rx->rx_ring[id].mapping[i]; + sge[i].addr = rx->rx_ring[id].mapping[i]; - ret = ib_post_recv(rx->qp, &priv->cm.rx_wr, &bad_wr); + ret = ib_post_recv(rx->qp, wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, @@ -357,10 +359,35 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev, return 0; } +static void ipoib_cm_init_rx_wr(struct net_device *dev, +struct ib_recv_wr *wr, +struct ib_sge *sge) +{ + + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < priv->cm.num_frags; ++i) + sge[i].lkey = priv->mr->lkey; + + sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < priv->cm.num_frags; ++i) + sge[i].length = PAGE_SIZE; + + wr->next = NULL; + wr->sg_list = sge; + wr->num_sge = priv->cm.num_frags; +} + static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx) { struct ipoib_dev_priv *priv = netdev_priv(dev); + struct { + struct ib_recv_wr wr; + struct ib_sge sge[IPOIB_CM_RX_SG]; + } *t; + int ret; int i; @@ -370,6 +397,13 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device 
*dev, struct ib_cm_id *cm_i "try using a lower value of recv_queue_size."); return -ENOMEM; } + t = kmalloc(sizeof *t, GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_free_ring; + } + + ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); spin_lock_irq(&priv->lock); @@ -389,8 +423,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; - } - ret = ipoib_cm_post_receive_nonsrq(dev, rx, i); + } + ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); if (ret) { ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " "failed for buf %d\n", i); @@ -401,6 +435,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i rx->recv_count = ipoib_recvq_size; + kfree(t); + return 0; err_count: @@ -409,6 +445,8 @@ err_count: spin_unlock_irq(&priv->lock); err_free: + kfree(t); +err_free_ring: ipoib_cm_free_rx_ring(dev, rx->rx_ring); return ret; @@ -677,7 +715,10 @@ repost: ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " "for buf %d\n", wr_id); } else { - if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) { + if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + &priv->cm.rx_wr, + priv->cm.rx_sge, + wr_id))) { --p->recv_count; ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed " "for buf %d\n", wr_id); @@ -1522,7 +1563,7 @@ destory_srq: int ipoib_cm_dev_init(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - int i, ret, j; + int i, ret; struct ib_device_attr attr; INIT_LIST_HEAD(&priv->cm.passive_ids); @@ -1560,30 +1601,7 @@ int ipoib_cm_dev_init(struct net_device *dev) priv->cm.num_frags = IPOIB_CM_RX_SG; } - for (i = 0; i < priv->cm.num_frags; ++i) - priv->cm.rx_sge[i].lkey = priv->mr->lkey; - - if (ipoib_cm_has_srq(dev)) { - for (j = 0; j < ipoib_recvq_size; ++j) { - for (i = 0; i < priv->cm.num_frags; ++i) - priv->cm.rx_wr_arr[j].rx_sge[i].lkey = priv->mr->lkey; - - 
priv->cm.rx_wr_arr[j].rx_sge[0].length = IPOIB_CM_HEAD_SIZE; - for (i = 1; i < priv->cm.num_frags; ++i) - priv->cm.rx_wr_arr[j].rx_sge[i].length = PAGE_SIZE; - - priv->cm.rx_wr_arr[j].wr.sg_list = priv->cm.rx_wr_arr[j].rx_sge; - priv->cm.rx_wr_arr[j].wr.num_sge = priv->cm.num_frags; - } - priv->cm.head = &priv->cm.rx_wr_arr[0]; - } - - priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE; - for (i = 1; i < priv->cm.num_frags; ++i) - priv->cm.rx_sge[i].length = PAGE_SIZE; - priv->cm.rx_wr.next = NULL; - priv->cm.rx_wr.sg_list = priv->cm.rx_sge; - priv->cm.rx_wr.num_sge = priv->cm.num_frags; + ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge); if (ipoib_cm_has_srq(dev)) { for (i = 0; i < ipoib_recvq_size; ++i) {