fff8100bc8e>] ? apic_timer_interrupt+0xe/0x20
 [] ? ipoib_path_lookup+0x124/0x2d0 [ib_ipoib]
 [] ? ipoib_start_xmit+0x17c/0x430 [ib_ipoib]
 [] ? dev_hard_start_xmit+0x2c8/0x3f0
 [] ? sch_direct_xmit+0x15a/0x1c0
 [] ? dev_queue_xmit+0x388/0x4d0
 [] ? ipoib_mcast_join_finish+0x2c7/0x510 [ib_ipoib]
 [] ? ipoib_mcast_sendonly_join_complete+0x1b8/0x1f0 [ib_ipoib]
 [] ? mcast_work_handler+0x1a6/0x710 [ib_sa]
 [] ? ib_send_mad+0xfe/0x3c0 [ib_mad]
 [] ? ib_get_cached_lmc+0xa3/0xb0 [ib_core]
 [] ? join_handler+0xeb/0x200 [ib_sa]
 [] ? ib_sa_mcmember_rec_callback+0x5c/0xa0 [ib_sa]
 [] ? recv_handler+0x3c/0x70 [ib_sa]
 [] ? ib_mad_completion_handler+0x844/0x9d0 [ib_mad]
 [] ? ib_mad_completion_handler+0x0/0x9d0 [ib_mad]
 [] ? worker_thread+0x170/0x2a0
 [] ? autoremove_wake_function+0x0/0x40
 [] ? worker_thread+0x0/0x2a0
 [] ? kthread+0x96/0xa0
 [] ? child_rip+0xa/0x20

Coinciding with the stack trace is the following message:

    ib0: ib_address_create failed

The code below in ipoib_mcast_join_finish() logs the above failure to create the address handle but otherwise continues:

    ah = ipoib_create_ah(dev, priv->pd, &av);
    if (!ah) {
            ipoib_warn(priv, "ib_address_create failed\n");
    } else {

The while loop at the bottom of ipoib_mcast_join_finish() will attempt to send the queued multicast packets in mcast->pkt_queue and eventually end up in ipoib_mcast_send():

    if (!mcast->ah) {
            if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
                    skb_queue_tail(&mcast->pkt_queue, skb);
            else {
                    ++dev->stats.tx_dropped;
                    dev_kfree_skb_any(skb);
            }

My read is that the code will requeue the packet and return to the while loop in ipoib_mcast_join_finish(), which sets the stage for the "hung" task diagnostic: the while loop never sees a non-NULL ah and does nothing to resolve the condition.

There are GFP_ATOMIC allocations in the provider routines, so this failure is possible and should be dealt with.

The test that induced the failure is associated with a host SM on the same server during a shutdown.
This patch causes ipoib_mcast_join_finish() to exit with an error, which will flush the queued mcast packets. Nothing is done to unwind the QP attached state, so that subsequent sends from above will retry the join.

Reviewed-by: Ram Vepa
Reviewed-by: Gary Leshner
Signed-off-by: Mike Marciniszyn
Signed-off-by: Roland Dreier