Add some comments, notes and TODOs about ->state_lock and RTNL. Signed-off-by: Sukadev Bhattiprolu <suka...@linux.ibm.com> --- Note: This is fixing lot of comments so not identifying fixes. It "seems" to fit this patch set but can send to net-next if necessary.
drivers/net/ethernet/ibm/ibmvnic.c | 58 ++++++++++++++++++++++++++++++ drivers/net/ethernet/ibm/ibmvnic.h | 51 +++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 236ec2456a38..1aae730ddafd 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1202,6 +1202,14 @@ static int ibmvnic_open(struct net_device *netdev) /* If device failover is pending, just set device state and return. * Device operation will be handled by reset routine. + * + * Note that ->failover_pending is not protected by ->state_lock + * because the tasklet (executing ibmvnic_handle_crq()) cannot + * block. Even otherwise this can deadlock due to CRQs issued in + * ibmvnic_open(). + * + * We check failover_pending again at the end in case of errors. + * so its okay if we miss the change to true here. */ if (adapter->failover_pending) { adapter->state = VNIC_OPEN; @@ -1380,6 +1388,9 @@ static int ibmvnic_close(struct net_device *netdev) /* If device failover is pending, just set device state and return. * Device operation will be handled by reset routine. + * + * Note that ->failover_pending is not protected by ->state_lock + * See comments in ibmvnic_open(). */ if (adapter->failover_pending) { adapter->state = VNIC_CLOSED; @@ -1930,6 +1941,14 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; + /* + * TODO: Can this race with a reset? The reset could briefly + * set state to PROBED causing us to skip setting the + * mac address. When reset complets, we set the old mac + * address? Can we check ->resetting bit instead and + * save the new mac address in adapter->mac_addr + * so reset function can set it when it is done? + */ if (adapter->state != VNIC_PROBED) { ether_addr_copy(adapter->mac_addr, addr->sa_data); rc = __ibmvnic_set_mac(netdev, addr->sa_data); @@ -1941,6 +1960,14 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) /** * do_change_param_reset returns zero if we are able to keep processing reset * events, or non-zero if we hit a fatal error and must halt. + * + * Notes: + * - Regardless of success/failure, this function restores adapter state + * to what as it was on entry. In case of failure, it is assumed that + * a new hard-reset will be attempted. + * - Caller must hold the rtnl lock before calling and release upon + * return. + * */ static int do_change_param_reset(struct ibmvnic_adapter *adapter, enum ibmvnic_reset_reason reason) @@ -2039,6 +2066,11 @@ static int do_change_param_reset(struct ibmvnic_adapter *adapter, /** * do_reset returns zero if we are able to keep processing reset events, or * non-zero if we hit a fatal error and must halt. + * + * Notes: + * - Regardless of success/failure, this function restores adapter state + * to what as it was on entry. In case of failure, it is assumed that + * a new hard-reset will be attempted. */ static int do_reset(struct ibmvnic_adapter *adapter, enum ibmvnic_reset_reason reason) @@ -2237,6 +2269,17 @@ static int do_reset(struct ibmvnic_adapter *adapter, return rc; } +/** + * Perform a hard reset possibly because a prior reset encountered + * an error. + * + * Notes: + * - Regardless of success/failure, this function restores adapter state + * to what as it was on entry. In case of failure, it is assumed that + * a new hard-reset will be attempted. + * - Caller must hold the rtnl lock before calling and release upon + * return. + */ static int do_hard_reset(struct ibmvnic_adapter *adapter, enum ibmvnic_reset_reason reason) { @@ -2651,6 +2694,11 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget) frames_processed++; } + /* TODO: Can this race with reset and/or is release_rx_pools()? + * Is that why we check for VNIC_CLOSING? What if we go to + * CLOSING just after we check? We cannot take ->state_lock + * since we are in interrupt context. + */ if (adapter->state != VNIC_CLOSING && ((atomic_read(&adapter->rx_pool[scrq_num].available) < adapter->req_rx_add_entries_per_subcrq / 2) || @@ -5358,6 +5406,9 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset) } if (adapter->from_passive_init) { + /* ibmvnic_reset_init() is always called with ->state_lock + * held except from ibmvnic_probe(), so safe to update state. + */ adapter->state = VNIC_OPEN; adapter->from_passive_init = false; return -1; @@ -5531,6 +5582,9 @@ static int ibmvnic_remove(struct vio_dev *dev) adapter->state = VNIC_REMOVING; spin_unlock_irqrestore(&adapter->remove_lock, rmflags); + /* drop state_lock so __ibmvnic_reset() can make progress + * during flush_work() + */ mutex_unlock(&adapter->state_lock); flush_work(&adapter->ibmvnic_reset); @@ -5546,6 +5600,9 @@ static int ibmvnic_remove(struct vio_dev *dev) release_stats_token(adapter); release_stats_buffers(adapter); + /* Adapter going away. There better be no one checking ->state + * or getting state_lock now TODO: Do we need the REMOVED state? + */ adapter->state = VNIC_REMOVED; mutex_destroy(&adapter->state_lock); rtnl_unlock(); @@ -5627,6 +5684,7 @@ static int ibmvnic_resume(struct device *dev) struct net_device *netdev = dev_get_drvdata(dev); struct ibmvnic_adapter *adapter = netdev_priv(netdev); + /* resuming from power-down so ignoring state_lock */ if (adapter->state != VNIC_OPEN) return 0; diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index ac79dfa76333..d79bc9444c9f 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -963,6 +963,55 @@ struct ibmvnic_tunables { u64 mtu; }; +/** + * ->state_lock: + * Mutex to serialize read/write of adapter->state field specially + * between open and reset functions. If rtnl also needs to be held, + * acquire rtnl first and then state_lock (to be consistent with + * functions that enter ibmvnic with rtnl already held). + * + * In general, ->state_lock must be held for all read/writes to the + * state field. Exceptions are: + * - checks for VNIC_PROBING state - since the adapter is itself + * under construction and because we never go _back_ to PROBING. + * + * - in debug messages involving ->state + * + * - in ibmvnic_tx_interrupt(), ibmvnic_rx_interrupt() because + * a) this is a mutex and b) no (known) impact of getting a stale + * state (i.e we will likely recover on the next interrupt). + * + * - ibmvnic_resume() - we are resuming from a power-down state? + * + * - ibmvnic_reset() - see ->remove_lock below. + * + * Besides these, there are couple of TODOs in ibmvnic_poll() and + * ibmvnic_set_mac() that need to be investigated separately. + * + * ->remove_lock + * A spin lock used to serialize ibmvnic_reset() and ibmvnic_remove(). + * ibmvnic_reset() can be called from a tasklet which cannot block, + * so it cannot use the ->state_lock. The only states ibmvnic_reset() + * checks for are PROBING, REMOVING and REMOVED. PROBING can be ignored + * as mentioned above. On entering REMOVING state, ibmvnic_reset() + * will skip scheduling resets for the adapter. + * + * ->pending_resets[], ->next_reset: + * A "queue" of pending resets, implemented as a simple array. Resets + * are not frequent and even when they do occur, we will usually have + * just 1 or 2 entries in the queue at any time. Note that we don't + * need/allow duplicates in the queue. In the worst case, we would have + * VNIC_RESET_MAX-1 entries (but that means adapter is processing all + * valid types of resets at once!) so the slight overhead of the array + * is probably acceptable. + * + * We could still use a linked list but then have to deal with allocation + * failure when scheduling a reset. We sometimes enqueue reset from a + * tasklet so cannot block when we have allocation failure. Trying to + * close the adapter on failure requires us to hold the state_lock, which + * then cannot be a mutex (tasklet cannot block) - i.e complicates locking + * just for the occasional memory allocation failure. + */ struct ibmvnic_adapter { struct vio_dev *vdev; struct net_device *netdev; @@ -1098,6 +1147,6 @@ struct ibmvnic_adapter { struct ibmvnic_tunables desired; struct ibmvnic_tunables fallback; - /* Used for serialization of state field */ + /* see block comments above */ struct mutex state_lock; }; -- 2.26.2