The branch main has been updated by bz:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=b8dfc3ecf7031a0a7764cdb67d85ebe0c03d5d93

commit b8dfc3ecf7031a0a7764cdb67d85ebe0c03d5d93
Author:     Bjoern A. Zeeb <b...@freebsd.org>
AuthorDate: 2025-03-06 13:17:19 +0000
Commit:     Bjoern A. Zeeb <b...@freebsd.org>
CommitDate: 2025-03-06 23:02:08 +0000

    LinuxKPI: 802.11: improve key update locking to work around net80211
    
    As indicated in 11db70b6057e there was another panic on key removal
    which could no longer be reproduced.  As originally assumed the
    problem was "hidden" by commit 9763fec11b83 as mentioned in
    11db70b6057e.
    Said commit had logic inverted and 27bf5c405bf2 fixed that and with
    that the possible panic came back.
    
    The problem exists because some code paths out of net80211 are
    locked while others are not. This opens a possible race in
    net80211 which was tracked by extra logging in
    (*iv_key_update_begin)() (log lines shortend):
            key_update_begin: tid 100112 vap X nt Y unlocked
            key_update_begin: tid 100133 vap X nt Y locked
    One thread can be wpa_supplicant, the other is driven from the
    driver net80211 taskq.
    Further LinuxKPI needs to unlock (conditionally in case the lock
    is held) as a downcall to the driver/FW may sleep.  This opens up
    possibilities for said race even further so that we observe it
    more reliably.
    
    This all leads to one thread calling down into the driver/firmware
    (unlocked) while the other will get to the same place (after acquiring
    the wiphy lock) before the nt re-lock happens and thus state checks
    did not catch this either.
    
    For LinuxKPI work around the problem utilizing
    (*iv_key_update_begin/end)() and taking the wiphy_lock() there
    holding it over the entire operation.
    Given we still have to conditionally unlock we need to keep track
    from _begin to _end on whether we have to re-lock.  The checks for
    this need to be done under the wiphy_lock().
    While a bool would suffice we use a refcount to make any future
    debugging easier.
    
    This isn't the most elegant solution but having the wiphy lock
    covering the full operation allows the 2nd thread to later run through
    the same code path and find the key gone (which we already checked).
    It remains questionable if (*iv_key_update_begin/end)() is the
    correct solution (as there are futher callers covering which would
    not need the unlock cycle) or if it should be done in the current
    code.  The former will allow us to cover a full key store which
    we will need in case we will implement suspend/resume beyond what
    is done in native drivers/net80211 currently, if we will factor
    out the crypto locking for good, and fix the inconsistent locking
    of the nt (NODE) lock in net80211.
    
    Alternate solutions were discussed on freebsd-wireless today
    (2025-03-06, in the thread "Re: HEADS UP! Do not update on main
    currently (panic - on boot)").
    
    Sponsored by:   The FreeBSD Foundation
    MFC after:      3 days
    X-MFC with:     27bf5c405bf2
    Differential Revision: https://reviews.freebsd.org/D49256
---
 sys/compat/linuxkpi/common/src/linux_80211.c | 99 +++++++++++++++++++++++-----
 sys/compat/linuxkpi/common/src/linux_80211.h |  2 +
 2 files changed, 85 insertions(+), 16 deletions(-)

diff --git a/sys/compat/linuxkpi/common/src/linux_80211.c 
b/sys/compat/linuxkpi/common/src/linux_80211.c
index be0006769e33..ec1798d2e886 100644
--- a/sys/compat/linuxkpi/common/src/linux_80211.c
+++ b/sys/compat/linuxkpi/common/src/linux_80211.c
@@ -1135,9 +1135,7 @@ _lkpi_iv_key_delete(struct ieee80211vap *vap, const 
struct ieee80211_key *k)
        struct ieee80211_sta *sta;
        struct ieee80211_node *ni;
        struct ieee80211_key_conf *kc;
-       struct ieee80211_node_table *nt;
        int error;
-       bool islocked;
 
        ic = vap->iv_ic;
        lhw = ic->ic_softc;
@@ -1172,13 +1170,6 @@ _lkpi_iv_key_delete(struct ieee80211vap *vap, const 
struct ieee80211_key *k)
                return (1);
        }
 
-       /* This is inconsistent net80211 locking to be fixed one day. */
-       nt = &ic->ic_sta;
-       islocked = IEEE80211_NODE_IS_LOCKED(nt);
-       if (islocked)
-               IEEE80211_NODE_UNLOCK(nt);
-
-       wiphy_lock(hw->wiphy);
        kc = lsta->kc[k->wk_keyix];
        /* Re-check under lock. */
        if (kc == NULL) {
@@ -1210,9 +1201,6 @@ _lkpi_iv_key_delete(struct ieee80211vap *vap, const 
struct ieee80211_key *k)
        free(kc, M_LKPI80211);
        error = 1;
 out:
-       wiphy_unlock(hw->wiphy);
-       if (islocked)
-               IEEE80211_NODE_LOCK(nt);
        ieee80211_free_node(ni);
        return (error);
 }
@@ -1262,7 +1250,6 @@ _lkpi_iv_key_set(struct ieee80211vap *vap, const struct 
ieee80211_key *k)
        }
        sta = LSTA_TO_STA(lsta);
 
-       wiphy_lock(hw->wiphy);
        if (lsta->kc[k->wk_keyix] != NULL) {
                IMPROVE("Still in firmware? Del first. Can we assert this 
cannot happen?");
                ic_printf(ic, "%s: sta %6D found with key information\n",
@@ -1283,7 +1270,6 @@ _lkpi_iv_key_set(struct ieee80211vap *vap, const struct 
ieee80211_key *k)
                ic_printf(ic, "%s: CIPHER SUITE %#x (%s) not supported\n",
                    __func__, lcipher, lkpi_cipher_suite_to_name(lcipher));
                IMPROVE();
-               wiphy_unlock(hw->wiphy);
                ieee80211_free_node(ni);
                return (0);
        }
@@ -1324,7 +1310,6 @@ _lkpi_iv_key_set(struct ieee80211vap *vap, const struct 
ieee80211_key *k)
                    __func__, SET_KEY, "SET", sta->addr, ":", error);
                lsta->kc[k->wk_keyix] = NULL;
                free(kc, M_LKPI80211);
-               wiphy_unlock(hw->wiphy);
                ieee80211_free_node(ni);
                return (0);
        }
@@ -1337,7 +1322,6 @@ _lkpi_iv_key_set(struct ieee80211vap *vap, const struct 
ieee80211_key *k)
                    kc, kc->keyidx, kc->hw_key_idx, kc->flags);
 #endif
 
-       wiphy_unlock(hw->wiphy);
        ieee80211_free_node(ni);
        return (1);
 }
@@ -1348,6 +1332,86 @@ lkpi_iv_key_set(struct ieee80211vap *vap, const struct 
ieee80211_key *k)
 
        return (_lkpi_iv_key_set(vap, k));
 }
+
+static void
+lkpi_iv_key_update_begin(struct ieee80211vap *vap)
+{
+       struct ieee80211_node_table *nt;
+       struct ieee80211com *ic;
+       struct lkpi_hw *lhw;
+       struct ieee80211_hw *hw;
+       struct lkpi_vif *lvif;
+       bool islocked;
+
+       ic = vap->iv_ic;
+       lhw = ic->ic_softc;
+       hw = LHW_TO_HW(lhw);
+       lvif = VAP_TO_LVIF(vap);
+       nt = &ic->ic_sta;
+
+       islocked = IEEE80211_NODE_IS_LOCKED(nt);
+
+#ifdef LINUXKPI_DEBUG_80211
+       if (linuxkpi_debug_80211 & D80211_TRACE_HW_CRYPTO)
+               ic_printf(vap->iv_ic, "%s: tid %d vap %p nt %p %slocked "
+                   "lvif nt_unlocked %d\n", __func__, curthread->td_tid,
+                   vap, nt, islocked ? "" : "un", lvif->nt_unlocked);
+#endif
+
+       /* This is inconsistent net80211 locking to be fixed one day. */
+       if (islocked)
+               IEEE80211_NODE_UNLOCK(nt);
+
+       wiphy_lock(hw->wiphy);
+
+       /*
+        * nt_unlocked could be a bool given we are under the lock and there
+        * must only be a single thread.
+        * In case anything in the future disturbs the order the refcnt will
+        * help us catching problems a lot easier.
+        */
+       if (islocked)
+               refcount_acquire(&lvif->nt_unlocked);
+}
+
+static void
+lkpi_iv_key_update_end(struct ieee80211vap *vap)
+{
+       struct ieee80211_node_table *nt;
+       struct ieee80211com *ic;
+       struct lkpi_hw *lhw;
+       struct ieee80211_hw *hw;
+       struct lkpi_vif *lvif;
+       bool islocked;
+
+       ic = vap->iv_ic;
+       lhw = ic->ic_softc;
+       hw = LHW_TO_HW(lhw);
+       lvif = VAP_TO_LVIF(vap);
+       nt = &ic->ic_sta;
+
+       islocked = IEEE80211_NODE_IS_LOCKED(nt);
+       MPASS(!islocked);
+
+#ifdef LINUXKPI_DEBUG_80211
+       if (linuxkpi_debug_80211 & D80211_TRACE_HW_CRYPTO)
+               ic_printf(vap->iv_ic, "%s: tid %d vap %p nt %p %slocked "
+                   "lvif nt_unlocked %d\n", __func__, curthread->td_tid,
+                   vap, nt, islocked ? "" : "un", lvif->nt_unlocked);
+#endif
+
+       /*
+        * Check under lock; see comment in lkpi_iv_key_update_begin().
+        * In case the refcnt gets out of sync locking in net80211 will
+        * quickly barf as well (trying to unlock a lock not held).
+        */
+       islocked = refcount_release_if_last(&lvif->nt_unlocked);
+       wiphy_unlock(hw->wiphy);
+
+       /* This is inconsistent net80211 locking to be fixed one day. */
+       if (islocked)
+               IEEE80211_NODE_LOCK(nt);
+}
 #endif
 
 static u_int
@@ -3413,6 +3477,7 @@ lkpi_ic_vap_create(struct ieee80211com *ic, const char 
name[IFNAMSIZ],
        mtx_init(&lvif->mtx, "lvif", NULL, MTX_DEF);
        INIT_LIST_HEAD(&lvif->lsta_list);
        lvif->lvif_bss = NULL;
+       refcount_init(&lvif->nt_unlocked, 0);
        lvif->lvif_bss_synched = false;
        vap = LVIF_TO_VAP(lvif);
 
@@ -3545,6 +3610,8 @@ lkpi_ic_vap_create(struct ieee80211com *ic, const char 
name[IFNAMSIZ],
        if (lkpi_hwcrypto && lhw->ops->set_key != NULL) {
                vap->iv_key_set = lkpi_iv_key_set;
                vap->iv_key_delete = lkpi_iv_key_delete;
+               vap->iv_key_update_begin = lkpi_iv_key_update_begin;
+               vap->iv_key_update_end = lkpi_iv_key_update_end;
        }
 #endif
 
diff --git a/sys/compat/linuxkpi/common/src/linux_80211.h 
b/sys/compat/linuxkpi/common/src/linux_80211.h
index b17e6072066c..c2d29b2dcc4b 100644
--- a/sys/compat/linuxkpi/common/src/linux_80211.h
+++ b/sys/compat/linuxkpi/common/src/linux_80211.h
@@ -186,6 +186,8 @@ struct lkpi_vif {
        struct list_head        lsta_list;
 
        struct lkpi_sta         *lvif_bss;
+
+       int                     nt_unlocked;                    /* Count of nt 
unlocks pending (*mo_set_key) */
        bool                    lvif_bss_synched;
        bool                    added_to_drv;                   /* Driver 
knows; i.e. we called add_interface(). */
 

Reply via email to