The branch main has been updated by mav:

URL: https://cgit.FreeBSD.org/src/commit/?id=aefe0a8c32d370f2fdd0d0771eb59f8845beda17

commit aefe0a8c32d370f2fdd0d0771eb59f8845beda17
Author:     Alexander Motin <m...@freebsd.org>
AuthorDate: 2021-07-29 01:18:50 +0000
Commit:     Alexander Motin <m...@freebsd.org>
CommitDate: 2021-07-29 02:00:29 +0000

    Refactor/optimize cpu_search_*().
    
    Remove cpu_search_both(), unused for many years.  Without it there is
    little point to the trick of compiling the common cpu_search() into
    separate cpu_search_lowest() and cpu_search_highest(), so split them
    completely, making the code more readable.  While there, split the
    iteration over child groups from the iteration over CPUs; the combined
    loop complicated the code for very little deduplication.
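    
    For reference, a minimal standalone sketch of that trick (hypothetical
    names, not the scheduler code; plain static inline stands in for the
    kernel's __always_inline): one inlined body takes a constant selector,
    and each thin wrapper passes a different constant so the compiler drops
    the dead branches in every instantiation.
    
        #include <limits.h>
    
        /* Common body; "match" is always a compile-time constant. */
        static inline int
        search(const int *v, int n, const int match)
        {
                int best, i;
    
                best = (match == 1) ? INT_MAX : INT_MIN;
                for (i = 0; i < n; i++) {
                        if (match == 1 && v[i] < best)  /* lowest variant */
                                best = v[i];
                        if (match == 2 && v[i] > best)  /* highest variant */
                                best = v[i];
                }
                return (best);
        }
    
        int search_lowest(const int *v, int n)  { return (search(v, n, 1)); }
        int search_highest(const int *v, int n) { return (search(v, n, 2)); }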
    
    Stop passing cpuset_t arguments by value and avoid some manipulations.
    Since the MAXCPU bump from 64 to 256, what used to fit in a single
    register has become a 32-byte memory array, requiring memory allocation
    and accesses.  Splitting struct cpu_search into parameter and result
    parts reduces stack usage even further, since the former can be passed
    unchanged through the recursion.
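    
    The split itself appears in the diff below; as a standalone illustration
    of the pattern only (hypothetical tree types, not the scheduler's), the
    read-only parameters travel down the recursion as a single pointer while
    each stack frame holds just a small result struct:
    
        #include <limits.h>
    
        struct node {
                int             value;          /* only leaves carry values */
                int             nchildren;
                struct node     *child;         /* array of child nodes */
        };
    
        struct search_param {                   /* read-only, shared */
                int             limit;          /* accept values <= limit */
        };
    
        struct search_res {                     /* small, one per frame */
                int             best;           /* INT_MAX if none found */
        };
    
        static void
        search_lowest(const struct node *n, const struct search_param *p,
            struct search_res *r)
        {
                struct search_res cr;
                int c;
    
                r->best = (n->nchildren == 0 && n->value <= p->limit) ?
                    n->value : INT_MAX;
                for (c = 0; c < n->nchildren; c++) {
                        /* The same "p" pointer is reused on every level. */
                        search_lowest(&n->child[c], p, &cr);
                        if (cr.best < r->best)
                                r->best = cr.best;
                }
        }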
    
    Remove CPU_FFS() from the hot paths, precalculating the first and last
    CPU of each CPU group during initialization.  Again, this was not a
    problem with 64 CPUs, but with 256 CPUs FFS requires much more code.
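    
    A rough standalone illustration of why the precalculation helps,
    assuming a 256-bit mask stored as four 64-bit words and FreeBSD's
    ffsl()/flsl() from <strings.h> (not the actual cpuset code): the bounds
    are found once at init time, and the hot loop becomes a plain bounded
    scan with a cheap per-bit test.
    
        #include <strings.h>                    /* ffsl(), flsl() */
    
        #define MASK_WORDS      4               /* 256 bits, 64-bit words */
    
        struct group {
                unsigned long   mask[MASK_WORDS];
                int             first, last;    /* precomputed bounds */
        };
    
        /* Init time: find the lowest and highest set bit once. */
        static void
        group_fill_bounds(struct group *g)
        {
                int b, w;
    
                g->first = g->last = -1;
                for (w = 0; w < MASK_WORDS; w++) {
                        if (g->mask[w] == 0)
                                continue;
                        b = w * 64;
                        if (g->first == -1)
                                g->first = b + ffsl((long)g->mask[w]) - 1;
                        g->last = b + flsl((long)g->mask[w]) - 1;
                }
        }
    
        /* Hot path: a bounded loop, no word-by-word FFS per step. */
        static int
        group_count_set(const struct group *g)
        {
                int c, n;
    
                n = 0;
                for (c = g->first; c >= 0 && c <= g->last; c++)
                        if (g->mask[c / 64] & (1UL << (c % 64)))
                                n++;
                return (n);
        }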
    
    With these changes, on an 80-thread system doing ~260K uncached ZFS
    reads per second, I observe a ~30% reduction in the time spent in
    cpu_search_*().
    
    MFC after:      1 month
---
 sys/kern/sched_ule.c | 290 +++++++++++++++++++++------------------------------
 sys/kern/subr_smp.c  |  12 +++
 sys/sys/smp.h        |   2 +
 3 files changed, 134 insertions(+), 170 deletions(-)

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 094a3fffef8c..3bb73d64a70c 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -631,170 +631,120 @@ sched_random(void)
 }
 
 struct cpu_search {
-       cpuset_t cs_mask;
+       cpuset_t *cs_mask;
        u_int   cs_prefer;
        int     cs_pri;         /* Min priority for low. */
        int     cs_limit;       /* Max load for low, min load for high. */
+};
+
+struct cpu_search_res {
        int     cs_cpu;
        int     cs_load;
 };
 
-#define        CPU_SEARCH_LOWEST       0x1
-#define        CPU_SEARCH_HIGHEST      0x2
-#define        CPU_SEARCH_BOTH         (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
-
-static __always_inline int cpu_search(const struct cpu_group *cg,
-    struct cpu_search *low, struct cpu_search *high, const int match);
-int __noinline cpu_search_lowest(const struct cpu_group *cg,
-    struct cpu_search *low);
-int __noinline cpu_search_highest(const struct cpu_group *cg,
-    struct cpu_search *high);
-int __noinline cpu_search_both(const struct cpu_group *cg,
-    struct cpu_search *low, struct cpu_search *high);
-
-/*
- * Search the tree of cpu_groups for the lowest or highest loaded cpu
- * according to the match argument.  This routine actually compares the
- * load on all paths through the tree and finds the least loaded cpu on
- * the least loaded path, which may differ from the least loaded cpu in
- * the system.  This balances work among caches and buses.
- *
- * This inline is instantiated in three forms below using constants for the
- * match argument.  It is reduced to the minimum set for each case.  It is
- * also recursive to the depth of the tree.
- */
-static __always_inline int
-cpu_search(const struct cpu_group *cg, struct cpu_search *low,
-    struct cpu_search *high, const int match)
-{
-       struct cpu_search lgroup;
-       struct cpu_search hgroup;
-       cpuset_t cpumask;
-       struct cpu_group *child;
+/*
+ * Search the tree of cpu_groups for the lowest or highest loaded CPU.
+ * These routines actually compare the load on all paths through the tree
+ * and find the least loaded cpu on the least loaded path, which may differ
+ * from the least loaded cpu in the system.  This balances work among caches
+ * and buses.
+ */
+static int
+cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
+    struct cpu_search_res *r)
+{
+       struct cpu_search_res lr;
        struct tdq *tdq;
-       int cpu, i, hload, lload, load, total, rnd;
+       int c, bload, l, load, total;
 
        total = 0;
-       cpumask = cg->cg_mask;
-       if (match & CPU_SEARCH_LOWEST) {
-               lload = INT_MAX;
-               lgroup = *low;
-       }
-       if (match & CPU_SEARCH_HIGHEST) {
-               hload = INT_MIN;
-               hgroup = *high;
-       }
+       bload = INT_MAX;
+       r->cs_cpu = -1;
 
-       /* Iterate through the child CPU groups and then remaining CPUs. */
-       for (i = cg->cg_children, cpu = mp_maxid; ; ) {
-               if (i == 0) {
-#ifdef HAVE_INLINE_FFSL
-                       cpu = CPU_FFS(&cpumask) - 1;
-#else
-                       while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
-                               cpu--;
-#endif
-                       if (cpu < 0)
-                               break;
-                       child = NULL;
-               } else
-                       child = &cg->cg_child[i - 1];
-
-               if (match & CPU_SEARCH_LOWEST)
-                       lgroup.cs_cpu = -1;
-               if (match & CPU_SEARCH_HIGHEST)
-                       hgroup.cs_cpu = -1;
-               if (child) {                    /* Handle child CPU group. */
-                       CPU_ANDNOT(&cpumask, &child->cg_mask);
-                       switch (match) {
-                       case CPU_SEARCH_LOWEST:
-                               load = cpu_search_lowest(child, &lgroup);
-                               break;
-                       case CPU_SEARCH_HIGHEST:
-                               load = cpu_search_highest(child, &hgroup);
-                               break;
-                       case CPU_SEARCH_BOTH:
-                               load = cpu_search_both(child, &lgroup, &hgroup);
-                               break;
-                       }
-               } else {                        /* Handle child CPU. */
-                       CPU_CLR(cpu, &cpumask);
-                       tdq = TDQ_CPU(cpu);
-                       load = tdq->tdq_load * 256;
-                       rnd = sched_random() % 32;
-                       if (match & CPU_SEARCH_LOWEST) {
-                               if (cpu == low->cs_prefer)
-                                       load -= 64;
-                               /* If that CPU is allowed and get data. */
-                               if (tdq->tdq_lowpri > lgroup.cs_pri &&
-                                   tdq->tdq_load <= lgroup.cs_limit &&
-                                   CPU_ISSET(cpu, &lgroup.cs_mask)) {
-                                       lgroup.cs_cpu = cpu;
-                                       lgroup.cs_load = load - rnd;
-                               }
+       /* Loop through children CPU groups if there are any. */
+       if (cg->cg_children > 0) {
+               for (c = cg->cg_children - 1; c >= 0; c--) {
+                       load = cpu_search_lowest(&cg->cg_child[c], s, &lr);
+                       total += load;
+                       if (lr.cs_cpu >= 0 && (load < bload ||
+                           (load == bload && lr.cs_load < r->cs_load))) {
+                               bload = load;
+                               r->cs_cpu = lr.cs_cpu;
+                               r->cs_load = lr.cs_load;
                        }
-                       if (match & CPU_SEARCH_HIGHEST)
-                               if (tdq->tdq_load >= hgroup.cs_limit &&
-                                   tdq->tdq_transferable &&
-                                   CPU_ISSET(cpu, &hgroup.cs_mask)) {
-                                       hgroup.cs_cpu = cpu;
-                                       hgroup.cs_load = load - rnd;
-                               }
                }
-               total += load;
+               return (total);
+       }
 
-               /* We have info about child item. Compare it. */
-               if (match & CPU_SEARCH_LOWEST) {
-                       if (lgroup.cs_cpu >= 0 &&
-                           (load < lload ||
-                           (load == lload && lgroup.cs_load < low->cs_load))) {
-                               lload = load;
-                               low->cs_cpu = lgroup.cs_cpu;
-                               low->cs_load = lgroup.cs_load;
-                       }
-               }
-               if (match & CPU_SEARCH_HIGHEST)
-                       if (hgroup.cs_cpu >= 0 &&
-                           (load > hload ||
-                            (load == hload && hgroup.cs_load > high->cs_load))) {
-                               hload = load;
-                               high->cs_cpu = hgroup.cs_cpu;
-                               high->cs_load = hgroup.cs_load;
-                       }
-               if (child) {
-                       i--;
-                       if (i == 0 && CPU_EMPTY(&cpumask))
-                               break;
+       /* Loop through children CPUs otherwise. */
+       for (c = cg->cg_last; c >= cg->cg_first; c--) {
+               if (!CPU_ISSET(c, &cg->cg_mask))
+                       continue;
+               tdq = TDQ_CPU(c);
+               l = tdq->tdq_load;
+               load = l * 256;
+               if (c == s->cs_prefer)
+                       load -= 128;
+               total += load;
+               if (l > s->cs_limit || tdq->tdq_lowpri <= s->cs_pri ||
+                   !CPU_ISSET(c, s->cs_mask))
+                       continue;
+               load -= sched_random() % 128;
+               if (load < bload) {
+                       bload = load;
+                       r->cs_cpu = c;
                }
-#ifndef HAVE_INLINE_FFSL
-               else
-                       cpu--;
-#endif
        }
+       r->cs_load = bload;
        return (total);
 }
 
-/*
- * cpu_search instantiations must pass constants to maintain the inline
- * optimization.
- */
-int
-cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
+static int
+cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
+    struct cpu_search_res *r)
 {
-       return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
-}
+       struct cpu_search_res lr;
+       struct tdq *tdq;
+       int c, bload, l, load, total;
 
-int
-cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
-{
-       return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
-}
+       total = 0;
+       bload = INT_MIN;
+       r->cs_cpu = -1;
 
-int
-cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
-    struct cpu_search *high)
-{
-       return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
+       /* Loop through children CPU groups if there are any. */
+       if (cg->cg_children > 0) {
+               for (c = cg->cg_children - 1; c >= 0; c--) {
+                       load = cpu_search_highest(&cg->cg_child[c], s, &lr);
+                       total += load;
+                       if (lr.cs_cpu >= 0 && (load > bload ||
+                           (load == bload && lr.cs_load > r->cs_load))) {
+                               bload = load;
+                               r->cs_cpu = lr.cs_cpu;
+                               r->cs_load = lr.cs_load;
+                       }
+               }
+               return (total);
+       }
+
+       /* Loop through children CPUs otherwise. */
+       for (c = cg->cg_last; c >= cg->cg_first; c--) {
+               if (!CPU_ISSET(c, &cg->cg_mask))
+                       continue;
+               tdq = TDQ_CPU(c);
+               l = tdq->tdq_load;
+               load = l * 256;
+               total += load;
+               if (l < s->cs_limit || !tdq->tdq_transferable ||
+                   !CPU_ISSET(c, s->cs_mask))
+                       continue;
+               load -= sched_random() % 128;
+               if (load > bload) {
+                       bload = load;
+                       r->cs_cpu = c;
+               }
+       }
+       r->cs_load = bload;
+       return (total);
 }
 
 /*
@@ -803,33 +753,33 @@ cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
  * acceptable.
  */
 static inline int
-sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload,
     int prefer)
 {
-       struct cpu_search low;
+       struct cpu_search s;
+       struct cpu_search_res r;
 
-       low.cs_cpu = -1;
-       low.cs_prefer = prefer;
-       low.cs_mask = mask;
-       low.cs_pri = pri;
-       low.cs_limit = maxload;
-       cpu_search_lowest(cg, &low);
-       return low.cs_cpu;
+       s.cs_prefer = prefer;
+       s.cs_mask = mask;
+       s.cs_pri = pri;
+       s.cs_limit = maxload;
+       cpu_search_lowest(cg, &s, &r);
+       return (r.cs_cpu);
 }
 
 /*
  * Find the cpu with the highest load via the highest loaded path.
  */
 static inline int
-sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
+sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload)
 {
-       struct cpu_search high;
+       struct cpu_search s;
+       struct cpu_search_res r;
 
-       high.cs_cpu = -1;
-       high.cs_mask = mask;
-       high.cs_limit = minload;
-       cpu_search_highest(cg, &high);
-       return high.cs_cpu;
+       s.cs_mask = mask;
+       s.cs_limit = minload;
+       cpu_search_highest(cg, &s, &r);
+       return (r.cs_cpu);
 }
 
 static void
@@ -841,7 +791,7 @@ sched_balance_group(struct cpu_group *cg)
 
        CPU_FILL(&hmask);
        for (;;) {
-               high = sched_highest(cg, hmask, 2);
+               high = sched_highest(cg, &hmask, 2);
                /* Stop if there is no more CPU with transferrable threads. */
                if (high == -1)
                        break;
@@ -853,7 +803,7 @@ sched_balance_group(struct cpu_group *cg)
                anylow = 1;
                tdq = TDQ_CPU(high);
 nextlow:
-               low = sched_lowest(cg, lmask, -1, tdq->tdq_load - 1, high);
+               low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high);
                /* Stop if we looked well and found no less loaded CPU. */
                if (anylow && low == -1)
                        break;
@@ -995,7 +945,7 @@ tdq_idled(struct tdq *tdq)
     restart:
        switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
        for (cg = tdq->tdq_cg; ; ) {
-               cpu = sched_highest(cg, mask, steal_thresh);
+               cpu = sched_highest(cg, &mask, steal_thresh);
                /*
                 * We were assigned a thread but not preempted.  Returning
                 * 0 here will cause our caller to switch to it.
@@ -1244,7 +1194,7 @@ sched_pickcpu(struct thread *td, int flags)
        struct cpu_group *cg, *ccg;
        struct td_sched *ts;
        struct tdq *tdq;
-       cpuset_t mask;
+       cpuset_t *mask;
        int cpu, pri, self, intr;
 
        self = PCPU_GET(cpuid);
@@ -1287,14 +1237,14 @@ sched_pickcpu(struct thread *td, int flags)
            SCHED_AFFINITY(ts, CG_SHARE_L2)) {
                if (cg->cg_flags & CG_FLAG_THREAD) {
                        /* Check all SMT threads for being idle. */
-                       for (cpu = CPU_FFS(&cg->cg_mask) - 1; ; cpu++) {
+                       for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) {
                                if (CPU_ISSET(cpu, &cg->cg_mask) &&
                                    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
                                        break;
-                               if (cpu >= mp_maxid) {
-                                       SCHED_STAT_INC(pickcpu_idle_affinity);
-                                       return (ts->ts_cpu);
-                               }
+                       }
+                       if (cpu > cg->cg_last) {
+                               SCHED_STAT_INC(pickcpu_idle_affinity);
+                               return (ts->ts_cpu);
                        }
                } else {
                        SCHED_STAT_INC(pickcpu_idle_affinity);
@@ -1321,7 +1271,7 @@ llc:
        if (ccg == cpu_top)
                ccg = NULL;
        cpu = -1;
-       mask = td->td_cpuset->cs_mask;
+       mask = &td->td_cpuset->cs_mask;
        pri = td->td_priority;
        /*
         * Try hard to keep interrupts within found LLC.  Search the LLC for
@@ -1931,7 +1881,7 @@ tdq_trysteal(struct tdq *tdq)
        spinlock_enter();
        TDQ_UNLOCK(tdq);
        for (i = 1, cg = tdq->tdq_cg; ; ) {
-               cpu = sched_highest(cg, mask, steal_thresh);
+               cpu = sched_highest(cg, &mask, steal_thresh);
                /*
                 * If a thread was added while interrupts were disabled don't
                 * steal one here.
@@ -3002,7 +2952,7 @@ sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
        sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
            cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
        first = TRUE;
-       for (i = 0; i < MAXCPU; i++) {
+       for (i = cg->cg_first; i <= cg->cg_last; i++) {
                if (CPU_ISSET(i, &cg->cg_mask)) {
                        if (!first)
                                sbuf_printf(sb, ", ");
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 935fb6ee977c..bfe890d773f9 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -630,6 +630,17 @@ smp_rendezvous(void (* setup_func)(void *),
 
 static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 
+static void
+smp_topo_fill(struct cpu_group *cg)
+{
+       int c;
+
+       for (c = 0; c < cg->cg_children; c++)
+               smp_topo_fill(&cg->cg_child[c]);
+       cg->cg_first = CPU_FFS(&cg->cg_mask) - 1;
+       cg->cg_last = CPU_FLS(&cg->cg_mask) - 1;
+}
+
 struct cpu_group *
 smp_topo(void)
 {
@@ -693,6 +704,7 @@ smp_topo(void)
                top = &top->cg_child[0];
                top->cg_parent = NULL;
        }
+       smp_topo_fill(top);
        return (top);
 }
 
diff --git a/sys/sys/smp.h b/sys/sys/smp.h
index a971ffbbd91b..cee1199015a7 100644
--- a/sys/sys/smp.h
+++ b/sys/sys/smp.h
@@ -81,6 +81,8 @@ struct cpu_group {
        struct cpu_group *cg_child;     /* Optional children groups. */
        cpuset_t        cg_mask;        /* Mask of cpus in this group. */
        int32_t         cg_count;       /* Count of cpus in this group. */
+       int32_t         cg_first;       /* First cpu in this group. */
+       int32_t         cg_last;        /* Last cpu in this group. */
        int16_t         cg_children;    /* Number of children groups. */
        int8_t          cg_level;       /* Shared cache level. */
        int8_t          cg_flags;       /* Traversal modifiers. */