[PATCH] powerpc: remove unneeded check in spu_gang_remove_ctx

2015-06-22 Thread Yury Norov
In the discussion of the patch "selinux: reduce locking overhead in
inode_free_security()" it was established that list_del_init() is safe
to call multiple times on the same entry. There, that fact was used to
move a check out of the spinlock and reduce lock contention. Here we can
drop the '!list_empty()' check entirely, which shrinks
spu_gang_remove_ctx() and (sometimes) avoids branch predictor misses.
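
For reference, here is a minimal userspace sketch (modelled on the
list.h helpers, not the kernel code itself) showing why a second
list_del_init() on the same entry is a no-op:

#include <assert.h>

struct list_head { struct list_head *next, *prev; };

/* Simplified models of the kernel's list helpers. */
static void INIT_LIST_HEAD(struct list_head *e) { e->next = e; e->prev = e; }

static void list_del_init(struct list_head *e)
{
        /* Unlink the entry from whatever list it is on... */
        e->prev->next = e->next;
        e->next->prev = e->prev;
        /* ...and point it back at itself, so the next call is a no-op. */
        INIT_LIST_HEAD(e);
}

int main(void)
{
        struct list_head head, entry;

        INIT_LIST_HEAD(&head);
        /* Open-coded list_add(&entry, &head) on the empty list. */
        entry.next = head.next;
        entry.prev = &head;
        head.next = &entry;
        head.prev = &entry;

        list_del_init(&entry);  /* removes the entry from the list */
        list_del_init(&entry);  /* harmless: entry already points to itself */
        assert(entry.next == &entry && head.next == &head);
        return 0;
}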

Signed-off-by: Yury Norov 
---
 arch/powerpc/platforms/cell/spufs/gang.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/gang.c 
b/arch/powerpc/platforms/cell/spufs/gang.c
index 71a4432..67e3fc3 100644
--- a/arch/powerpc/platforms/cell/spufs/gang.c
+++ b/arch/powerpc/platforms/cell/spufs/gang.c
@@ -75,10 +75,10 @@ void spu_gang_remove_ctx(struct spu_gang *gang, struct 
spu_context *ctx)
 {
mutex_lock(&gang->mutex);
WARN_ON(ctx->gang != gang);
-   if (!list_empty(&ctx->aff_list)) {
-   list_del_init(&ctx->aff_list);
-   gang->aff_flags &= ~AFF_OFFSETS_SET;
-   }
+
+   list_del_init(&ctx->aff_list);
+   gang->aff_flags &= ~AFF_OFFSETS_SET;
+
list_del_init(&ctx->gang_list);
gang->contexts--;
mutex_unlock(&gang->mutex);
-- 
2.1.4


[PATCH 16/34] powerpc: use atomic find_bit() API where appropriate

2023-11-18 Thread Yury Norov
Fix opencoded find_and_{set,clear}_bit() by using dedicated functions.

Signed-off-by: Yury Norov 
---
 arch/powerpc/mm/book3s32/mmu_context.c | 10 ++---
 arch/powerpc/platforms/pasemi/dma_lib.c| 45 +-
 arch/powerpc/platforms/powernv/pci-sriov.c | 12 ++
 3 files changed, 17 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu_context.c 
b/arch/powerpc/mm/book3s32/mmu_context.c
index 1922f9a6b058..7db19f173c2e 100644
--- a/arch/powerpc/mm/book3s32/mmu_context.c
+++ b/arch/powerpc/mm/book3s32/mmu_context.c
@@ -50,13 +50,11 @@ static unsigned long context_map[LAST_CONTEXT / 
BITS_PER_LONG + 1];
 
 unsigned long __init_new_context(void)
 {
-   unsigned long ctx = next_mmu_context;
+   unsigned long ctx;
 
-   while (test_and_set_bit(ctx, context_map)) {
-   ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-   if (ctx > LAST_CONTEXT)
-   ctx = 0;
-   }
+   ctx = find_and_set_next_bit(context_map, LAST_CONTEXT + 1, 
next_mmu_context);
+   if (ctx > LAST_CONTEXT)
+   ctx = 0;
next_mmu_context = (ctx + 1) & LAST_CONTEXT;
 
return ctx;
diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c 
b/arch/powerpc/platforms/pasemi/dma_lib.c
index 1be1f18f6f09..906dabee0132 100644
--- a/arch/powerpc/platforms/pasemi/dma_lib.c
+++ b/arch/powerpc/platforms/pasemi/dma_lib.c
@@ -118,14 +118,9 @@ static int pasemi_alloc_tx_chan(enum pasemi_dmachan_type 
type)
limit = MAX_TXCH;
break;
}
-retry:
-   bit = find_next_bit(txch_free, MAX_TXCH, start);
-   if (bit >= limit)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, txch_free))
-   goto retry;
-
-   return bit;
+
+   bit = find_and_clear_next_bit(txch_free, MAX_TXCH, start);
+   return bit < limit ? bit : -ENOSPC;
 }
 
 static void pasemi_free_tx_chan(int chan)
@@ -136,15 +131,9 @@ static void pasemi_free_tx_chan(int chan)
 
 static int pasemi_alloc_rx_chan(void)
 {
-   int bit;
-retry:
-   bit = find_first_bit(rxch_free, MAX_RXCH);
-   if (bit >= MAX_TXCH)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, rxch_free))
-   goto retry;
-
-   return bit;
+   int bit = find_and_clear_bit(rxch_free, MAX_RXCH);
+
+   return bit < MAX_TXCH ? bit : -ENOSPC;
 }
 
 static void pasemi_free_rx_chan(int chan)
@@ -374,16 +363,9 @@ EXPORT_SYMBOL(pasemi_dma_free_buf);
  */
 int pasemi_dma_alloc_flag(void)
 {
-   int bit;
+   int bit = find_and_clear_bit(flags_free, MAX_FLAGS);
 
-retry:
-   bit = find_first_bit(flags_free, MAX_FLAGS);
-   if (bit >= MAX_FLAGS)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, flags_free))
-   goto retry;
-
-   return bit;
+   return bit < MAX_FLAGS ? bit : -ENOSPC;
 }
 EXPORT_SYMBOL(pasemi_dma_alloc_flag);
 
@@ -439,16 +421,9 @@ EXPORT_SYMBOL(pasemi_dma_clear_flag);
  */
 int pasemi_dma_alloc_fun(void)
 {
-   int bit;
-
-retry:
-   bit = find_first_bit(fun_free, MAX_FLAGS);
-   if (bit >= MAX_FLAGS)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, fun_free))
-   goto retry;
+   int bit = find_and_clear_bit(fun_free, MAX_FLAGS);
 
-   return bit;
+   return bit < MAX_FLAGS ? bit : -ENOSPC;
 }
 EXPORT_SYMBOL(pasemi_dma_alloc_fun);
 
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c 
b/arch/powerpc/platforms/powernv/pci-sriov.c
index 59882da3e742..640e387e6d83 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -397,18 +397,12 @@ static int64_t pnv_ioda_map_m64_single(struct pnv_phb 
*phb,
 
 static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov)
 {
-   int win;
+   int win = find_and_set_bit(&phb->ioda.m64_bar_alloc, 
phb->ioda.m64_bar_idx + 1);
 
-   do {
-   win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
-   phb->ioda.m64_bar_idx + 1, 0);
-
-   if (win >= phb->ioda.m64_bar_idx + 1)
-   return -1;
-   } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+   if (win >= phb->ioda.m64_bar_idx + 1)
+   return -1;
 
set_bit(win, iov->used_m64_bar_mask);
-
return win;
 }
 
-- 
2.39.2



[PATCH 01/34] lib/find: add atomic find_bit() primitives

2023-11-18 Thread Yury Norov
Add helpers around test_and_{set,clear}_bit() that allow to search for
clear or set bits and flip them atomically.

The target patterns may look like this:

for (idx = 0; idx < nbits; idx++)
if (test_and_clear_bit(idx, bitmap))
do_something(idx);

Or like this:

do {
bit = find_first_bit(bitmap, nbits);
if (bit >= nbits)
return nbits;
} while (!test_and_clear_bit(bit, bitmap));
return bit;

In both cases, the opencoded loop may be converted to a single function
or iterator call. Correspondingly:

for_each_test_and_clear_bit(idx, bitmap, nbits)
do_something(idx);

Or:
return find_and_clear_bit(bitmap, nbits);

Obviously, the less routine code people have to write themselves, the
lower the probability of making a mistake.

Those are not only handy helpers but also resolve a non-trivial
issue of using non-atomic find_bit() together with atomic
test_and_{set,clear}_bit().

The trick is that find_bit() implies that the bitmap is a regular
non-volatile piece of memory, and compiler is allowed to use such
optimization techniques like re-fetching memory instead of caching it.

For example, find_first_bit() is implemented like this:

  for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {
  val = addr[idx];
  if (val) {
  sz = min(idx * BITS_PER_LONG + __ffs(val), sz);
  break;
  }
  }

On register-memory architectures like x86, the compiler may decide to
access memory twice: first to compare the value against 0, and then to
fetch it again to pass to __ffs().

When running find_first_bit() on volatile memory, the memory may get
changed in-between, and for instance, it may lead to passing 0 to
__ffs(), which is undefined. This is a potentially dangerous call.

find_and_clear_bit() as a wrapper around test_and_clear_bit()
naturally treats underlying bitmap as a volatile memory and prevents
compiler from such optimizations.
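
For illustration only (this is not the series' lib/find_bit.c code), a
self-contained userspace sketch of that idea, with a GCC atomic builtin
standing in for test_and_clear_bit():

#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Stand-in for test_and_clear_bit(): atomically clear the bit and
 * return its previous value. */
static int my_test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
{
        unsigned long mask = 1UL << (nr % BITS_PER_LONG);
        unsigned long old = __atomic_fetch_and(&addr[nr / BITS_PER_LONG],
                                               ~mask, __ATOMIC_SEQ_CST);
        return (old & mask) != 0;
}

/* Sketch of a find-and-clear helper: read each word once, derive
 * candidate bits from that single snapshot (so __builtin_ctzl() is
 * never fed 0), and let the atomic RMW decide who really owns the bit.
 * Returns nbits if no set bit could be claimed. */
static unsigned long my_find_and_clear_bit(volatile unsigned long *addr,
                                           unsigned long nbits)
{
        unsigned long idx, bit, val;

        for (idx = 0; idx * BITS_PER_LONG < nbits; idx++) {
                val = addr[idx];        /* one read per word */
                while (val) {
                        bit = idx * BITS_PER_LONG + __builtin_ctzl(val);
                        if (bit >= nbits)
                                return nbits;
                        if (my_test_and_clear_bit(bit, addr))
                                return bit;     /* we won the bit */
                        val &= val - 1;         /* lost the race; next bit */
                }
        }
        return nbits;
}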

Now that KCSAN catches exactly this type of situation and warns about
such racy memory modifications, we can use it to reveal improper usage
of find_bit() and convert it to atomic find_and_*_bit() as appropriate.

The 1st patch of the series adds the following atomic primitives:

find_and_set_bit(addr, nbits);
find_and_set_next_bit(addr, nbits, start);
...

Here find_and_{set,clear} part refers to the corresponding
test_and_{set,clear}_bit function, and suffixes like _wrap or _lock
derive semantics from corresponding find() or test() functions.

For brevity, the naming omits the fact that we search for zero bit in
find_and_set, and correspondingly, search for set bit in find_and_clear
functions.

The patch also adds iterators with atomic semantics, like
for_each_test_and_set_bit(). Here, the naming rule is to simply prefix
corresponding atomic operation with 'for_each'.

All users of find_bit() API, where heavy concurrency is expected,
are encouraged to switch to atomic find_and_bit() as appropriate.

Signed-off-by: Yury Norov 
---
 include/linux/find.h | 289 +++
 lib/find_bit.c   |  85 +
 2 files changed, 374 insertions(+)

diff --git a/include/linux/find.h b/include/linux/find.h
index 5e4f39ef2e72..e8567f336f42 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -32,6 +32,16 @@ extern unsigned long _find_first_and_bit(const unsigned long 
*addr1,
 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned 
long size);
 extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long 
size);
 
+unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long 
nbits);
+unsigned long _find_and_set_next_bit(volatile unsigned long *addr, unsigned 
long nbits,
+   unsigned long start);
+unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned 
long nbits);
+unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr, 
unsigned long nbits,
+ unsigned long start);
+unsigned long _find_and_clear_bit(volatile unsigned long *addr, unsigned long 
nbits);
+unsigned long _find_and_clear_next_bit(volatile unsigned long *addr, unsigned 
long nbits,
+   unsigned long start);
+
 #ifdef __BIG_ENDIAN
 unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long 
size);
 unsigned long _find_next_zero_bit_le(const  unsigned long *addr, unsigned
@@ -460,6 +470,267 @@ unsigned long __for_each_wrap(const unsigned long 
*bitmap, unsigned long size,
return bit < start ? bit : size;
 }
 
+/**
+ * find_and_set_bit - Find a zero bit and set it atomically
+ * @addr: The address to base the search on
+ * @nbits: The bitmap size in bits
+ *
+ * This function is designed to operate in concurr

[PATCH 00/34] bitops: add atomic find_bit() operations

2023-11-18 Thread Yury Norov
Add helpers around test_and_{set,clear}_bit() that allow to search for
clear or set bits and flip them atomically.

The target patterns may look like this:

for (idx = 0; idx < nbits; idx++)
if (test_and_clear_bit(idx, bitmap))
do_something(idx);

Or like this:

do {
bit = find_first_bit(bitmap, nbits);
if (bit >= nbits)
return nbits;
} while (!test_and_clear_bit(bit, bitmap));
return bit;

In both cases, the opencoded loop may be converted to a single function
or iterator call. Correspondingly:

for_each_test_and_clear_bit(idx, bitmap, nbits)
do_something(idx);

Or:
return find_and_clear_bit(bitmap, nbits);

Obviously, the less routine code people have to write themselves, the
lower the probability of making a mistake. Patch #31 of this series
fixes one such error in the perf/m1 codebase.

Those are not only handy helpers but also resolve a non-trivial
issue of using non-atomic find_bit() together with atomic
test_and_{set,clear}_bit().

The trick is that find_bit() implies that the bitmap is a regular
non-volatile piece of memory, and compiler is allowed to use such
optimization techniques like re-fetching memory instead of caching it.

For example, find_first_bit() is implemented like this:

  for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {
  val = addr[idx];
  if (val) {
  sz = min(idx * BITS_PER_LONG + __ffs(val), sz);
  break;
  }
  }

On register-memory architectures like x86, the compiler may decide to
access memory twice: first to compare the value against 0, and then to
fetch it again to pass to __ffs().

When running find_first_bit() on volatile memory, the memory may get
changed in-between, and for instance, it may lead to passing 0 to
__ffs(), which is undefined. This is a potentially dangerous call.

find_and_clear_bit() as a wrapper around test_and_clear_bit()
naturally treats underlying bitmap as a volatile memory and prevents
compiler from such optimizations.

Now that KCSAN catches exactly this type of situation and warns about
such racy memory modifications, we can use it to reveal improper usage
of find_bit() and convert it to atomic find_and_*_bit() as appropriate.

The 1st patch of the series adds the following atomic primitives:

find_and_set_bit(addr, nbits);
find_and_set_next_bit(addr, nbits, start);
...

Here find_and_{set,clear} part refers to the corresponding
test_and_{set,clear}_bit function, and suffixes like _wrap or _lock
derive semantics from corresponding find() or test() functions.

For brevity, the naming omits the fact that we search for zero bit in
find_and_set, and correspondingly, search for set bit in find_and_clear
functions.

The patch also adds iterators with atomic semantics, like
for_each_test_and_set_bit(). Here, the naming rule is to simply prefix
corresponding atomic operation with 'for_each'.

This series is a result of discussion [1]. All find_bit() functions imply
exclusive access to the bitmaps. However, KCSAN reports quite a number
of warnings related to find_bit() API. Some of them are not pointing
to real bugs because in many situations people intentionally allow
concurrent bitmap operations.

If so, find_bit() can be annotated such that KCSAN will ignore it:

bit = data_race(find_first_bit(bitmap, nbits));

This series addresses the other important case where people really need
atomic find ops. As the following patches show, the resulting code
looks safer and more verbose compared to opencoded loops followed by
atomic bit flips.

In [1] Mirsad reported a 2% slowdown in a single-threaded search test
when switching the find_bit() functions to treat bitmaps as volatile
arrays. On the other hand, the kernel robot in the same thread reported
a +3.7% gain in the will-it-scale.per_thread_ops test.

Assuming that our compilers are sane and generate better code against
properly annotated data, this discrepancy is not surprising. When
running on non-volatile bitmaps, plain find_bit() outperforms atomic
find_and_bit(), and vice versa.

So, all users of find_bit() API, where heavy concurrency is expected,
are encouraged to switch to atomic find_and_bit() as appropriate.

The 1st patch of this series adds the atomic find_and_bit() API, and the
following patches spread it over the kernel. They can be applied
separately on a per-subsystem basis, or I can pull them into the bitmap
tree, as appropriate.

[1] 
https://lore.kernel.org/lkml/634f5fdf-e236-42cf-be8d-48a581c21...@alu.unizg.hr/T/#m3e7341eb3571753f3acf8fe166f3fb5b2c12e615
 

Yury Norov (34):
  lib/find: add atomic find_bit() primitives
  lib/sbitmap; make __sbitmap_get_word() using find_and_set_bit()
  watch_queue: use atomic find_bit() in post_one_notification()
  sched: add cpumask_find_and_set() and use it 

[PATCH] powerpc/64: don't refer nr_cpu_ids in asm code when it's undefined

2022-09-20 Thread Yury Norov
generic_secondary_common_init() calls LOAD_REG_ADDR(r7, nr_cpu_ids)
conditionally on CONFIG_SMP. However, if NR_CPUS == 1, the kernel doesn't
use nr_cpu_ids, and in C code it's just:
  #if NR_CPUS == 1
  #define nr_cpu_ids
  ...

Patch [1] makes the declaration of nr_cpu_ids conditional on NR_CPUS == 1,
and that reveals the issue: LOAD_REG_ADDR(r7, nr_cpu_ids) can no longer
be linked because the symbol doesn't exist.

The current code is unsafe for those who build the kernel with
CONFIG_SMP=y and NR_CPUS == 1. This is a weird configuration, but it is
not disallowed.

Fix the linker error by replacing LOAD_REG_ADDR() with LOAD_REG_IMMEDIATE()
conditionally on NR_CPUS == 1.

The issue was spotted after applying [1], which adds a CONFIG_FORCE_NR_CPUS
option that has a similar effect on nr_cpu_ids. So, in this patch, make
the choice of LOAD_REG() conditional on CONFIG_FORCE_NR_CPUS too.

On top of:
[1] 
https://lore.kernel.org/lkml/20220905230820.3295223-4-yury.no...@gmail.com/T/#m96ffe122721893471fd3470d911a8f2fad6d03b3

Reported-by: Stephen Rothwell 
Signed-off-by: Yury Norov 
---
 arch/powerpc/kernel/head_64.S | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index cf2c08902c05..7cb97881635e 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -400,7 +400,11 @@ generic_secondary_common_init:
 #else
LOAD_REG_ADDR(r8, paca_ptrs)/* Load paca_ptrs pointe */
ld  r8,0(r8)/* Get base vaddr of array   */
+#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
+   LOAD_REG_IMMEDIATE(r7, NR_CPUS)
+#else
LOAD_REG_ADDR(r7, nr_cpu_ids)   /* Load nr_cpu_ids address   */
+#endif
lwz r7,0(r7)/* also the max paca allocated   */
li  r5,0/* logical cpu id*/
 1:
-- 
2.34.1



Re: [PATCH] powerpc/64: don't refer nr_cpu_ids in asm code when it's undefined

2022-09-20 Thread Yury Norov
On Wed, Sep 21, 2022 at 08:20:06AM +1000, Stephen Rothwell wrote:
> Hi Yury,
> 
> On Tue, 20 Sep 2022 08:29:35 -0700 Yury Norov  wrote:
> >
> 
> > diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
> > index cf2c08902c05..7cb97881635e 100644
> > --- a/arch/powerpc/kernel/head_64.S
> > +++ b/arch/powerpc/kernel/head_64.S
> > @@ -400,7 +400,11 @@ generic_secondary_common_init:
> >  #else
> > LOAD_REG_ADDR(r8, paca_ptrs)/* Load paca_ptrs pointe */
> > ld  r8,0(r8)/* Get base vaddr of array   */
> > +#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
> > +   LOAD_REG_IMMEDIATE(r7, NR_CPUS)
> > +#else
> > LOAD_REG_ADDR(r7, nr_cpu_ids)   /* Load nr_cpu_ids address   */
> > +#endif
> > lwz r7,0(r7)/* also the max paca allocated   */
> > li  r5,0/* logical cpu id*/
> >  1:
> > -- 
> > 2.34.1
> > 
> 
> I don't know PPC assembly very well, but should the #endif be one line
> lower so that the constant is not dereferenced in the non-#else case? 

Looks like you're right. Thanks, I'll send a v2.


[PATCH v2] powerpc/64: don't refer nr_cpu_ids in asm code when it's undefined

2022-09-20 Thread Yury Norov
generic_secondary_common_init() calls LOAD_REG_ADDR(r7, nr_cpu_ids)
conditionally on CONFIG_SMP. However, if NR_CPUS == 1, the kernel doesn't
use nr_cpu_ids, and in C code it's just:
  #if NR_CPUS == 1
  #define nr_cpu_ids
  ...

Patch [1] makes the declaration of nr_cpu_ids conditional on NR_CPUS == 1,
and that reveals the issue: LOAD_REG_ADDR(r7, nr_cpu_ids) can no longer
be linked because the symbol doesn't exist.

The current code is unsafe for those who build the kernel with
CONFIG_SMP=y and NR_CPUS == 1. This is a weird configuration, but it is
not disallowed.

Fix the linker error by replacing LOAD_REG_ADDR() with LOAD_REG_IMMEDIATE()
conditionally on NR_CPUS == 1.

The issue was spotted after applying [1], which adds a CONFIG_FORCE_NR_CPUS
option that has a similar effect on nr_cpu_ids. So, in this patch, make
the choice of LOAD_REG() conditional on CONFIG_FORCE_NR_CPUS too.

On top of:
[1] 
https://lore.kernel.org/lkml/20220905230820.3295223-4-yury.no...@gmail.com/T/#m96ffe122721893471fd3470d911a8f2fad6d03b3

Reported-by: Stephen Rothwell 
Signed-off-by: Yury Norov 
---
v2: move "lwz r7,0(r7)" under #else conditional.

 arch/powerpc/kernel/head_64.S | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index cf2c08902c05..d36939029701 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -400,8 +400,12 @@ generic_secondary_common_init:
 #else
LOAD_REG_ADDR(r8, paca_ptrs)/* Load paca_ptrs pointe */
ld  r8,0(r8)/* Get base vaddr of array   */
+#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
+   LOAD_REG_IMMEDIATE(r7, NR_CPUS)
+#else
LOAD_REG_ADDR(r7, nr_cpu_ids)   /* Load nr_cpu_ids address   */
lwz r7,0(r7)/* also the max paca allocated   */
+#endif
li  r5,0/* logical cpu id*/
 1:
sldir9,r5,3 /* get paca_ptrs[] index from cpu id */
-- 
2.34.1



Re: [PATCH v5 0/7] treewide cleanup of random integer usage

2022-10-08 Thread Yury Norov
On Fri, Oct 07, 2022 at 11:53:52PM -0600, Jason A. Donenfeld wrote:
> Changes v4->v5:
> - Coccinelle is now used for as much mechanical aspects as possible,
>   with mechanical parts split off from non-mechanical parts. This should
>   drastically reduce the amount of code that needs to be reviewed
>   carefully. Each commit mentions now if it was done by hand or is
>   mechanical.
> 
> Hi folks,
> 
> This is a five part treewide cleanup of random integer handling. The
> rules for random integers are:
> 
> - If you want a secure or an insecure random u64, use get_random_u64().
> - If you want a secure or an insecure random u32, use get_random_u32().
>   * The old function prandom_u32() has been deprecated for a while now
> and is just a wrapper around get_random_u32(). Same for
> get_random_int().
> - If you want a secure or an insecure random u16, use get_random_u16().
> - If you want a secure or an insecure random u8, use get_random_u8().
> - If you want secure or insecure random bytes, use get_random_bytes().
>   * The old function prandom_bytes() has been deprecated for a while now
> and has long been a wrapper around get_random_bytes().
> - If you want a non-uniform random u32, u16, or u8 bounded by a certain
>   open interval maximum, use prandom_u32_max().
>   * I say "non-uniform", because it doesn't do any rejection sampling or
> divisions. Hence, it stays within the prandom_* namespace.
> 
> These rules ought to be applied uniformly, so that we can clean up the
> deprecated functions, and earn the benefits of using the modern
> functions. In particular, in addition to the boring substitutions, this
> patchset accomplishes a few nice effects:
> 
> - By using prandom_u32_max() with an upper-bound that the compiler can
>   prove at compile-time is ≤65536 or ≤256, internally get_random_u16()
>   or get_random_u8() is used, which wastes fewer batched random bytes,
>   and hence has higher throughput.
> 
> - By using prandom_u32_max() instead of %, when the upper-bound is not a
>   constant, division is still avoided, because prandom_u32_max() uses
>   a faster multiplication-based trick instead.
> 
> - By using get_random_u16() or get_random_u8() in cases where the return
>   value is intended to indeed be a u16 or a u8, we waste fewer batched
>   random bytes, and hence have higher throughput.
> 
> So, based on those rules and benefits from following them, this patchset
> breaks down into the following five steps:
> 
> 1) Replace `prandom_u32() % max` and variants thereof with
>prandom_u32_max(max).
> 
>* Part 1 is done with Coccinelle. Part 2 is done by hand.
> 
> 2) Replace `(type)get_random_u32()` and variants thereof with
>get_random_u16() or get_random_u8(). I took the pains to actually
>look and see what every lvalue type was across the entire tree.
> 
>* Part 1 is done with Coccinelle. Part 2 is done by hand.
> 
> 3) Replace remaining deprecated uses of prandom_u32() and
>get_random_int() with get_random_u32(). 
> 
>* A boring search and replace operation.
> 
> 4) Replace remaining deprecated uses of prandom_bytes() with
>get_random_bytes().
> 
>* A boring search and replace operation.
> 
> 5) Remove the deprecated and now-unused prandom_u32() and
>prandom_bytes() inline wrapper functions.
> 
>* Just deleting code and updating comments.
> 
> I was thinking of taking this through my random.git tree (on which this
> series is currently based) and submitting it near the end of the merge
> window, or waiting for the very end of the 6.1 cycle when there will be
> the fewest new patches brewing. If somebody with some treewide-cleanup
> experience might share some wisdom about what the best timing usually
> winds up being, I'm all ears.
> 
> Please take a look! The number of lines touched is quite small, so this
> should be reviewable, and as much as is possible has been pushed into
> Coccinelle scripts.

For the series:
Reviewed-by: Yury Norov 

Although, looking at it, I have a feeling that the kernel needs to drop
all fixed-size random APIs like get_random_uXX() or get_random_int(),
because people will keep using the 'get_random_int() % num' pattern
carelessly.
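
For illustration, the multiplication-based trick mentioned above can be
sketched in plain userspace C like this (not the kernel's
prandom_u32_max() source, just the idea behind it):

#include <stdint.h>
#include <stdio.h>

/*
 * Map a full-range 32-bit random value into [0, max) without a division:
 * multiply into 64 bits and keep the high 32 bits. As noted above, there
 * is no rejection sampling, so the result is slightly non-uniform for
 * most bounds.
 */
static uint32_t bounded(uint32_t rnd, uint32_t max)
{
        return (uint32_t)(((uint64_t)rnd * max) >> 32);
}

int main(void)
{
        /* A few fixed 32-bit "random" inputs, bounded to [0, 10). */
        uint32_t samples[] = { 0x00000000u, 0x7fffffffu, 0xdeadbeefu, 0xffffffffu };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("0x%08x -> %u\n", samples[i], bounded(samples[i], 10));
        return 0;
}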

Thanks,
Yury


Re: [PATCH v3 0/2] Fix /proc/cpuinfo cpumask warning

2022-10-15 Thread Yury Norov
On Fri, Oct 14, 2022 at 05:58:43PM +0200, Andrew Jones wrote:
> Commit 78e5a3399421 ("cpumask: fix checking valid cpu range") has
> started issuing warnings[*] when cpu indices equal to nr_cpu_ids - 1
> are passed to cpumask_next* functions. seq_read_iter() and cpuinfo's
> start and next seq operations implement a pattern like
> 
>   n = cpumask_next(n - 1, mask);
>   show(n);
>   while (1) {
>   ++n;
>   n = cpumask_next(n - 1, mask);
>   if (n >= nr_cpu_ids)
>   break;
>   show(n);
>   }
> 
> which will issue the warning when reading /proc/cpuinfo.
> 
> [*] Warnings will only appear with DEBUG_PER_CPU_MAPS enabled.
> 
> This series address the issue for x86 and riscv, but from a quick
> grep of cpuinfo seq operations, I think at least openrisc, powerpc,
> and s390 also need an equivalent patch. While the test is simple (see
> next paragraph) I'm not equipped to test on each architecture.
> 
> To test, just build a kernel with DEBUG_PER_CPU_MAPS enabled, boot to
> a shell, do 'cat /proc/cpuinfo', and look for a kernel warning.
> 
> While the patches are being posted together in a series since they're
> for two different architectures they don't necessarily need to go
> through the same tree.

Acked-by: Yury Norov 

Re: [PATCH v3 2/2] x86: Fix /proc/cpuinfo cpumask warning

2022-10-28 Thread Yury Norov
On Fri, Oct 28, 2022 at 09:48:28AM +0200, Andrew Jones wrote:
> Hi x86 maintainers,
> 
> I realize 78e5a3399421 has now been reverted, so this fix is no longer
> urgent. I don't believe it's wrong, though, so if it's still of interest,
> then please consider this a friendly ping.
> 
> Thanks,
> drew

Hi Andrew,

I'll take it in bitmap-for-next this weekend.

Thanks,
Yury


Re: [PATCH v3 2/2] x86: Fix /proc/cpuinfo cpumask warning

2022-10-29 Thread Yury Norov
On Fri, Oct 28, 2022, 10:03 AM Borislav Petkov  wrote:

> On Fri, Oct 28, 2022 at 07:46:08AM -0700, Yury Norov wrote:
> > I'll take it in bitmap-for-next this weekend.
>
> Why?


Because it's related to bitmap API usage and was revealed by recent
work on bitmaps.

And because nobody else cares.

If you're willing to move it yourself, please go ahead.



[PATCH v2 18/35] powerpc: use atomic find_bit() API where appropriate

2023-12-03 Thread Yury Norov
Use find_and_{set,clear}_bit() where appropriate and simplify the logic.

Signed-off-by: Yury Norov 
---
 arch/powerpc/mm/book3s32/mmu_context.c | 10 ++---
 arch/powerpc/platforms/pasemi/dma_lib.c| 45 +-
 arch/powerpc/platforms/powernv/pci-sriov.c | 12 ++
 3 files changed, 17 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu_context.c 
b/arch/powerpc/mm/book3s32/mmu_context.c
index 1922f9a6b058..7db19f173c2e 100644
--- a/arch/powerpc/mm/book3s32/mmu_context.c
+++ b/arch/powerpc/mm/book3s32/mmu_context.c
@@ -50,13 +50,11 @@ static unsigned long context_map[LAST_CONTEXT / 
BITS_PER_LONG + 1];
 
 unsigned long __init_new_context(void)
 {
-   unsigned long ctx = next_mmu_context;
+   unsigned long ctx;
 
-   while (test_and_set_bit(ctx, context_map)) {
-   ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-   if (ctx > LAST_CONTEXT)
-   ctx = 0;
-   }
+   ctx = find_and_set_next_bit(context_map, LAST_CONTEXT + 1, 
next_mmu_context);
+   if (ctx > LAST_CONTEXT)
+   ctx = 0;
next_mmu_context = (ctx + 1) & LAST_CONTEXT;
 
return ctx;
diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c 
b/arch/powerpc/platforms/pasemi/dma_lib.c
index 1be1f18f6f09..906dabee0132 100644
--- a/arch/powerpc/platforms/pasemi/dma_lib.c
+++ b/arch/powerpc/platforms/pasemi/dma_lib.c
@@ -118,14 +118,9 @@ static int pasemi_alloc_tx_chan(enum pasemi_dmachan_type 
type)
limit = MAX_TXCH;
break;
}
-retry:
-   bit = find_next_bit(txch_free, MAX_TXCH, start);
-   if (bit >= limit)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, txch_free))
-   goto retry;
-
-   return bit;
+
+   bit = find_and_clear_next_bit(txch_free, MAX_TXCH, start);
+   return bit < limit ? bit : -ENOSPC;
 }
 
 static void pasemi_free_tx_chan(int chan)
@@ -136,15 +131,9 @@ static void pasemi_free_tx_chan(int chan)
 
 static int pasemi_alloc_rx_chan(void)
 {
-   int bit;
-retry:
-   bit = find_first_bit(rxch_free, MAX_RXCH);
-   if (bit >= MAX_TXCH)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, rxch_free))
-   goto retry;
-
-   return bit;
+   int bit = find_and_clear_bit(rxch_free, MAX_RXCH);
+
+   return bit < MAX_TXCH ? bit : -ENOSPC;
 }
 
 static void pasemi_free_rx_chan(int chan)
@@ -374,16 +363,9 @@ EXPORT_SYMBOL(pasemi_dma_free_buf);
  */
 int pasemi_dma_alloc_flag(void)
 {
-   int bit;
+   int bit = find_and_clear_bit(flags_free, MAX_FLAGS);
 
-retry:
-   bit = find_first_bit(flags_free, MAX_FLAGS);
-   if (bit >= MAX_FLAGS)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, flags_free))
-   goto retry;
-
-   return bit;
+   return bit < MAX_FLAGS ? bit : -ENOSPC;
 }
 EXPORT_SYMBOL(pasemi_dma_alloc_flag);
 
@@ -439,16 +421,9 @@ EXPORT_SYMBOL(pasemi_dma_clear_flag);
  */
 int pasemi_dma_alloc_fun(void)
 {
-   int bit;
-
-retry:
-   bit = find_first_bit(fun_free, MAX_FLAGS);
-   if (bit >= MAX_FLAGS)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, fun_free))
-   goto retry;
+   int bit = find_and_clear_bit(fun_free, MAX_FLAGS);
 
-   return bit;
+   return bit < MAX_FLAGS ? bit : -ENOSPC;
 }
 EXPORT_SYMBOL(pasemi_dma_alloc_fun);
 
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c 
b/arch/powerpc/platforms/powernv/pci-sriov.c
index 59882da3e742..640e387e6d83 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -397,18 +397,12 @@ static int64_t pnv_ioda_map_m64_single(struct pnv_phb 
*phb,
 
 static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov)
 {
-   int win;
+   int win = find_and_set_bit(&phb->ioda.m64_bar_alloc, 
phb->ioda.m64_bar_idx + 1);
 
-   do {
-   win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
-   phb->ioda.m64_bar_idx + 1, 0);
-
-   if (win >= phb->ioda.m64_bar_idx + 1)
-   return -1;
-   } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+   if (win >= phb->ioda.m64_bar_idx + 1)
+   return -1;
 
set_bit(win, iov->used_m64_bar_mask);
-
return win;
 }
 
-- 
2.40.1



[PATCH v2 00/35] bitops: add atomic find_bit() operations

2023-12-03 Thread Yury Norov
1-31 ("drivers/perf: optimize 
m1_pmu_get_event_idx()...") @ Marc Zyngier;
 - Drop unneeded patch #v1-12 ("wifi: intel: use atomic find_bit() API...") @ 
Johannes Berg;
 - Patch #v1-15: split SCSI changes per subsystems @ Bart Van Assche;
 - Patch  #5: keep changes inside __mm_cid_try_get() @ Mathieu Desnoyers;
 - Patch  #8: use find_and_set_next_bit() @ Will Deacon;
 - Patch #13: keep test against stimer->config.enable @ Vitaly Kuznetsov;
 - Patch #15: use find_and_set_next_bit @ Bart Van Assche;
 - Patch #31: edit commit message @ Tony Lu, Alexandra Winter;
 - Patch #35: edit tag @ John Paul Adrian Glaubitz;

Yury Norov (35):
  lib/find: add atomic find_bit() primitives
  lib/find: add test for atomic find_bit() ops
  lib/sbitmap; make __sbitmap_get_word() using find_and_set_bit()
  watch_queue: use atomic find_bit() in post_one_notification()
  sched: add cpumask_find_and_set() and use it in __mm_cid_get()
  mips: sgi-ip30: rework heart_alloc_int()
  sparc: fix opencoded find_and_set_bit() in alloc_msi()
  perf/arm: optimize opencoded atomic find_bit() API
  drivers/perf: optimize ali_drw_get_counter_idx() by using find_bit()
  dmaengine: idxd: optimize perfmon_assign_event()
  ath10k: optimize ath10k_snoc_napi_poll() by using find_bit()
  wifi: rtw88: optimize rtw_pci_tx_kick_off() by using find_bit()
  KVM: x86: hyper-v: optimize and cleanup kvm_hv_process_stimers()
  PCI: hv: switch hv_get_dom_num() to use atomic find_bit()
  scsi: core: use atomic find_bit() API where appropriate
  scsi: mpi3mr: switch to using atomic find_and_set_bit()
  scsi: qedi: rework qedi_get_task_idx()
  powerpc: use atomic find_bit() API where appropriate
  iommu: use atomic find_bit() API where appropriate
  media: radio-shark: use atomic find_bit() API where appropriate
  sfc: switch to using atomic find_bit() API where appropriate
  tty: nozomi: optimize interrupt_handler()
  usb: cdc-acm: optimize acm_softint()
  block: null_blk: fix opencoded find_and_set_bit() in get_tag()
  RDMA/rtrs: fix opencoded find_and_set_bit_lock() in
__rtrs_get_permit()
  mISDN: optimize get_free_devid()
  media: em28xx: cx231xx: fix opencoded find_and_set_bit()
  ethernet: rocker: optimize ofdpa_port_internal_vlan_id_get()
  serial: sc12is7xx: optimize sc16is7xx_alloc_line()
  bluetooth: optimize cmtp_alloc_block_id()
  net: smc:  use find_and_set_bit() in smc_wr_tx_get_free_slot_index()
  ALSA: use atomic find_bit() functions where applicable
  m68k: rework get_mmu_context()
  microblaze: rework get_mmu_context()
  sh: mach-x3proto: rework ilsel_enable()

 arch/m68k/include/asm/mmu_context.h  |  11 +-
 arch/microblaze/include/asm/mmu_context_mm.h |  11 +-
 arch/mips/sgi-ip30/ip30-irq.c|  12 +-
 arch/powerpc/mm/book3s32/mmu_context.c   |  10 +-
 arch/powerpc/platforms/pasemi/dma_lib.c  |  45 +--
 arch/powerpc/platforms/powernv/pci-sriov.c   |  12 +-
 arch/sh/boards/mach-x3proto/ilsel.c  |   4 +-
 arch/sparc/kernel/pci_msi.c  |   9 +-
 arch/x86/kvm/hyperv.c|  39 ++-
 drivers/block/null_blk/main.c|  41 +--
 drivers/dma/idxd/perfmon.c   |   8 +-
 drivers/infiniband/ulp/rtrs/rtrs-clt.c   |  15 +-
 drivers/iommu/arm/arm-smmu/arm-smmu.h|  10 +-
 drivers/iommu/msm_iommu.c|  18 +-
 drivers/isdn/mISDN/core.c|   9 +-
 drivers/media/radio/radio-shark.c|   5 +-
 drivers/media/radio/radio-shark2.c   |   5 +-
 drivers/media/usb/cx231xx/cx231xx-cards.c|  16 +-
 drivers/media/usb/em28xx/em28xx-cards.c  |  37 +--
 drivers/net/ethernet/rocker/rocker_ofdpa.c   |  11 +-
 drivers/net/ethernet/sfc/rx_common.c |   4 +-
 drivers/net/ethernet/sfc/siena/rx_common.c   |   4 +-
 drivers/net/ethernet/sfc/siena/siena_sriov.c |  14 +-
 drivers/net/wireless/ath/ath10k/snoc.c   |   9 +-
 drivers/net/wireless/realtek/rtw88/pci.c |   5 +-
 drivers/net/wireless/realtek/rtw89/pci.c |   5 +-
 drivers/pci/controller/pci-hyperv.c  |   7 +-
 drivers/perf/alibaba_uncore_drw_pmu.c|  10 +-
 drivers/perf/arm-cci.c   |  24 +-
 drivers/perf/arm-ccn.c   |  10 +-
 drivers/perf/arm_dmc620_pmu.c|   9 +-
 drivers/perf/arm_pmuv3.c |   8 +-
 drivers/scsi/mpi3mr/mpi3mr_os.c  |  21 +-
 drivers/scsi/qedi/qedi_main.c|   9 +-
 drivers/scsi/scsi_lib.c  |   7 +-
 drivers/tty/nozomi.c |   5 +-
 drivers/tty/serial/sc16is7xx.c   |   8 +-
 drivers/usb/class/cdc-acm.c  |   5 +-
 include/linux/cpumask.h  |  12 +
 include/linux/find.h | 293 +++
 kernel/sched/sched.h |  14 +-
 kernel/watch_queue.c |   6 +-
 lib/find_bit.c   |  85 ++
 lib/sbi

[PATCH v2 01/35] lib/find: add atomic find_bit() primitives

2023-12-03 Thread Yury Norov
Add helpers around test_and_{set,clear}_bit() that allow to search for
clear or set bits and flip them atomically.

The target patterns may look like this:

for (idx = 0; idx < nbits; idx++)
if (test_and_clear_bit(idx, bitmap))
do_something(idx);

Or like this:

do {
bit = find_first_bit(bitmap, nbits);
if (bit >= nbits)
return nbits;
} while (!test_and_clear_bit(bit, bitmap));
return bit;

In both cases, the opencoded loop may be converted to a single function
or iterator call. Correspondingly:

for_each_test_and_clear_bit(idx, bitmap, nbits)
do_something(idx);

Or:
return find_and_clear_bit(bitmap, nbits);

Obviously, the less routine code people have to write themselves, the
lower the probability of making a mistake.

Those are not only handy helpers but also resolve a non-trivial
issue of using non-atomic find_bit() together with atomic
test_and_{set,clear}_bit().

The trick is that find_bit() implies that the bitmap is a regular
non-volatile piece of memory, and compiler is allowed to use such
optimization techniques like re-fetching memory instead of caching it.

For example, find_first_bit() is implemented like this:

  for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {
  val = addr[idx];
  if (val) {
  sz = min(idx * BITS_PER_LONG + __ffs(val), sz);
  break;
  }
  }

On register-memory architectures like x86, the compiler may decide to
access memory twice: first to compare the value against 0, and then to
fetch it again to pass to __ffs().

When running find_first_bit() on volatile memory, the memory may get
changed in-between, and for instance, it may lead to passing 0 to
__ffs(), which is undefined. This is a potentially dangerous call.

find_and_clear_bit() as a wrapper around test_and_clear_bit()
naturally treats underlying bitmap as a volatile memory and prevents
compiler from such optimizations.

Now that KCSAN catches exactly this type of situation and warns about
such racy memory modifications, we can use it to reveal improper usage
of find_bit() and convert it to atomic find_and_*_bit() as appropriate.

The 1st patch of the series adds the following atomic primitives:

find_and_set_bit(addr, nbits);
find_and_set_next_bit(addr, nbits, start);
...

Here find_and_{set,clear} part refers to the corresponding
test_and_{set,clear}_bit function. Suffixes like _wrap or _lock
derive their semantics from corresponding find() or test() functions.

For brevity, the naming omits the fact that we search for zero bit in
find_and_set, and correspondingly search for set bit in find_and_clear
functions.

The patch also adds iterators with atomic semantics, like
for_each_test_and_set_bit(). Here, the naming rule is to simply prefix
corresponding atomic operation with 'for_each'.

All users of find_bit() API, where heavy concurrency is expected,
are encouraged to switch to atomic find_and_bit() as appropriate.
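
As a usage illustration (a hypothetical caller, not code taken from this
series), a tag allocator that currently open-codes the find/test-and-set
retry loop could be reduced to:

/*
 * Grab a free tag with acquire semantics, or fail if the map is full.
 * The helper is expected to return a value >= nr_tags when no zero bit
 * could be claimed, as the conversions elsewhere in this series assume.
 */
static int alloc_tag(unsigned long *tag_map, unsigned int nr_tags)
{
        unsigned long tag = find_and_set_bit_lock(tag_map, nr_tags);

        return tag < nr_tags ? (int)tag : -1;
}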

CC: Bart Van Assche 
CC: Sergey Shtylyov 
Signed-off-by: Yury Norov 
---
 include/linux/find.h | 293 +++
 lib/find_bit.c   |  85 +
 2 files changed, 378 insertions(+)

diff --git a/include/linux/find.h b/include/linux/find.h
index 5e4f39ef2e72..79b0e2589725 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -32,6 +32,16 @@ extern unsigned long _find_first_and_bit(const unsigned long 
*addr1,
 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned 
long size);
 extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long 
size);
 
+unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long 
nbits);
+unsigned long _find_and_set_next_bit(volatile unsigned long *addr, unsigned 
long nbits,
+   unsigned long start);
+unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned 
long nbits);
+unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr, 
unsigned long nbits,
+ unsigned long start);
+unsigned long _find_and_clear_bit(volatile unsigned long *addr, unsigned long 
nbits);
+unsigned long _find_and_clear_next_bit(volatile unsigned long *addr, unsigned 
long nbits,
+   unsigned long start);
+
 #ifdef __BIG_ENDIAN
 unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long 
size);
 unsigned long _find_next_zero_bit_le(const  unsigned long *addr, unsigned
@@ -460,6 +470,267 @@ unsigned long __for_each_wrap(const unsigned long 
*bitmap, unsigned long size,
return bit < start ? bit : size;
 }
 
+/**
+ * find_and_set_bit - Find a zero bit and set it atomically
+ * @addr: The address to base the search on
+ * @nbits: The bitmap size in bits
+ *
+ * Thi

[PATCH v2 02/35] lib/find: add test for atomic find_bit() ops

2023-12-03 Thread Yury Norov
Add basic functionality test for new API.

Signed-off-by: Yury Norov 
---
 lib/test_bitmap.c | 61 +++
 1 file changed, 61 insertions(+)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 65f22c2578b0..277e1ca9fd28 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -221,6 +221,65 @@ static void __init test_zero_clear(void)
expect_eq_pbl("", bmap, 1024);
 }
 
+static void __init test_find_and_bit(void)
+{
+   unsigned long w, w_part, bit, cnt = 0;
+   DECLARE_BITMAP(bmap, EXP1_IN_BITS);
+
+   /*
+* Test find_and_clear{_next}_bit() and corresponding
+* iterators
+*/
+   bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+   w = bitmap_weight(bmap, EXP1_IN_BITS);
+
+   for_each_test_and_clear_bit(bit, bmap, EXP1_IN_BITS)
+   cnt++;
+
+   expect_eq_uint(w, cnt);
+   expect_eq_uint(0, bitmap_weight(bmap, EXP1_IN_BITS));
+
+   bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+   w = bitmap_weight(bmap, EXP1_IN_BITS);
+   w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3);
+
+   cnt = 0;
+   bit = EXP1_IN_BITS / 3;
+   for_each_test_and_clear_bit_from(bit, bmap, EXP1_IN_BITS)
+   cnt++;
+
+   expect_eq_uint(bitmap_weight(bmap, EXP1_IN_BITS), bitmap_weight(bmap, 
EXP1_IN_BITS / 3));
+   expect_eq_uint(w_part, bitmap_weight(bmap, EXP1_IN_BITS));
+   expect_eq_uint(w - w_part, cnt);
+
+   /*
+* Test find_and_set{_next}_bit() and corresponding
+* iterators
+*/
+   bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+   w = bitmap_weight(bmap, EXP1_IN_BITS);
+   cnt = 0;
+
+   for_each_test_and_set_bit(bit, bmap, EXP1_IN_BITS)
+   cnt++;
+
+   expect_eq_uint(EXP1_IN_BITS - w, cnt);
+   expect_eq_uint(EXP1_IN_BITS, bitmap_weight(bmap, EXP1_IN_BITS));
+
+   bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+   w = bitmap_weight(bmap, EXP1_IN_BITS);
+   w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3);
+   cnt = 0;
+
+   bit = EXP1_IN_BITS / 3;
+   for_each_test_and_set_bit_from(bit, bmap, EXP1_IN_BITS)
+   cnt++;
+
+   expect_eq_uint(EXP1_IN_BITS - bitmap_weight(bmap, EXP1_IN_BITS),
+   EXP1_IN_BITS / 3 - bitmap_weight(bmap, EXP1_IN_BITS / 
3));
+   expect_eq_uint(EXP1_IN_BITS * 2 / 3 - (w - w_part), cnt);
+}
+
 static void __init test_find_nth_bit(void)
 {
unsigned long b, bit, cnt = 0;
@@ -1273,6 +1332,8 @@ static void __init selftest(void)
test_for_each_clear_bitrange_from();
test_for_each_set_clump8();
test_for_each_set_bit_wrap();
+
+   test_find_and_bit();
 }
 
 KSTM_MODULE_LOADERS(test_bitmap);
-- 
2.40.1



Re: [PATCH v2 00/35] bitops: add atomic find_bit() operations

2023-12-05 Thread Yury Norov
On Mon, Dec 04, 2023 at 07:51:01PM +0100, Jan Kara wrote:
> Hello Yury!
> 
> On Sun 03-12-23 11:23:47, Yury Norov wrote:
> > Add helpers around test_and_{set,clear}_bit() that allow to search for
> > clear or set bits and flip them atomically.
> > 
> > The target patterns may look like this:
> > 
> > for (idx = 0; idx < nbits; idx++)
> > if (test_and_clear_bit(idx, bitmap))
> > do_something(idx);
> > 
> > Or like this:
> > 
> > do {
> > bit = find_first_bit(bitmap, nbits);
> > if (bit >= nbits)
> > return nbits;
> > } while (!test_and_clear_bit(bit, bitmap));
> > return bit;
> > 
> > In both cases, the opencoded loop may be converted to a single function
> > or iterator call. Correspondingly:
> > 
> > for_each_test_and_clear_bit(idx, bitmap, nbits)
> > do_something(idx);
> > 
> > Or:
> > return find_and_clear_bit(bitmap, nbits);
> 
> These are fine cleanups but they actually don't address the case that has
> triggered all these changes - namely the xarray use of find_next_bit() in
> xas_find_chunk().
> 
> ...
> > This series is a result of discussion [1]. All find_bit() functions imply
> > exclusive access to the bitmaps. However, KCSAN reports quite a number
> > of warnings related to find_bit() API. Some of them are not pointing
> > to real bugs because in many situations people intentionally allow
> > concurrent bitmap operations.
> > 
> > If so, find_bit() can be annotated such that KCSAN will ignore it:
> > 
> > bit = data_race(find_first_bit(bitmap, nbits));
> 
> No, this is not a correct thing to do. If concurrent bitmap changes can
> happen, find_first_bit() as it is currently implemented isn't ever a safe
> choice because it can call __ffs(0) which is dangerous as you properly note
> above. I proposed adding READ_ONCE() into find_first_bit() / find_next_bit()
> implementation to fix this issue but you disliked that. So other option we
> have is adding find_first_bit() and find_next_bit() variants that take
> volatile 'addr' and we have to use these in code like xas_find_chunk()
> which cannot be converted to your new helpers.

Here are some examples where concurrent operations with plain find_bit()
are acceptable:

 - two threads running find_*_bit(): safe wrt ffs(0) and returns correct
   value, because underlying bitmap is unchanged;
 - find_next_bit() in parallel with set or clear_bit(), when modifying
   a bit prior to the start bit to search: safe and correct;
 - find_first_bit() in parallel with set_bit(): safe, but may return wrong
   bit number;
 - find_first_zero_bit() in parallel with clear_bit(): same as above.

In the last 2 cases find_bit() may not return the first matching bit,
but that may be OK if the caller needs any (not necessarily the first)
set or clear bit, correspondingly.

In such cases, KCSAN may be safely silenced.
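
For instance (an illustrative sketch, not code from the series), a
caller that fits the third case above could annotate the lookup like
this:

/*
 * Bits in 'pending' are only ever set concurrently, never cleared, so
 * the word find_first_bit() inspects cannot turn into 0 under it; at
 * worst the function returns a bit that is no longer the first one,
 * which this caller does not care about.
 */
bit = data_race(find_first_bit(pending, nbits));
if (bit < nbits)
        handle_pending(bit);    /* hypothetical consumer */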
 
> > This series addresses the other important case where people really need
> > atomic find ops. As the following patches show, the resulting code
> > looks safer and more verbose comparing to opencoded loops followed by
> > atomic bit flips.
> > 
> > In [1] Mirsad reported 2% slowdown in a single-thread search test when
> > switching find_bit() function to treat bitmaps as volatile arrays. On
> > the other hand, kernel robot in the same thread reported +3.7% to the
> > performance of will-it-scale.per_thread_ops test.
> 
> It was actually me who reported the regression here [2] but whatever :)
> 
> [2] https://lore.kernel.org/all/20231011150252.32737-1-j...@suse.cz

My apologies.

> > Assuming that our compilers are sane and generate better code against
> > properly annotated data, the above discrepancy doesn't look weird. When
> > running on non-volatile bitmaps, plain find_bit() outperforms atomic
> > find_and_bit(), and vice-versa.
> > 
> > So, all users of find_bit() API, where heavy concurrency is expected,
> > are encouraged to switch to atomic find_and_bit() as appropriate.
> 
> Well, all users where any concurrency can happen should switch. Otherwise
> they are prone to the (admittedly mostly theoretical) data race issue.
> 
>   Honza
> -- 
> Jan Kara 
> SUSE Labs, CR


[PATCH v3 18/35] powerpc: optimize arch code by using atomic find_bit() API

2023-12-11 Thread Yury Norov
Use find_and_{set,clear}_bit() where appropriate and simplify the logic.

Signed-off-by: Yury Norov 
---
 arch/powerpc/mm/book3s32/mmu_context.c | 10 ++---
 arch/powerpc/platforms/pasemi/dma_lib.c| 45 +-
 arch/powerpc/platforms/powernv/pci-sriov.c | 12 ++
 3 files changed, 17 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu_context.c 
b/arch/powerpc/mm/book3s32/mmu_context.c
index 1922f9a6b058..7db19f173c2e 100644
--- a/arch/powerpc/mm/book3s32/mmu_context.c
+++ b/arch/powerpc/mm/book3s32/mmu_context.c
@@ -50,13 +50,11 @@ static unsigned long context_map[LAST_CONTEXT / 
BITS_PER_LONG + 1];
 
 unsigned long __init_new_context(void)
 {
-   unsigned long ctx = next_mmu_context;
+   unsigned long ctx;
 
-   while (test_and_set_bit(ctx, context_map)) {
-   ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-   if (ctx > LAST_CONTEXT)
-   ctx = 0;
-   }
+   ctx = find_and_set_next_bit(context_map, LAST_CONTEXT + 1, 
next_mmu_context);
+   if (ctx > LAST_CONTEXT)
+   ctx = 0;
next_mmu_context = (ctx + 1) & LAST_CONTEXT;
 
return ctx;
diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c 
b/arch/powerpc/platforms/pasemi/dma_lib.c
index 1be1f18f6f09..906dabee0132 100644
--- a/arch/powerpc/platforms/pasemi/dma_lib.c
+++ b/arch/powerpc/platforms/pasemi/dma_lib.c
@@ -118,14 +118,9 @@ static int pasemi_alloc_tx_chan(enum pasemi_dmachan_type 
type)
limit = MAX_TXCH;
break;
}
-retry:
-   bit = find_next_bit(txch_free, MAX_TXCH, start);
-   if (bit >= limit)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, txch_free))
-   goto retry;
-
-   return bit;
+
+   bit = find_and_clear_next_bit(txch_free, MAX_TXCH, start);
+   return bit < limit ? bit : -ENOSPC;
 }
 
 static void pasemi_free_tx_chan(int chan)
@@ -136,15 +131,9 @@ static void pasemi_free_tx_chan(int chan)
 
 static int pasemi_alloc_rx_chan(void)
 {
-   int bit;
-retry:
-   bit = find_first_bit(rxch_free, MAX_RXCH);
-   if (bit >= MAX_TXCH)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, rxch_free))
-   goto retry;
-
-   return bit;
+   int bit = find_and_clear_bit(rxch_free, MAX_RXCH);
+
+   return bit < MAX_TXCH ? bit : -ENOSPC;
 }
 
 static void pasemi_free_rx_chan(int chan)
@@ -374,16 +363,9 @@ EXPORT_SYMBOL(pasemi_dma_free_buf);
  */
 int pasemi_dma_alloc_flag(void)
 {
-   int bit;
+   int bit = find_and_clear_bit(flags_free, MAX_FLAGS);
 
-retry:
-   bit = find_first_bit(flags_free, MAX_FLAGS);
-   if (bit >= MAX_FLAGS)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, flags_free))
-   goto retry;
-
-   return bit;
+   return bit < MAX_FLAGS ? bit : -ENOSPC;
 }
 EXPORT_SYMBOL(pasemi_dma_alloc_flag);
 
@@ -439,16 +421,9 @@ EXPORT_SYMBOL(pasemi_dma_clear_flag);
  */
 int pasemi_dma_alloc_fun(void)
 {
-   int bit;
-
-retry:
-   bit = find_first_bit(fun_free, MAX_FLAGS);
-   if (bit >= MAX_FLAGS)
-   return -ENOSPC;
-   if (!test_and_clear_bit(bit, fun_free))
-   goto retry;
+   int bit = find_and_clear_bit(fun_free, MAX_FLAGS);
 
-   return bit;
+   return bit < MAX_FLAGS ? bit : -ENOSPC;
 }
 EXPORT_SYMBOL(pasemi_dma_alloc_fun);
 
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c 
b/arch/powerpc/platforms/powernv/pci-sriov.c
index 59882da3e742..640e387e6d83 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -397,18 +397,12 @@ static int64_t pnv_ioda_map_m64_single(struct pnv_phb 
*phb,
 
 static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov)
 {
-   int win;
+   int win = find_and_set_bit(&phb->ioda.m64_bar_alloc, 
phb->ioda.m64_bar_idx + 1);
 
-   do {
-   win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
-   phb->ioda.m64_bar_idx + 1, 0);
-
-   if (win >= phb->ioda.m64_bar_idx + 1)
-   return -1;
-   } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+   if (win >= phb->ioda.m64_bar_idx + 1)
+   return -1;
 
set_bit(win, iov->used_m64_bar_mask);
-
return win;
 }
 
-- 
2.40.1



[PATCH v3 00/35] bitops: add atomic find_bit() operations

2023-12-11 Thread Yury Norov
v1: https://lore.kernel.org/netdev/20231118155105.25678-29-yury.no...@gmail.com/T/
v2: https://lore.kernel.org/all/20231204185101.ddmkvsr2xxsmoh2u@quack3/T/
v3:
 - collect more reviews;
 - align wording in commit messages @ Bjorn Helgaas;
 - add examples where non-atomic find_bit() may safely race @ Jan Kara;
 - patch  #3: use if-else instead of ternary operator @ Jens Axboe;
 - patch #13: align coding style @ Vitaly Kuznetsov, Sean Christopherson;

Yury Norov (35):
  lib/find: add atomic find_bit() primitives
  lib/find: add test for atomic find_bit() ops
  lib/sbitmap; optimize __sbitmap_get_word() by using find_and_set_bit()
  watch_queue: optimize post_one_notification() by using
find_and_clear_bit()
  sched: add cpumask_find_and_set() and use it in __mm_cid_get()
  mips: sgi-ip30: optimize heart_alloc_int() by using find_and_set_bit()
  sparc: optimize alloc_msi() by using find_and_set_bit()
  perf/arm: use atomic find_bit() API
  drivers/perf: optimize ali_drw_get_counter_idx() by using
find_and_set_bit()
  dmaengine: idxd: optimize perfmon_assign_event()
  ath10k: optimize ath10k_snoc_napi_poll() with an atomic iterator
  wifi: rtw88: optimize the driver by using atomic iterator
  KVM: x86: hyper-v: optimize and cleanup kvm_hv_process_stimers()
  PCI: hv: Optimize hv_get_dom_num() by using find_and_set_bit()
  scsi: core: optimize scsi_evt_emit() by using an atomic iterator
  scsi: mpi3mr: optimize the driver by using find_and_set_bit()
  scsi: qedi: optimize qedi_get_task_idx() by using find_and_set_bit()
  powerpc: optimize arch code by using atomic find_bit() API
  iommu: optimize subsystem by using atomic find_bit() API
  media: radio-shark: optimize driver by using atomic find_bit() API
  sfc: optimize driver by using atomic find_bit() API
  tty: nozomi: optimize interrupt_handler()
  usb: cdc-acm: optimize acm_softint()
  block: null_blk: replace get_tag() with a generic
find_and_set_bit_lock()
  RDMA/rtrs: optimize __rtrs_get_permit() by using
find_and_set_bit_lock()
  mISDN: optimize get_free_devid()
  media: em28xx: cx231xx: optimize drivers by using find_and_set_bit()
  ethernet: rocker: optimize ofdpa_port_internal_vlan_id_get()
  serial: sc12is7xx: optimize sc16is7xx_alloc_line()
  bluetooth: optimize cmtp_alloc_block_id()
  net: smc: optimize smc_wr_tx_get_free_slot_index()
  ALSA: use atomic find_bit() functions where applicable
  m68k: optimize get_mmu_context()
  microblaze: optimize get_mmu_context()
  sh: mach-x3proto: optimize ilsel_enable()

 arch/m68k/include/asm/mmu_context.h  |  11 +-
 arch/microblaze/include/asm/mmu_context_mm.h |  11 +-
 arch/mips/sgi-ip30/ip30-irq.c|  12 +-
 arch/powerpc/mm/book3s32/mmu_context.c   |  10 +-
 arch/powerpc/platforms/pasemi/dma_lib.c  |  45 +--
 arch/powerpc/platforms/powernv/pci-sriov.c   |  12 +-
 arch/sh/boards/mach-x3proto/ilsel.c  |   4 +-
 arch/sparc/kernel/pci_msi.c  |   9 +-
 arch/x86/kvm/hyperv.c|  40 +--
 drivers/block/null_blk/main.c|  41 +--
 drivers/dma/idxd/perfmon.c   |   8 +-
 drivers/infiniband/ulp/rtrs/rtrs-clt.c   |  15 +-
 drivers/iommu/arm/arm-smmu/arm-smmu.h|  10 +-
 drivers/iommu/msm_iommu.c|  18 +-
 drivers/isdn/mISDN/core.c|   9 +-
 drivers/media/radio/radio-shark.c|   5 +-
 drivers/media/radio/radio-shark2.c   |   5 +-
 drivers/media/usb/cx231xx/cx231xx-cards.c|  16 +-
 drivers/media/usb/em28xx/em28xx-cards.c  |  37 +--
 drivers/net/ethernet/rocker/rocker_ofdpa.c   |  11 +-
 drivers/net/ethernet/sfc/rx_common.c |   4 +-
 drivers/net/ethernet/sfc/siena/rx_common.c   |   4 +-
 drivers/net/ethernet/sfc/siena/siena_sriov.c |  14 +-
 drivers/net/wireless/ath/ath10k/snoc.c   |   9 +-
 drivers/net/wireless/realtek/rtw88/pci.c |   5 +-
 drivers/net/wireless/realtek/rtw89/pci.c |   5 +-
 drivers/pci/controller/pci-hyperv.c  |   7 +-
 drivers/perf/alibaba_uncore_drw_pmu.c|  10 +-
 drivers/perf/arm-cci.c   |  24 +-
 drivers/perf/arm-ccn.c   |  10 +-
 drivers/perf/arm_dmc620_pmu.c|   9 +-
 drivers/perf/arm_pmuv3.c |   8 +-
 drivers/scsi/mpi3mr/mpi3mr_os.c  |  21 +-
 drivers/scsi/qedi/qedi_main.c|   9 +-
 drivers/scsi/scsi_lib.c  |   7 +-
 drivers/tty/nozomi.c |   5 +-
 drivers/tty/serial/sc16is7xx.c   |   8 +-
 drivers/usb/class/cdc-acm.c  |   5 +-
 include/linux/cpumask.h  |  12 +
 include/linux/find.h | 293 +++
 kernel/sched/sched.h |  14 +-
 kernel/watch_queue.c |   6 +-
 lib/find_bit.c   |  85 ++
 lib/sbitmap.c|  46 +--
 lib/test_bitmap.c   

[PATCH v3 02/35] lib/find: add test for atomic find_bit() ops

2023-12-11 Thread Yury Norov
Add basic functionality test for new API.

Signed-off-by: Yury Norov 
---
 lib/test_bitmap.c | 61 +++
 1 file changed, 61 insertions(+)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 65f22c2578b0..277e1ca9fd28 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -221,6 +221,65 @@ static void __init test_zero_clear(void)
expect_eq_pbl("", bmap, 1024);
 }
 
+static void __init test_find_and_bit(void)
+{
+   unsigned long w, w_part, bit, cnt = 0;
+   DECLARE_BITMAP(bmap, EXP1_IN_BITS);
+
+   /*
+* Test find_and_clear{_next}_bit() and corresponding
+* iterators
+*/
+   bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+   w = bitmap_weight(bmap, EXP1_IN_BITS);
+
+   for_each_test_and_clear_bit(bit, bmap, EXP1_IN_BITS)
+   cnt++;
+
+   expect_eq_uint(w, cnt);
+   expect_eq_uint(0, bitmap_weight(bmap, EXP1_IN_BITS));
+
+   bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+   w = bitmap_weight(bmap, EXP1_IN_BITS);
+   w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3);
+
+   cnt = 0;
+   bit = EXP1_IN_BITS / 3;
+   for_each_test_and_clear_bit_from(bit, bmap, EXP1_IN_BITS)
+   cnt++;
+
+   expect_eq_uint(bitmap_weight(bmap, EXP1_IN_BITS), bitmap_weight(bmap, 
EXP1_IN_BITS / 3));
+   expect_eq_uint(w_part, bitmap_weight(bmap, EXP1_IN_BITS));
+   expect_eq_uint(w - w_part, cnt);
+
+   /*
+* Test find_and_set{_next}_bit() and corresponding
+* iterators
+*/
+   bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+   w = bitmap_weight(bmap, EXP1_IN_BITS);
+   cnt = 0;
+
+   for_each_test_and_set_bit(bit, bmap, EXP1_IN_BITS)
+   cnt++;
+
+   expect_eq_uint(EXP1_IN_BITS - w, cnt);
+   expect_eq_uint(EXP1_IN_BITS, bitmap_weight(bmap, EXP1_IN_BITS));
+
+   bitmap_copy(bmap, exp1, EXP1_IN_BITS);
+   w = bitmap_weight(bmap, EXP1_IN_BITS);
+   w_part = bitmap_weight(bmap, EXP1_IN_BITS / 3);
+   cnt = 0;
+
+   bit = EXP1_IN_BITS / 3;
+   for_each_test_and_set_bit_from(bit, bmap, EXP1_IN_BITS)
+   cnt++;
+
+   expect_eq_uint(EXP1_IN_BITS - bitmap_weight(bmap, EXP1_IN_BITS),
+   EXP1_IN_BITS / 3 - bitmap_weight(bmap, EXP1_IN_BITS / 
3));
+   expect_eq_uint(EXP1_IN_BITS * 2 / 3 - (w - w_part), cnt);
+}
+
 static void __init test_find_nth_bit(void)
 {
unsigned long b, bit, cnt = 0;
@@ -1273,6 +1332,8 @@ static void __init selftest(void)
test_for_each_clear_bitrange_from();
test_for_each_set_clump8();
test_for_each_set_bit_wrap();
+
+   test_find_and_bit();
 }
 
 KSTM_MODULE_LOADERS(test_bitmap);
-- 
2.40.1



[PATCH v3 01/35] lib/find: add atomic find_bit() primitives

2023-12-11 Thread Yury Norov
Add helpers around test_and_{set,clear}_bit() that allow to search for
clear or set bits and flip them atomically.

The target patterns may look like this:

for (idx = 0; idx < nbits; idx++)
if (test_and_clear_bit(idx, bitmap))
do_something(idx);

Or like this:

do {
bit = find_first_bit(bitmap, nbits);
if (bit >= nbits)
return nbits;
} while (!test_and_clear_bit(bit, bitmap));
return bit;

In both cases, the opencoded loop may be converted to a single function
or iterator call. Correspondingly:

for_each_test_and_clear_bit(idx, bitmap, nbits)
do_something(idx);

Or:
return find_and_clear_bit(bitmap, nbits);

Obviously, the less routine code people have to write themselves, the
lower the probability of making a mistake.

Those are not only handy helpers but also resolve a non-trivial
issue of using non-atomic find_bit() together with atomic
test_and_{set,clear}_bit().

The trick is that find_bit() implies that the bitmap is a regular
non-volatile piece of memory, and the compiler is allowed to use
optimization techniques such as re-fetching memory instead of caching it.

For example, find_first_bit() is implemented like this:

  for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {
  val = addr[idx];
  if (val) {
  sz = min(idx * BITS_PER_LONG + __ffs(val), sz);
  break;
  }
  }

On register-memory architectures, like x86, the compiler may decide to
access memory twice: first to compare against 0, and then to fetch the
value to pass to __ffs().

When find_first_bit() runs on volatile memory, the memory may change
between the two accesses, which may, for instance, lead to passing 0 to
__ffs(), whose result is undefined. This is a potentially dangerous call.
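
For illustration only, forcing a single read of the word, for example
with READ_ONCE(), would close that window (this is just a sketch of the
hazard, not what the series does):

	val = READ_ONCE(addr[idx]);	/* the word is read exactly once */
	if (val) {
		/* __ffs() is guaranteed to see the same non-zero value */
		sz = min(idx * BITS_PER_LONG + __ffs(val), sz);
		break;
	}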

find_and_clear_bit(), as a wrapper around test_and_clear_bit(),
naturally treats the underlying bitmap as volatile memory and prevents
the compiler from applying such optimizations.

KCSAN now catches exactly this type of situation and warns about
concurrent memory modifications. We can use it to reveal improper usage
of find_bit(), and convert it to atomic find_and_*_bit() as appropriate.

In some cases concurrent operations with plain find_bit() are acceptable.
For example:

 - two threads running find_*_bit(): safe wrt ffs(0) and returns correct
   value, because underlying bitmap is unchanged;
 - find_next_bit() in parallel with set or clear_bit(), when modifying
   a bit prior to the start bit to search: safe and correct;
 - find_first_bit() in parallel with set_bit(): safe, but may return wrong
   bit number;
 - find_first_zero_bit() in parallel with clear_bit(): same as above.

In the last two cases find_bit() may not return the correct bit number,
but that may be OK if the caller only needs some (not necessarily the
first) set or clear bit, correspondingly.

In such cases, KCSAN may be safely silenced with data_race(). But in most
cases where KCSAN detects concurrency, people should carefully review
their code and likely protect critical sections or switch to the atomic
find_and_*_bit() helpers, as appropriate.
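
For example, a minimal sketch of silencing KCSAN when any currently set
bit is good enough (the caller and names here are hypothetical):

	/* any set bit will do; tolerate a racy read of the bitmap */
	bit = data_race(find_first_bit(bitmap, nbits));
	if (bit < nbits)
		do_something(bit);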

The 1st patch of the series adds the following atomic primitives:

find_and_set_bit(addr, nbits);
find_and_set_next_bit(addr, nbits, start);
...

Here the find_and_{set,clear} part refers to the corresponding
test_and_{set,clear}_bit() function. Suffixes like _wrap or _lock
derive their semantics from the corresponding find_*() or test_*()
functions.

For brevity, the naming omits the fact that we search for a zero bit
in find_and_set(), and correspondingly for a set bit in find_and_clear()
functions.
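
For example, a typical allocation-style use looks like this (just a
usage sketch; slot_map and MAX_SLOTS are hypothetical names):

	/* atomically find a zero bit, set it and return its index */
	slot = find_and_set_bit(slot_map, MAX_SLOTS);
	if (slot >= MAX_SLOTS)
		return -ENOSPC;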

The patch also adds iterators with atomic semantics, like
for_each_test_and_set_bit(). Here, the naming rule is to simply prefix
the corresponding atomic operation with 'for_each'.

CC: Bart Van Assche 
CC: Sergey Shtylyov 
Signed-off-by: Yury Norov 
---
 include/linux/find.h | 293 +++
 lib/find_bit.c   |  85 +
 2 files changed, 378 insertions(+)

diff --git a/include/linux/find.h b/include/linux/find.h
index 5e4f39ef2e72..237513356ffa 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -32,6 +32,16 @@ extern unsigned long _find_first_and_bit(const unsigned long 
*addr1,
 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned 
long size);
 extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long 
size);
 
+unsigned long _find_and_set_bit(volatile unsigned long *addr, unsigned long 
nbits);
+unsigned long _find_and_set_next_bit(volatile unsigned long *addr, unsigned 
long nbits,
+   unsigned long start);
+unsigned long _find_and_set_bit_lock(volatile unsigned long *addr, unsigned 
long nbits);
+unsigned long _find_and_set_next_bit_lock(volatile unsigned long *addr, 

Re: [PATCH v3 00/35] bitops: add atomic find_bit() operations

2023-12-16 Thread Yury Norov
On Mon, Dec 11, 2023 at 06:27:14PM -0800, Yury Norov wrote:
> Add helpers around test_and_{set,clear}_bit() that allow to search for
> clear or set bits and flip them atomically.
> 
> The target patterns may look like this:
> 
>   for (idx = 0; idx < nbits; idx++)
>   if (test_and_clear_bit(idx, bitmap))
>   do_something(idx);
> 
> Or like this:
> 
>   do {
>   bit = find_first_bit(bitmap, nbits);
>   if (bit >= nbits)
>   return nbits;
>   } while (!test_and_clear_bit(bit, bitmap));
>   return bit;
> 
> In both cases, the opencoded loop may be converted to a single function
> or iterator call. Correspondingly:
> 
>   for_each_test_and_clear_bit(idx, bitmap, nbits)
>   do_something(idx);
> 
> Or:
>   return find_and_clear_bit(bitmap, nbits);
> 
> Obviously, the less routine code people have to write themself, the
> less probability to make a mistake.
> 
> Those are not only handy helpers but also resolve a non-trivial
> issue of using non-atomic find_bit() together with atomic
> test_and_{set,clear)_bit().
> 
> The trick is that find_bit() implies that the bitmap is a regular
> non-volatile piece of memory, and compiler is allowed to use such
> optimization techniques like re-fetching memory instead of caching it.
> 
> For example, find_first_bit() is implemented like this:
> 
>   for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {
>   val = addr[idx];
>   if (val) {
>   sz = min(idx * BITS_PER_LONG + __ffs(val), sz);
>   break;
>   }
>   }
> 
> On register-memory architectures, like x86, compiler may decide to
> access memory twice - first time to compare against 0, and second time
> to fetch its value to pass it to __ffs().
> 
> When running find_first_bit() on volatile memory, the memory may get
> changed in-between, and for instance, it may lead to passing 0 to
> __ffs(), which is undefined. This is a potentially dangerous call.
> 
> find_and_clear_bit() as a wrapper around test_and_clear_bit()
> naturally treats underlying bitmap as a volatile memory and prevents
> compiler from such optimizations.
> 
> Now that KCSAN is catching exactly this type of situations and warns on
> undercover memory modifications. We can use it to reveal improper usage
> of find_bit(), and convert it to atomic find_and_*_bit() as appropriate.
> 
> In some cases concurrent operations with plain find_bit() are acceptable.
> For example:
> 
>  - two threads running find_*_bit(): safe wrt ffs(0) and returns correct
>value, because underlying bitmap is unchanged;
>  - find_next_bit() in parallel with set or clear_bit(), when modifying
>a bit prior to the start bit to search: safe and correct;
>  - find_first_bit() in parallel with set_bit(): safe, but may return wrong
>bit number;
>  - find_first_zero_bit() in parallel with clear_bit(): same as above.
> 
> In last 2 cases find_bit() may not return a correct bit number, but
> it may be OK if caller requires any (not exactly the first) set or clear
> bit, correspondingly.
> 
> In such cases, KCSAN may be safely silenced with data_race(). But in most
> cases where KCSAN detects concurrency people should carefully review their
> code and likely protect critical sections or switch to atomic
> find_and_bit(), as appropriate.
> 
> The 1st patch of the series adds the following atomic primitives:
> 
>   find_and_set_bit(addr, nbits);
>   find_and_set_next_bit(addr, nbits, start);
>   ...
> 
> Here find_and_{set,clear} part refers to the corresponding
> test_and_{set,clear}_bit function. Suffixes like _wrap or _lock
> derive their semantics from corresponding find() or test() functions.
> 
> For brevity, the naming omits the fact that we search for zero bit in
> find_and_set, and correspondingly search for set bit in find_and_clear
> functions.
> 
> The patch also adds iterators with atomic semantics, like
> for_each_test_and_set_bit(). Here, the naming rule is to simply prefix
> corresponding atomic operation with 'for_each'.
> 
> In [1] Jan reported 2% slowdown in a single-thread search test when
> switching find_bit() function to treat bitmaps as volatile arrays. On
> the other hand, kernel robot in the same thread reported +3.7% to the
> performance of will-it-scale.per_thread_ops test.
> 
> Assuming that our compilers are sane and generate better code against
> properly annotated data, the above discrepancy doesn't look weird. When
> running on non-volatile bitmaps, plain find_bit() outperforms atomic
> find_and_bit(),

Re: [PATCH] NUMA: Early use of cpu_to_node() returns 0 instead of the correct node id

2024-01-18 Thread Yury Norov
On Fri, Jan 19, 2024 at 11:32:27AM +0800, Huang Shijie wrote:
> During the kernel booting, the generic cpu_to_node() is called too early in
> arm64, powerpc and riscv when CONFIG_NUMA is enabled.
> 
> There are at least four places in the common code where
> the generic cpu_to_node() is called before it is initialized:
>  1.) early_trace_init() in kernel/trace/trace.c
>  2.) sched_init()   in kernel/sched/core.c
>  3.) init_sched_fair_class()in kernel/sched/fair.c
>  4.) workqueue_init_early() in kernel/workqueue.c
> 
> In order to fix the bug, the patch changes generic cpu_to_node to
> function pointer, and export it for kernel modules.
> Introduce smp_prepare_boot_cpu_start() to wrap the original
> smp_prepare_boot_cpu(), and set cpu_to_node with early_cpu_to_node.
> Introduce smp_prepare_cpus_done() to wrap the original smp_prepare_cpus(),
> and set the cpu_to_node to formal _cpu_to_node().

This adds another level of indirection, I think. Currently cpu_to_node
is a simple inliner. After the patch it would be a real function with
all the associated overhead. Can you share a bloat-o-meter output here?

Regardless, I don't think that the approach is correct. As per your
description, some initialization functions erroneously call
cpu_to_node() instead of early_cpu_to_node() which exists specifically
for that case.

If the above is correct, it's clearly a caller problem, and the fix is
to simply switch all those callers to the early version.

I would also initialize numa_node with NUMA_NO_NODE at its declaration,
so that anyone who calls cpu_to_node() before the variable is properly
initialized at runtime gets NUMA_NO_NODE, which is obviously an error.
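
For instance (just a sketch; the exact definition site of numa_node
differs between configs):

	DEFINE_PER_CPU(int, numa_node) = NUMA_NO_NODE;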

Thanks,
Yury
 
> Signed-off-by: Huang Shijie 
> ---
>  drivers/base/arch_numa.c | 11 +++
>  include/linux/topology.h |  6 ++
>  init/main.c  | 29 +++--
>  3 files changed, 40 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
> index 5b59d133b6af..867a477fa975 100644
> --- a/drivers/base/arch_numa.c
> +++ b/drivers/base/arch_numa.c
> @@ -61,6 +61,17 @@ EXPORT_SYMBOL(cpumask_of_node);
>  
>  #endif
>  
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> +int _cpu_to_node(int cpu)
> +{
> + return per_cpu(numa_node, cpu);
> +}
> +int (*cpu_to_node)(int cpu);
> +EXPORT_SYMBOL(cpu_to_node);
> +#endif
> +#endif
> +
>  static void numa_update_cpu(unsigned int cpu, bool remove)
>  {
>   int nid = cpu_to_node(cpu);
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index 52f5850730b3..e7ce2bae11dd 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -91,10 +91,8 @@ static inline int numa_node_id(void)
>  #endif
>  
>  #ifndef cpu_to_node
> -static inline int cpu_to_node(int cpu)
> -{
> - return per_cpu(numa_node, cpu);
> -}
> +extern int (*cpu_to_node)(int cpu);
> +extern int _cpu_to_node(int cpu);
>  #endif
>  
>  #ifndef set_numa_node
> diff --git a/init/main.c b/init/main.c
> index e24b0780fdff..b142e9c51161 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -870,6 +870,18 @@ static void __init print_unknown_bootoptions(void)
>   memblock_free(unknown_options, len);
>  }
>  
> +static void __init smp_prepare_boot_cpu_start(void)
> +{
> + smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
> +
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> + /* The early_cpu_to_node should be ready now. */
> + cpu_to_node = early_cpu_to_node;
> +#endif
> +#endif
> +}
> +
>  asmlinkage __visible __init __no_sanitize_address __noreturn 
> __no_stack_protector
>  void start_kernel(void)
>  {
> @@ -899,7 +911,7 @@ void start_kernel(void)
>   setup_command_line(command_line);
>   setup_nr_cpu_ids();
>   setup_per_cpu_areas();
> - smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
> + smp_prepare_boot_cpu_start();
>   boot_cpu_hotplug_init();
>  
>   pr_notice("Kernel command line: %s\n", saved_command_line);
> @@ -1519,6 +1531,19 @@ void __init console_on_rootfs(void)
>   fput(file);
>  }
>  
> +static void __init smp_prepare_cpus_done(unsigned int setup_max_cpus)
> +{
> + /* Different ARCHs may override smp_prepare_cpus() */
> + smp_prepare_cpus(setup_max_cpus);
> +
> +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
> +#ifndef cpu_to_node
> + /* Change to the formal function. */
> + cpu_to_node = _cpu_to_node;
> +#endif
> +#endif
> +}
> +
>  static noinline void __init kernel_init_freeable(void)
>  {
>   /* Now the scheduler is fully set up and can do blocking allocations */
> @@ -1531,7 +1556,7 @@ static noinline void __init kernel_init_freeable(void)
>  
>   cad_pid = get_pid(t


Re: [PATCH] NUMA: Early use of cpu_to_node() returns 0 instead of the correct node id

2024-01-19 Thread Yury Norov
On Fri, Jan 19, 2024 at 04:50:53PM +0800, Shijie Huang wrote:
> 
> 在 2024/1/19 16:42, Mike Rapoport 写道:
> > On Fri, Jan 19, 2024 at 02:46:16PM +0800, Shijie Huang wrote:
> > > 在 2024/1/19 12:42, Yury Norov 写道:
> > > > This adds another level of indirection, I think. Currently cpu_to_node
> > > > is a simple inliner. After the patch it would be a real function with
> > > > all the associate overhead. Can you share a bloat-o-meter output here?
> > > #./scripts/bloat-o-meter vmlinux vmlinux.new
> > > add/remove: 6/1 grow/shrink: 61/51 up/down: 1168/-588 (580)
> > > Function old new   delta
> > > numa_update_cpu  148 244 +96
> > > 
> > >   
> > > ...(to
> > >  many to skip)
> > > 
> > > Total: Before=32990130, After=32990710, chg +0.00%
> > It's not only about text size, the indirect call also hurts performance
> 
> The cpu_to_node() is called at very low frequency, most of the times is in
> the kernel booting time.
 
That doesn't matter. This function is a simple inliner that dereferences
a pointer, and I believe all of us want to keep it simple. 
 
> > > > Regardless, I don't think that the approach is correct. As per your
> > > > description, some initialization functions erroneously call
> > > > cpu_to_node() instead of early_cpu_to_node() which exists specifically
> > > > for that case.
> > > > 
> > > > If the above correct, it's clearly a caller problem, and the fix is to
> > > > simply switch all those callers to use early version.
> > > It is easy to change to early_cpu_to_node() for sched_init(),
> > > init_sched_fair_class()
> > > 
> > > and workqueue_init_early(). These three places call the cpu_to_node() in 
> > > the
> > > __init function.
> > > 
> > > 
> > > But it is a little hard to change the early_trace_init(), since it calls
> > > cpu_to_node in the deep
> > > 
> > > function stack:
> > > 
> > >    early_trace_init() --> ring_buffer_alloc() -->rb_allocate_cpu_buffer()
> > > 
> > > 
> > > For early_trace_init(), we need to change more code.
> > > 
> > > 
> > > Anyway, If we think it is not a good idea to change the common code, I am
> > > oaky too.
> > Is there a fundamental reason to have early_cpu_to_node() at all?
> 
> The early_cpu_to_node does not work on some ARCHs (which support the NUMA),
> such
> 
> as  SPARC, MIPS and S390.

So, your approach wouldn't work either, right? I think you've got a
testing bot report on it already...

You can make it like this:

  #ifdef CONFIG_ARCH_NO_EARLY_CPU_TO_NODE
  #define early_cpu_to_node cpu_to_node
  #endif
 
> > It seems that all the mappings are known by the end of setup_arch() and the
> > initialization of numa_node can be moved earlier.
> > > > I would also initialize the numa_node with NUMA_NO_NODE at declaration,
> > > > so that if someone calls cpu_to_node() before the variable is properly
> > > > initialized at runtime, he'll get NO_NODE, which is obviously an error.
> > > Even we set the numa_node with NUMA_NO_NODE, it does not always produce
> > > error.

You can print this error yourself:

  #ifndef cpu_to_node
  static inline int cpu_to_node(int cpu)
  {
int node = per_cpu(numa_node, cpu);

  #ifdef CONFIG_DEBUG_PER_CPU_MAPS
if (node == NUMA_NO_NODE)
pr_err(...);
  #endif

  return node;
  }
  #endif



Re: [PATCH v2] NUMA: Early use of cpu_to_node() returns 0 instead of the correct node id

2024-01-24 Thread Yury Norov
On Wed, Jan 24, 2024 at 09:19:00AM -0800, Lameter, Christopher wrote:
> On Tue, 23 Jan 2024, Huang Shijie wrote:
> 
> > During the kernel booting, the generic cpu_to_node() is called too early in
> > arm64, powerpc and riscv when CONFIG_NUMA is enabled.
> > 
> > For arm64/powerpc/riscv, there are at least four places in the common code
> > where the generic cpu_to_node() is called before it is initialized:
> >1.) early_trace_init() in kernel/trace/trace.c
> >2.) sched_init()   in kernel/sched/core.c
> >3.) init_sched_fair_class()in kernel/sched/fair.c
> >4.) workqueue_init_early() in kernel/workqueue.c
> > 
> > In order to fix the bug, the patch changes generic cpu_to_node to
> > function pointer, and export it for kernel modules.
> > Introduce smp_prepare_boot_cpu_start() to wrap the original
> > smp_prepare_boot_cpu(), and set cpu_to_node with early_cpu_to_node.
> > Introduce smp_prepare_cpus_done() to wrap the original smp_prepare_cpus(),
> > and set the cpu_to_node to formal _cpu_to_node().
> 
> Would  you please fix this cleanly without a function pointer?
> 
> What I think needs to be done is a patch series.
> 
> 1. Instrument cpu_to_node so that some warning is issued if it is used too
> early. Preloading the array with NUMA_NO_NODE would allow us to do that.

By preloading do you mean compile-time initialization?
 
> 2. Implement early_cpu_to_node on platforms that currently do not have it.
> 
> 3. A series of patches that fix each place where cpu_to_node is used too
> early.

Agree. This is the right way to go. And pretty much all of it was
discussed in v1, wasn't it?

Thanks,
Yury


Re: [PATCH v3 RESEND 4/6] bitmap: Introduce bitmap_off()

2024-02-12 Thread Yury Norov
On Mon, Feb 12, 2024 at 08:56:32AM +0100, Herve Codina wrote:
> The bitmap_onto() function translates one bitmap relative to another but
> no function are present to perform the reverse translation.
> 
> Introduce bitmap_off() to fill this hole.
> 
> Signed-off-by: Herve Codina 
> ---
>  include/linux/bitmap.h |  3 +++
>  lib/bitmap.c   | 42 ++
>  2 files changed, 45 insertions(+)
> 
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 99451431e4d6..5ecfcbbc91f4 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -65,6 +65,7 @@ struct device;
>   *  bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
>   *  bitmap_bitremap(oldbit, old, new, nbits)newbit = map(old, 
> new)(oldbit)
>   *  bitmap_onto(dst, orig, relmap, nbits)   *dst = orig relative to 
> relmap
> + *  bitmap_off(dst, orig, relmap, nbits)*dst = bitmap_onto() reverse 
> operation
>   *  bitmap_fold(dst, orig, sz, nbits)   dst bits = orig bits mod sz
>   *  bitmap_parse(buf, buflen, dst, nbits)   Parse bitmap dst from kernel 
> buf
>   *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user 
> buf
> @@ -208,6 +209,8 @@ int bitmap_bitremap(int oldbit,
>   const unsigned long *old, const unsigned long *new, int bits);
>  void bitmap_onto(unsigned long *dst, const unsigned long *orig,
>   const unsigned long *relmap, unsigned int bits);
> +void bitmap_off(unsigned long *dst, const unsigned long *orig,
> + const unsigned long *relmap, unsigned int bits);
>  void bitmap_fold(unsigned long *dst, const unsigned long *orig,
>   unsigned int sz, unsigned int nbits);
>  
> diff --git a/lib/bitmap.c b/lib/bitmap.c
> index 2feccb5047dc..71343967335e 100644
> --- a/lib/bitmap.c
> +++ b/lib/bitmap.c
> @@ -682,6 +682,48 @@ void bitmap_onto(unsigned long *dst, const unsigned long 
> *orig,
>  }
>  EXPORT_SYMBOL(bitmap_onto);
>  
> +/**
> + * bitmap_off - revert operation done by bitmap_onto()

This is definitely a bad name. I have no better idea, but even
bitmap_onto_revert() would be better.

> + * @dst: resulting translated bitmap
> + * @orig: original untranslated bitmap
> + * @relmap: bitmap relative to which translated
> + * @bits: number of bits in each of these bitmaps
> + *
> + * Suppose onto computed using bitmap_onto(onto, src, relmap, n)
> + * The operation bitmap_off(result, onto, relmap, n) leads to a
> + * result equal or equivalent to src.

Agree with Rasmus. This should be well tested.

> + * The result can be 'equivalent' because bitmap_onto() and
> + * bitmap_off() are not bijective.
> + * The result and src values are equivalent in that sense that a
> + * call to bitmap_onto(onto, src, relmap, n) and a call to
> + * bitmap_onto(onto, result, relmap, n) will lead to the same onto
> + * value.

Did you mean "a call to bitmap_onto(onto, src, relmap, n) and a
call to bitmap_off(onto, result, relmap, n)"? 

I think the whole paragraph adds more confusion than explanation.
If a new function is supposed to revert the result of some other
function, I'd rather focus on testing that it actually reverts as
advertised, and keep the description as brief as possible.
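
For instance, a round-trip check along these lines in lib/test_bitmap.c
would say more than any wording (just a sketch, assuming the existing
expect_eq_bitmap() helper):

	bitmap_onto(onto, src, relmap, nbits);
	bitmap_off(result, onto, relmap, nbits);

	/* 'result' must map onto the same value as 'src' did */
	bitmap_onto(onto2, result, relmap, nbits);
	expect_eq_bitmap(onto2, onto, nbits);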

> + * If either of @orig or @relmap is empty (no set bits), then @dst
> + * will be returned empty.

Is this an exception to the 'revert' policy? It doesn't look like it.
So why mention this specific case?

> + * All bits in @dst not set by the above rule are cleared.

The above rule is about empty @orig and @relmap, not about setting
bits. What did you mean here?

> + */
> +void bitmap_off(unsigned long *dst, const unsigned long *orig,
> + const unsigned long *relmap, unsigned int bits)
> +{
> + unsigned int n, m;  /* same meaning as in above comment */

In the above comment, n means the size of bitmaps, and m is not
mentioned at all.

> + if (dst == orig)/* following doesn't handle inplace mappings */
> + return;
> + bitmap_zero(dst, bits);

Can you add an empty line after 'return'.

> + m = 0;
> + for_each_set_bit(n, relmap, bits) {
> + /* m == bitmap_pos_to_ord(relmap, n, bits) */

I don't think we need this comment here. If you want to underline that
m tracks the bit order, just give it a more descriptive name, for
example 'bit_order'.

> + if (test_bit(n, orig))
> + set_bit(m, dst);
> + m++;
> + }
> +}
> +EXPORT_SYMBOL(bitmap_off);
> +
>  #ifdef CONFIG_NUMA
>  /**
>   * bitmap_fold - fold larger bitmap into smaller, modulo specified size
> -- 
> 2.43.0


Re: [PATCH v3 RESEND 4/6] bitmap: Introduce bitmap_off()

2024-02-12 Thread Yury Norov
On Mon, Feb 12, 2024 at 10:37:18AM -0800, Yury Norov wrote:
> On Mon, Feb 12, 2024 at 08:56:32AM +0100, Herve Codina wrote:
> > The bitmap_onto() function translates one bitmap relative to another but
> > no function are present to perform the reverse translation.
> > 
> > Introduce bitmap_off() to fill this hole.
> > 
> > Signed-off-by: Herve Codina 
> > ---
> >  include/linux/bitmap.h |  3 +++
> >  lib/bitmap.c   | 42 ++
> >  2 files changed, 45 insertions(+)
> > 
> > diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> > index 99451431e4d6..5ecfcbbc91f4 100644
> > --- a/include/linux/bitmap.h
> > +++ b/include/linux/bitmap.h
> > @@ -65,6 +65,7 @@ struct device;
> >   *  bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
> >   *  bitmap_bitremap(oldbit, old, new, nbits)newbit = map(old, 
> > new)(oldbit)
> >   *  bitmap_onto(dst, orig, relmap, nbits)   *dst = orig relative to 
> > relmap
> > + *  bitmap_off(dst, orig, relmap, nbits)*dst = bitmap_onto() 
> > reverse operation
> >   *  bitmap_fold(dst, orig, sz, nbits)   dst bits = orig bits mod sz
> >   *  bitmap_parse(buf, buflen, dst, nbits)   Parse bitmap dst from 
> > kernel buf
> >   *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user 
> > buf
> > @@ -208,6 +209,8 @@ int bitmap_bitremap(int oldbit,
> > const unsigned long *old, const unsigned long *new, int bits);
> >  void bitmap_onto(unsigned long *dst, const unsigned long *orig,
> > const unsigned long *relmap, unsigned int bits);
> > +void bitmap_off(unsigned long *dst, const unsigned long *orig,
> > +   const unsigned long *relmap, unsigned int bits);
> >  void bitmap_fold(unsigned long *dst, const unsigned long *orig,
> > unsigned int sz, unsigned int nbits);
> >  
> > diff --git a/lib/bitmap.c b/lib/bitmap.c
> > index 2feccb5047dc..71343967335e 100644
> > --- a/lib/bitmap.c
> > +++ b/lib/bitmap.c
> > @@ -682,6 +682,48 @@ void bitmap_onto(unsigned long *dst, const unsigned 
> > long *orig,
> >  }
> >  EXPORT_SYMBOL(bitmap_onto);
> >  
> > +/**
> > + * bitmap_off - revert operation done by bitmap_onto()
> 
> This is definitely a bad name. I've no a better idea, but even
> bitmap_onto_revert() would be better.
> 
> > + * @dst: resulting translated bitmap
> > + * @orig: original untranslated bitmap
> > + * @relmap: bitmap relative to which translated
> > + * @bits: number of bits in each of these bitmaps
> > + *
> > + * Suppose onto computed using bitmap_onto(onto, src, relmap, n)
> > + * The operation bitmap_off(result, onto, relmap, n) leads to a
> > + * result equal or equivalent to src.
> 
> Agree with Rasmus. This should be well tested.
> 
> > + * The result can be 'equivalent' because bitmap_onto() and
> > + * bitmap_off() are not bijective.
> > + * The result and src values are equivalent in that sense that a
> > + * call to bitmap_onto(onto, src, relmap, n) and a call to
> > + * bitmap_onto(onto, result, relmap, n) will lead to the same onto
> > + * value.
> 
> Did you mean "a call to bitmap_onto(onto, src, relmap, n) and a
> call to bitmap_off(onto, result, relmap, n)"? 
> 
> I think the whole paragraph adds more confusion than explanations.
> If a new function is supposed to revert the result of some other
> function, I'd better focus on testing that it actually reverts as
> advertised, and keep description as brief as possible.
> 
> > + * If either of @orig or @relmap is empty (no set bits), then @dst
> > + * will be returned empty.
> 
> Is this an exception from the 'revert' policy? Doesn't look like that.
> So, what for mentioning this specific case?
> 
> > + * All bits in @dst not set by the above rule are cleared.
> 
> The above rule is about empty @orig and @relmap, not about setting
> bits. What did you mean here?
> 
> > + */
> > +void bitmap_off(unsigned long *dst, const unsigned long *orig,
> > +   const unsigned long *relmap, unsigned int bits)
> > +{
> > +   unsigned int n, m;  /* same meaning as in above comment */
> 
> In the above comment, n means the size of bitmaps, and m is not
> mentioned at all.
> 
> > +   if (dst == orig)/* following doesn't handle inplace mappings */
> > +   return;
> > +   bitmap_zero(dst, bits);
> 
> Can you add an empty line after 'return'.
> 
> >

Re: [PATCH v3 RESEND 3/6] bitmap: Make bitmap_onto() available to users

2024-02-12 Thread Yury Norov
On Mon, Feb 12, 2024 at 04:36:36PM +0200, Andy Shevchenko wrote:
> On Mon, Feb 12, 2024 at 03:20:22PM +0100, Herve Codina wrote:
> > On Mon, 12 Feb 2024 16:01:38 +0200
> > Andy Shevchenko  wrote:
> 
> ...
> 
> > Agree, the bitmap_onto() code is simpler to understand than its help.
> > 
> > I introduced bitmap_off() to be the "reverse" bitmap_onto() operations
> > and I preferred to avoid duplicating function that do the same things.
> > 
> > On my side, I initially didn't use the bitmap_*() functions and did the the
> > bits manipulation by hand.
> > During the review, it was suggested to use the bitmap_*() family and I 
> > followed
> > this suggestion.
> 
> I also would go this way, the problems I see with the current implementation 
> are:

Sure, opencoding and duplicating the functionality is always a bad
idea.

> - being related to NUMA (and as Rasmus once pointed out better to be there);

It's 'related to NUMA' for one reason only: it's used by NUMA code
alone. There is nothing NUMA-specific in the function itself.

Now that we've got a non-NUMA user, bitmap_onto() is not related
to NUMA anymore.

> - unclear naming, esp. proposed bitmap_off();

That I agree with. The scatter/gather naming from your last approach
sounds better.
Do you plan to send a v2?

> - the quite hard to understand help text

Yes, we need a picture that illustrates what actually happens.

> - atomicity when it's not needed (AFAICT).

Agree. A series of atomic ops is not atomic. For example

if (test_bit(n, map))
set_bit(m, map);

is not atomic as a whole. And this is what we do in bitmap_onto/off()
in a loop. This must be fixed by using the underscored (non-atomic)
versions.
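
For the loop in bitmap_off() that means something like this (sketch):

	for_each_set_bit(n, relmap, bits) {
		if (test_bit(n, orig))
			__set_bit(m, dst);	/* dst is private here, no atomics needed */
		m++;
	}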

> > I did tests to be sure that bitmap_onto() and bitmap_off() did
> > exactly the same things as my previous code did.
> 
> Yuri, what do you think about all this?

I think your scatter/gather is better than this onto/off in both naming
and implementation. If you send a v2 and it works for Herve, I'd prefer
scatter/gather. But we can live with onto/off as well.

Thanks,
Yury


Re: [PATCH v4 3/5] lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers

2024-02-22 Thread Yury Norov
On Thu, Feb 22, 2024 at 05:49:59PM +0100, Herve Codina wrote:
> Hi Andy, Yury,
> 
> On Thu, 22 Feb 2024 17:39:27 +0200
> Andy Shevchenko  wrote:
> 
> ...
> > > + * bitmap_scatter() for the bitmap scatter detailed operations).  
> > 
> > > + * Suppose scattered computed using bitmap_scatter(scattered, src, mask, 
> > > n).
> > > + * The operation bitmap_gather(result, scattered, mask, n) leads to a 
> > > result
> > > + * equal or equivalent to src.  
> > 
> > This paragraph...
> > 
> > > + * The result can be 'equivalent' because bitmap_scatter() and 
> > > bitmap_gather()
> > > + * are not bijective.  
> > 
> > 
> > > + * The result and src values are equivalent in that sense that a call to
> > > + * bitmap_scatter(res, src, mask, n) and a call to bitmap_scatter(res, 
> > > result,
> > > + * mask, n) will lead to the same res value.  
> > 
> > ...seems duplicating this one.
> > 
> > I would drop the latter one.
> 
> I would like to give details about the 'equivalent' in this scatter/gather 
> case.

If you would like - please do! :)
 
> If Yury is ok, I can drop this last paragraph.

The original bitmap_onto() description is three times longer, and barely
as descriptive. I'm OK with your wording, and especially with the pictures.

Thanks,
Yury


[PATCH 1/2] sched/topology: introduce node_has_cpus() macro

2023-02-21 Thread Yury Norov
Currently, to check whether a NUMA node has CPUs, one has to use the
nr_cpus_node() macro, which ends up calling cpumask_weight(). We can do
better with cpumask_empty(), because the latter can return earlier, as
soon as the first set bit is found.

This patch adds a node_has_cpus() macro to implement that.

Signed-off-by: Yury Norov 
---
 include/linux/topology.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index fea32377f7c7..7e0d8f8f5a39 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -39,9 +39,11 @@
 #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
 #endif
 
+#define node_has_cpus(node) (!cpumask_empty(cpumask_of_node(node)))
+
 #define for_each_node_with_cpus(node)  \
for_each_online_node(node)  \
-   if (nr_cpus_node(node))
+   if (node_has_cpus(node))
 
 int arch_update_cpu_topology(void);
 
-- 
2.34.1



[PATCH 2/2] powerpc: use node_has_cpus() instead of nr_cpus_node()

2023-02-21 Thread Yury Norov
Use node_has_cpus() as a more efficient alternative to nr_cpus_node()
where possible.

Signed-off-by: Yury Norov 
---
 arch/powerpc/platforms/cell/spu_priv1_mmio.c | 2 +-
 arch/powerpc/platforms/cell/spufs/sched.c| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spu_priv1_mmio.c 
b/arch/powerpc/platforms/cell/spu_priv1_mmio.c
index d150e3987304..55b5024b256b 100644
--- a/arch/powerpc/platforms/cell/spu_priv1_mmio.c
+++ b/arch/powerpc/platforms/cell/spu_priv1_mmio.c
@@ -64,7 +64,7 @@ static void cpu_affinity_set(struct spu *spu, int cpu)
u64 target;
u64 route;
 
-   if (nr_cpus_node(spu->node)) {
+   if (node_has_cpus(spu->node)) {
const struct cpumask *spumask = cpumask_of_node(spu->node),
*cpumask = cpumask_of_node(cpu_to_node(cpu));
 
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c 
b/arch/powerpc/platforms/cell/spufs/sched.c
index 99bd027a7f7c..9d29cc2c6bcb 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -154,7 +154,7 @@ void spu_update_sched_info(struct spu_context *ctx)
 
 static int __node_allowed(struct spu_context *ctx, int node)
 {
-   if (nr_cpus_node(node)) {
+   if (node_has_cpus(node)) {
const struct cpumask *mask = cpumask_of_node(node);
 
if (cpumask_intersects(mask, &ctx->cpus_allowed))
-- 
2.34.1



Re: [PATCH v6 4/5] net: wan: fsl_qmc_hdlc: Add runtime timeslots changes support

2024-03-06 Thread Yury Norov
On Wed, Mar 06, 2024 at 09:07:20AM +0100, Herve Codina wrote:
> QMC channels support runtime timeslots changes but nothing is done at
> the QMC HDLC driver to handle these changes.
> 
> Use existing IFACE ioctl in order to configure the timeslots to use.
> 
> Signed-off-by: Herve Codina 
> Reviewed-by: Christophe Leroy 
> Acked-by: Jakub Kicinski 
> Reviewed-by: Andy Shevchenko 
> ---
>  drivers/net/wan/fsl_qmc_hdlc.c | 151 -
>  1 file changed, 150 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c
> index 90063a92209e..31c0f32474a3 100644
> --- a/drivers/net/wan/fsl_qmc_hdlc.c
> +++ b/drivers/net/wan/fsl_qmc_hdlc.c
> @@ -10,6 +10,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -39,6 +40,7 @@ struct qmc_hdlc {
>   struct qmc_hdlc_desc tx_descs[8];
>   unsigned int tx_out;
>   struct qmc_hdlc_desc rx_descs[4];
> + u32 slot_map;
>  };
>  
>  static struct qmc_hdlc *netdev_to_qmc_hdlc(struct net_device *netdev)
> @@ -203,6 +205,144 @@ static netdev_tx_t qmc_hdlc_xmit(struct sk_buff *skb, 
> struct net_device *netdev)
>   return NETDEV_TX_OK;
>  }
>  
> +static int qmc_hdlc_xlate_slot_map(struct qmc_hdlc *qmc_hdlc,
> +u32 slot_map, struct qmc_chan_ts_info 
> *ts_info)
> +{
> + DECLARE_BITMAP(ts_mask_avail, 64);
> + DECLARE_BITMAP(ts_mask, 64);
> + DECLARE_BITMAP(map, 64);
> +
> + /* Tx and Rx available masks must be identical */
> + if (ts_info->rx_ts_mask_avail != ts_info->tx_ts_mask_avail) {
> + dev_err(qmc_hdlc->dev, "tx and rx available timeslots mismatch 
> (0x%llx, 0x%llx)\n",
> + ts_info->rx_ts_mask_avail, ts_info->tx_ts_mask_avail);
> + return -EINVAL;
> + }
> +
> + bitmap_from_u64(ts_mask_avail, ts_info->rx_ts_mask_avail);
> + bitmap_from_u64(map, slot_map);
> + bitmap_scatter(ts_mask, map, ts_mask_avail, 64);

We've got a BITMAP_FROM_U64() for this:

DECLARE_BITMAP(ts_mask_avail, 64) = { 
BITMAP_FROM_U64(ts_info->rx_ts_mask_avail) };
DECLARE_BITMAP(map, 64) = { BITMAP_FROM_U64(slot_map) };

> +
> + if (bitmap_weight(ts_mask, 64) != bitmap_weight(map, 64)) {
> + dev_err(qmc_hdlc->dev, "Cannot translate timeslots %64pb -> 
> (%64pb, %64pb)\n",
> + map, ts_mask_avail, ts_mask);
> + return -EINVAL;
> + }
> +
> + bitmap_to_arr64(&ts_info->tx_ts_mask, ts_mask, 64);
> + ts_info->rx_ts_mask = ts_info->tx_ts_mask;
> + return 0;
> +}
> +
> +static int qmc_hdlc_xlate_ts_info(struct qmc_hdlc *qmc_hdlc,
> +   const struct qmc_chan_ts_info *ts_info, u32 
> *slot_map)
> +{
> + DECLARE_BITMAP(ts_mask_avail, 64);
> + DECLARE_BITMAP(ts_mask, 64);
> + DECLARE_BITMAP(map, 64);
> + u32 array32[2];

NIT. Bad name. I'd suggest slot_array, or something.

> + /* Tx and Rx masks and available masks must be identical */
> + if (ts_info->rx_ts_mask_avail != ts_info->tx_ts_mask_avail) {
> + dev_err(qmc_hdlc->dev, "tx and rx available timeslots mismatch 
> (0x%llx, 0x%llx)\n",
> + ts_info->rx_ts_mask_avail, ts_info->tx_ts_mask_avail);
> + return -EINVAL;
> + }
> + if (ts_info->rx_ts_mask != ts_info->tx_ts_mask) {
> + dev_err(qmc_hdlc->dev, "tx and rx timeslots mismatch (0x%llx, 
> 0x%llx)\n",
> + ts_info->rx_ts_mask, ts_info->tx_ts_mask);
> + return -EINVAL;
> + }
> +
> + bitmap_from_u64(ts_mask_avail, ts_info->rx_ts_mask_avail);
> + bitmap_from_u64(ts_mask, ts_info->rx_ts_mask);

Same as above, can you try using BITMAP_FROM_U64()?

Thanks,
Yury

> + bitmap_gather(map, ts_mask, ts_mask_avail, 64);
> +
> + if (bitmap_weight(ts_mask, 64) != bitmap_weight(map, 64)) {
> + dev_err(qmc_hdlc->dev, "Cannot translate timeslots (%64pb, 
> %64pb) -> %64pb\n",
> + ts_mask_avail, ts_mask, map);
> + return -EINVAL;
> + }
> +
> + bitmap_to_arr32(array32, map, 64);
> + if (array32[1]) {
> + dev_err(qmc_hdlc->dev, "Slot map out of 32bit (%64pb, %64pb) -> 
> %64pb\n",
> + ts_mask_avail, ts_mask, map);
> + return -EINVAL;
> + }
> +
> + *slot_map = array32[0];
> + return 0;
> +}
> +
> +static int qmc_hdlc_set_iface(struct qmc_hdlc *qmc_hdlc, int if_iface, const 
> te1_settings *te1)
> +{
> + struct qmc_chan_ts_info ts_info;
> + int ret;
> +
> + ret = qmc_chan_get_ts_info(qmc_hdlc->qmc_chan, &ts_info);
> + if (ret) {
> + dev_err(qmc_hdlc->dev, "get QMC channel ts info failed %d\n", 
> ret);
> + return ret;
> + }
> + ret = qmc_hdlc_xlate_slot_map(qmc_hdlc, te1->slot_map, &ts_info);
> + if (ret)
> + return ret;
> +
> + ret = qmc_chan_set_ts_info(qmc_hdlc->qmc_c

Re: [PATCH v6 3/5] lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers

2024-03-06 Thread Yury Norov
On Wed, Mar 06, 2024 at 09:07:19AM +0100, Herve Codina wrote:
> From: Andy Shevchenko 
> 
> These helpers scatters or gathers a bitmap with the help of the mask
> position bits parameter.
> 
> bitmap_scatter() does the following:
>   src:  01011010
>   ||
>+--+|
>|  ++
>|  |++|||
>|  ||   +-+||
>|  ||   |  ||
>   mask: ...v..vv...v..vv
> ...0..11...0..10
>   dst:  00110010
> 
> and bitmap_gather() performs this one:
>mask: ...v..vv...v..vv
>src:  00110010
> ^  ^^   ^   0
> |  ||   |  10
> |  ||   > 010
> |  |+--> 1010
> |  +--> 11010
> +> 011010
>dst:  00011010
> 
> bitmap_gather() can the seen as the reverse bitmap_scatter() operation.
> 
> Signed-off-by: Andy Shevchenko 
> Link: 
> https://lore.kernel.org/lkml/20230926052007.3917389-3-andriy.shevche...@linux.intel.com/
> Co-developed-by: Herve Codina 
> Signed-off-by: Herve Codina 

Signed-off-by: Yury Norov 

Would you like to move this with the rest of the series? If so, please
pull my Signed-off-by; otherwise I can move it via bitmap-for-next.

> ---
>  include/linux/bitmap.h | 101 +
>  lib/test_bitmap.c  |  42 +
>  2 files changed, 143 insertions(+)
> 
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 99451431e4d6..049ba20911c5 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -62,6 +62,8 @@ struct device;
>   *  bitmap_shift_left(dst, src, n, nbits)   *dst = *src << n
>   *  bitmap_cut(dst, src, first, n, nbits)   Cut n bits from first, copy 
> rest
>   *  bitmap_replace(dst, old, new, mask, nbits)  *dst = (*old & ~(*mask)) | 
> (*new & *mask)
> + *  bitmap_scatter(dst, src, mask, nbits)*dst = map(dense, sparse)(src)
> + *  bitmap_gather(dst, src, mask, nbits) *dst = map(sparse, dense)(src)
>   *  bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
>   *  bitmap_bitremap(oldbit, old, new, nbits)newbit = map(old, 
> new)(oldbit)
>   *  bitmap_onto(dst, orig, relmap, nbits)   *dst = orig relative to 
> relmap
> @@ -487,6 +489,105 @@ static inline void bitmap_replace(unsigned long *dst,
>   __bitmap_replace(dst, old, new, mask, nbits);
>  }
>  
> +/**
> + * bitmap_scatter - Scatter a bitmap according to the given mask
> + * @dst: scattered bitmap
> + * @src: gathered bitmap
> + * @mask: mask representing bits to assign to in the scattered bitmap
> + * @nbits: number of bits in each of these bitmaps
> + *
> + * Scatters bitmap with sequential bits according to the given @mask.
> + *
> + * Example:
> + * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302.
> + *
> + * Or in binary form
> + * @src  @mask   @dst
> + * 01011010  000100110001001100110010
> + *
> + * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12)
> + *
> + * A more 'visual' description of the operation:
> + * src:  01011010
> + * ||
> + *  +--+|
> + *  |  ++
> + *  |  |++|||
> + *  |  ||   +-+||
> + *  |  ||   |  ||
> + * mask: ...v..vv...v..vv
> + *   ...0..11...0..10
> + * dst:  00110010
> + *
> + * A relationship exists between bitmap_scatter() and bitmap_gather().
> + * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation.
> + * See bitmap_scatter() for details related to this relationship.
> + */
> +static inline void bitmap_scatter(unsigned long *dst, const unsigned long 
> *src,
> +   const unsigned long *mask, unsigned int nbits)
> +{
> + unsigned int n = 0;
> + unsigned int bit;
> +
> + bitmap_zero(dst, nbits);
> +
> + for_each_set_bit(bit, mask, nbits)
> + __assign_bit(bit, dst, test_bit(n++, src));
> +}
> +
> +/**
> + * bitmap_gather - Gather a bitmap according to given mask
> + * @dst: gathered bitmap
> + * @src: scattered bitmap
> + * @mask: mask representing bits to extract from in the scattered bitmap
> + * @nbits: number of bits in each of these bitmaps
> + *
> + * Gathers bitmap with sparse bits according to the given @mask.
> + *
> + * Example:
> + * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a.
> + *
> + * Or in binary form
> + * @src  @mask   @dst
> + * 00110010  00

Re: [PATCH v6 1/5] net: wan: Add support for QMC HDLC

2024-03-06 Thread Yury Norov
On Wed, Mar 06, 2024 at 09:07:17AM +0100, Herve Codina wrote:
> The QMC HDLC driver provides support for HDLC using the QMC (QUICC
> Multichannel Controller) to transfer the HDLC data.
> 
> Signed-off-by: Herve Codina 
> Reviewed-by: Christophe Leroy 
> Acked-by: Jakub Kicinski 
> Reviewed-by: Andy Shevchenko 
> ---
>  drivers/net/wan/Kconfig|  12 +
>  drivers/net/wan/Makefile   |   1 +
>  drivers/net/wan/fsl_qmc_hdlc.c | 413 +
>  3 files changed, 426 insertions(+)
>  create mode 100644 drivers/net/wan/fsl_qmc_hdlc.c
> 
> diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig
> index 7dda87756d3f..31ab2136cdf1 100644
> --- a/drivers/net/wan/Kconfig
> +++ b/drivers/net/wan/Kconfig
> @@ -197,6 +197,18 @@ config FARSYNC
> To compile this driver as a module, choose M here: the
> module will be called farsync.
>  
> +config FSL_QMC_HDLC
> + tristate "Freescale QMC HDLC support"
> + depends on HDLC
> + depends on CPM_QMC
> + help
> +   HDLC support using the Freescale QUICC Multichannel Controller (QMC).
> +
> +   To compile this driver as a module, choose M here: the
> +   module will be called fsl_qmc_hdlc.
> +
> +   If unsure, say N.
> +
>  config FSL_UCC_HDLC
>   tristate "Freescale QUICC Engine HDLC support"
>   depends on HDLC
> diff --git a/drivers/net/wan/Makefile b/drivers/net/wan/Makefile
> index 8119b49d1da9..00e9b7ee1e01 100644
> --- a/drivers/net/wan/Makefile
> +++ b/drivers/net/wan/Makefile
> @@ -25,6 +25,7 @@ obj-$(CONFIG_WANXL) += wanxl.o
>  obj-$(CONFIG_PCI200SYN)  += pci200syn.o
>  obj-$(CONFIG_PC300TOO)   += pc300too.o
>  obj-$(CONFIG_IXP4XX_HSS) += ixp4xx_hss.o
> +obj-$(CONFIG_FSL_QMC_HDLC)   += fsl_qmc_hdlc.o
>  obj-$(CONFIG_FSL_UCC_HDLC)   += fsl_ucc_hdlc.o
>  obj-$(CONFIG_SLIC_DS26522)   += slic_ds26522.o
>  
> diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c
> new file mode 100644
> index ..90063a92209e
> --- /dev/null
> +++ b/drivers/net/wan/fsl_qmc_hdlc.c
> @@ -0,0 +1,413 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Freescale QMC HDLC Device Driver
> + *
> + * Copyright 2023 CS GROUP France
> + *
> + * Author: Herve Codina 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +struct qmc_hdlc_desc {
> + struct net_device *netdev;
> + struct sk_buff *skb; /* NULL if the descriptor is not in use */
> + dma_addr_t dma_addr;
> + size_t dma_size;
> +};
> +
> +struct qmc_hdlc {
> + struct device *dev;
> + struct qmc_chan *qmc_chan;
> + struct net_device *netdev;
> + bool is_crc32;
> + spinlock_t tx_lock; /* Protect tx descriptors */
> + struct qmc_hdlc_desc tx_descs[8];
> + unsigned int tx_out;
> + struct qmc_hdlc_desc rx_descs[4];
> +};
> +
> +static struct qmc_hdlc *netdev_to_qmc_hdlc(struct net_device *netdev)
> +{
> + return dev_to_hdlc(netdev)->priv;
> +}
> +
> +static int qmc_hdlc_recv_queue(struct qmc_hdlc *qmc_hdlc, struct 
> qmc_hdlc_desc *desc, size_t size);
> +
> +#define QMC_HDLC_RX_ERROR_FLAGS  \
> + (QMC_RX_FLAG_HDLC_OVF | QMC_RX_FLAG_HDLC_UNA |  \
> +  QMC_RX_FLAG_HDLC_CRC | QMC_RX_FLAG_HDLC_ABORT)
> +
> +static void qmc_hcld_recv_complete(void *context, size_t length, unsigned 
> int flags)
> +{
> + struct qmc_hdlc_desc *desc = context;
> + struct net_device *netdev = desc->netdev;
> + struct qmc_hdlc *qmc_hdlc = netdev_to_qmc_hdlc(netdev);
> + int ret;
> +
> + dma_unmap_single(qmc_hdlc->dev, desc->dma_addr, desc->dma_size, 
> DMA_FROM_DEVICE);
> +
> + if (flags & QMC_HDLC_RX_ERROR_FLAGS) {
> + netdev->stats.rx_errors++;
> + if (flags & QMC_RX_FLAG_HDLC_OVF) /* Data overflow */
> + netdev->stats.rx_over_errors++;
> + if (flags & QMC_RX_FLAG_HDLC_UNA) /* bits received not multiple 
> of 8 */
> + netdev->stats.rx_frame_errors++;
> + if (flags & QMC_RX_FLAG_HDLC_ABORT) /* Received an abort 
> sequence */
> + netdev->stats.rx_frame_errors++;
> + if (flags & QMC_RX_FLAG_HDLC_CRC) /* CRC error */
> + netdev->stats.rx_crc_errors++;

It's minor, but you can avoid the conditionals by doing something like:

netdev->stats.rx_over_errors += !!(flags & 
QMC_RX_FLAG_HDLC_OVF);

Thanks,
Yury

> + kfree_skb(desc->skb);
> + } else {
> + netdev->stats.rx_packets++;
> + netdev->stats.rx_bytes += length;
> +
> + skb_put(desc->skb, length);
> + desc->skb->protocol = hdlc_type_trans(desc->skb, netdev);
> + netif_rx(desc->skb);
> + }
> +
> + /* Re-queue a transfer using the same descriptor */
> + ret = qmc_hdlc_

Re: [PATCH v6 3/5] lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers

2024-03-07 Thread Yury Norov
On Thu, Mar 7, 2024, 2:31 AM Herve Codina  wrote:

> Hi Yury,
>
> On Wed, 6 Mar 2024 15:39:06 +0200
> Andy Shevchenko  wrote:
>
> > On Wed, Mar 06, 2024 at 05:11:19AM -0800, Yury Norov wrote:
> > > On Wed, Mar 06, 2024 at 09:07:19AM +0100, Herve Codina wrote:
> >
> > ...
> >
> > > Signed-off-by: Yury Norov 
> >
> > Why? Shouldn't be Acked-by?
> >
> > > Would you like to move this with the rest of the series? If so please
> > > pull my Sof-by, otherwise I can move it with bitmap-for-next.
> >
>
> A new iteration of the series is planned.
> Yury, may I add your Acked-by in the next iteration ?
>
> Best regards,
> Hervé
>

Please scratch my sign-off and add ack.

>


Re: [PATCH v3 2/2] x86: Fix /proc/cpuinfo cpumask warning

2022-11-03 Thread yury . norov
On Thu, Nov 03, 2022 at 04:34:04PM +0100, Andrew Jones wrote:
> On Thu, Nov 03, 2022 at 04:02:12PM +0100, Borislav Petkov wrote:
> > On Thu, Nov 03, 2022 at 01:59:45PM +0100, Andrew Jones wrote:
> > > The patch I'm proposing ensures cpumask_next()'s range, which is actually
> > > [-1, nr_cpus_ids - 1),
> > 
> > Lemme make sure I understand it correctly: on the upper boundary, if you
> > supply for n the value nr_cpu_ids - 2, then it will return potentially
> > the last bit if the mask is set, i.e., the one at position (nr_cpu_ids - 1).
> > 
> > If you supply nr_cpus_ids - 1, then it'll return nr_cpu_ids to signal no
> > further bits set.
> > 
> > Yes, no?
> 
> Yes
> 
> > 
> > > I'll send a v4 with another stab at the commit message.
> > 
> > Yes, and it is still an unreadable mess: "A kernel compiled with commit
> > ... but not its revert... " Nope.
> > 
> > First make sure cpumask_next()'s valid accepted range has been settled
> > upon, has been explicitly documented in a comment above it and then I'll
> > take a patch that fixes whatever is there to fix.
> 
> That's fair, but I'll leave that to Yury.

I'll take care of it.
 
> > Callers should not have to filter values before passing them in - the
> > function either returns an error or returns the next bit in the mask.
> 
> That's reasonable, but cpumask folk probably need to discuss it because
> not all cpumask functions have a return value where an error may be
> placed.
 
Callers should pass sane arguments into internal functions if they
expect sane output. An API not exported to userspace shouldn't
sanity-check all input arguments. For example, cpumask_next() doesn't
check srcp for NULL.

However, the cpumask API is exposed to drivers, and that's probably why
the optional cpumask_check() exists. (It was added long before I took
over this.)

The current *generic* implementation guarantees that an out-of-range
offset prevents cpumask_next() from dereferencing srcp and makes it
return nr_cpu_ids. This behavior is expected by many callers. However,
there are a couple of non-generic cpumask implementations, and one of
them is written in assembler. So portable code shouldn't expect more
from cpumasks than the documentation states: for a _valid_ offset,
cpumask_next() returns the next set bit or a value >= nr_cpu_ids.
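
In other words, a portable caller should only rely on something like
this (sketch; do_something() is a placeholder):

	/* for a valid offset, the result is either the next set bit
	 * in the mask or some value >= nr_cpu_ids */
	cpu = cpumask_next(n, mask);
	if (cpu >= nr_cpu_ids)
		return;		/* no more CPUs */
	do_something(cpu);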

cpumask_check() has been broken for years. Attempting to fix it faced
so much resistance that I had to revert the patch. Now there's an
ongoing discussion about whether we need this check at all. My opinion
is that if all implementations of cpumask (more precisely, of the
underlying bitmap API) are safe against out-of-range offsets, we can
simply remove cpumask_check(). Users like cpuinfo that waste time on a
useless last iteration will bear that cost themselves.
 
Thanks,
Yury


Re: [PATCH v3 2/2] x86: Fix /proc/cpuinfo cpumask warning

2022-11-03 Thread Yury Norov
On Thu, Nov 03, 2022 at 05:49:06PM +0100, Borislav Petkov wrote:
> On Thu, Nov 03, 2022 at 09:30:54AM -0700, yury.no...@gmail.com wrote:a
> > Callers should pass sane arguments into internal functions if they
> > expect sane output.
> 
> What internal function? It's in a global header.
> 
> > The API not exported to userspace shouldn't sanity-check all inputs
> > arguments.
> 
> That doesn't have anything to do with userspace at all.
> 
> APIs exported to the rest of the kernel should very well check their
> inputs. Otherwise they're not APIs - just some random functions which
> are visible to the compiler.

Let's take for example cpu_llc_shared_mask() added by you in
arch/x86/include/asm/smp.h recently:

  static inline struct cpumask *cpu_llc_shared_mask(int cpu)
  {
 return per_cpu(cpu_llc_shared_map, cpu);
  }

It's in a global header and available to the rest of the kernel just as
well. How does it check its input? Maybe I missed something important in
the per_cpu() internals, but at first glance there's no protection
against -1, nr_cpu_ids, or other out-of-range arguments.


Re: [PATCH v4 1/1] x86: cpuinfo: Ensure inputs to cpumask_next are valid

2022-11-10 Thread Yury Norov
On Thu, Nov 03, 2022 at 03:25:04PM +0100, Andrew Jones wrote:
> The valid cpumask range is [0, nr_cpu_ids) and cpumask_next()
> currently calls find_next_bit() with its input CPU ID number plus one
> for the bit number, giving cpumask_next() the range [-1, nr_cpu_ids - 1).
> seq_read_iter() and cpuinfo's start and next seq operations implement a
> pattern like
> 
>   n = cpumask_next(n - 1, mask);
>   show(n);
>   while (1) {
>   ++n;
>   n = cpumask_next(n - 1, mask);
>   if (n >= nr_cpu_ids)
>   break;
>   show(n);
>   }
> 
> which will eventually result in cpumask_next() being called with
> nr_cpu_ids - 1. A kernel compiled with commit 78e5a3399421 ("cpumask:
> fix checking valid cpu range"), but not its revert, commit
> 80493877d7d0 ("Revert "cpumask: fix checking valid cpu range"."),
> will generate a warning when DEBUG_PER_CPU_MAPS is enabled each time
> /proc/cpuinfo is read. Future-proof cpuinfo by checking its input to
> cpumask_next() is valid.
> 
> Signed-off-by: Andrew Jones 
> Cc: Yury Norov 

Reviewed-by: Yury Norov 

> ---
>  arch/x86/kernel/cpu/proc.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
> index 099b6f0d96bd..de3f93ac6e49 100644
> --- a/arch/x86/kernel/cpu/proc.c
> +++ b/arch/x86/kernel/cpu/proc.c
> @@ -153,6 +153,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
>  
>  static void *c_start(struct seq_file *m, loff_t *pos)
>  {
> + if (*pos == nr_cpu_ids)
> + return NULL;
> +
>   *pos = cpumask_next(*pos - 1, cpu_online_mask);
>   if ((*pos) < nr_cpu_ids)
>   return &cpu_data(*pos);
> -- 
> 2.37.3


linux-next: build failure on power pc

2021-01-20 Thread Yury Norov
Hi all,

I found the powerpc build broken on today's
linux-next (647060f3b592).

My compiler is:

yury:linux$ powerpc-linux-gnu-gcc --version
powerpc-linux-gnu-gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

The config and error log are attached.

Thanks,
Yury


ppc.tar.gz
Description: application/gzip


Re: [PATCH] powerpc: fix AKEBONO build failures

2021-01-20 Thread Yury Norov
On Wed, Jan 20, 2021 at 10:10 PM Randy Dunlap  wrote:
>
> On 1/20/21 1:29 PM, Yury Norov wrote:
> > Hi all,
> >
> > I found the power pc build broken on today's
> > linux-next (647060f3b592).
>
> Darn, I was building linux-5.11-rc4.
>
> I'll try linux-next after I send this.
>
> ---
> From: Randy Dunlap 
>
> Fulfill AKEBONO Kconfig requirements.
>
> Fixes these Kconfig warnings (and more) and fixes the subsequent
> build errors:
>
> WARNING: unmet direct dependencies detected for NETDEVICES
>   Depends on [n]: NET [=n]
>   Selected by [y]:
>   - AKEBONO [=y] && PPC_47x [=y]
>
> WARNING: unmet direct dependencies detected for MMC_SDHCI
>   Depends on [n]: MMC [=n] && HAS_DMA [=y]
>   Selected by [y]:
>   - AKEBONO [=y] && PPC_47x [=y]
>
> Signed-off-by: Randy Dunlap 
> Cc: Michael Ellerman 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Yury Norov 
> ---
>  arch/powerpc/platforms/44x/Kconfig |2 ++
>  1 file changed, 2 insertions(+)
>
> --- lnx-511-rc4.orig/arch/powerpc/platforms/44x/Kconfig
> +++ lnx-511-rc4/arch/powerpc/platforms/44x/Kconfig
> @@ -206,6 +206,7 @@ config AKEBONO
> select PPC4xx_HSTA_MSI
> select I2C
> select I2C_IBM_IIC
> +   select NET
> select NETDEVICES
> select ETHERNET
> select NET_VENDOR_IBM
> @@ -213,6 +214,7 @@ config AKEBONO
> select USB if USB_SUPPORT
> select USB_OHCI_HCD_PLATFORM if USB_OHCI_HCD
> select USB_EHCI_HCD_PLATFORM if USB_EHCI_HCD
> +   select MMC
> select MMC_SDHCI
> select MMC_SDHCI_PLTFM
> select ATA

Looks to be working, thanks.

Tested-by: Yury Norov 


[PATCH] powerpc: restore current_thread_info()

2019-05-07 Thread Yury Norov
Commit ed1cd6deb013 ("powerpc: Activate CONFIG_THREAD_INFO_IN_TASK")
removes the function current_thread_info(). This is wrong because the
function is used in non-arch code and is part of the API.

For my arm64/ilp32 series, after applying the patch
https://github.com/norov/linux/commit/b269e51eee66ffec3008a3effb12363b91754e49
it causes a build break.

This patch restores current_thread_info().

Signed-off-by: Yury Norov 
---
 arch/powerpc/include/asm/thread_info.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/include/asm/thread_info.h 
b/arch/powerpc/include/asm/thread_info.h
index 8e1d0195ac36..f700bc80a607 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -19,6 +19,7 @@
 
 #ifndef __ASSEMBLY__
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -57,6 +58,11 @@ struct thread_info {
 #define THREAD_SIZE_ORDER  (THREAD_SHIFT - PAGE_SHIFT)
 
 /* how to get the thread information struct from C */
+static inline struct thread_info *current_thread_info(void)
+{
+   return (struct thread_info *)current;
+}
+
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct 
*src);
 
 #ifdef CONFIG_PPC_BOOK3S_64
-- 
2.17.1



Re: [PATCH] powerpc: restore current_thread_info()

2019-05-07 Thread Yury Norov
On Tue, May 07, 2019 at 11:58:56PM +0100, Al Viro wrote:
> On Tue, May 07, 2019 at 03:51:21PM -0700, Yury Norov wrote:
> > Commit ed1cd6deb013 ("powerpc: Activate CONFIG_THREAD_INFO_IN_TASK")
> > removes the function current_thread_info(). It's wrong because the
> > function is used in non-arch code and is part of API.
> 
> In include/linux/thread_info.h:
> 
> #ifdef CONFIG_THREAD_INFO_IN_TASK
> /*
>  * For CONFIG_THREAD_INFO_IN_TASK kernels we need  for the
>  * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
>  * including  can cause a circular dependency on some 
> platforms.
>  */
> #include 
> #define current_thread_info() ((struct thread_info *)current)
> #endif

Ah, sorry. Then it might be a rebase issue on my side. I was confused because
Christophe didn't remove the comment for current_thread_info(), so I assumed
he had removed the function erroneously.


[PATCH 2/9] lib/bitmap: implement bitmap_{empty, full} with bitmap_weight_eq()

2021-11-27 Thread Yury Norov
Now that we have bitmap_weight_eq(), switch bitmap_full() and
bitmap_empty() to using it.

Signed-off-by: Yury Norov 
---
 include/linux/bitmap.h | 26 ++
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 996041f771c8..2d951e4dc814 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -386,22 +386,6 @@ static inline int bitmap_subset(const unsigned long *src1,
return __bitmap_subset(src1, src2, nbits);
 }
 
-static inline bool bitmap_empty(const unsigned long *src, unsigned nbits)
-{
-   if (small_const_nbits(nbits))
-   return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
-
-   return find_first_bit(src, nbits) == nbits;
-}
-
-static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
-{
-   if (small_const_nbits(nbits))
-   return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
-
-   return find_first_zero_bit(src, nbits) == nbits;
-}
-
 static __always_inline int bitmap_weight(const unsigned long *src, unsigned 
int nbits)
 {
if (small_const_nbits(nbits))
@@ -436,6 +420,16 @@ static __always_inline bool bitmap_weight_le(const 
unsigned long *src,
return __bitmap_weight_le(src, nbits, num);
 }
 
+static __always_inline bool bitmap_empty(const unsigned long *src, unsigned 
int nbits)
+{
+   return bitmap_weight_eq(src, nbits, 0);
+}
+
+static __always_inline bool bitmap_full(const unsigned long *src, unsigned int 
nbits)
+{
+   return bitmap_weight_eq(src, nbits, nbits);
+}
+
 static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
unsigned int nbits)
 {
-- 
2.25.1



[PATCH 1/9] lib/bitmap: add bitmap_weight_{eq,gt,le}

2021-11-27 Thread Yury Norov
Many kernel users call bitmap_weight() only to compare the result against
some number or expression:

	if (bitmap_weight(...) > 1)
		do_something();

It works, but for large bitmaps it can be significantly improved: if the
first few words already count more set bits than the given number, we can
stop counting and return immediately.

The same idea works in the other direction: if the number of set bits
counted so far is so small that it cannot reach the required number even
if every remaining bit were set, we can return early as well.

This patch adds new bitmap_weight_{eq,gt,le} functions to allow this
optimization, and the following patches apply them where appropriate.
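
For illustration, the early-exit idea in simplified, userspace-style C.
This is a sketch only, not the kernel implementation from the diff below;
it assumes 64-bit longs and the GCC/Clang __builtin_popcountl() builtin:

#include <stdbool.h>

#define BITS_PER_LONG 64UL

/* Do the first 'bits' bits of 'bitmap' contain more than 'num' set bits? */
static bool weight_gt(const unsigned long *bitmap, unsigned long bits,
		      unsigned long num)
{
	unsigned long k, w = 0, lim = bits / BITS_PER_LONG;

	for (k = 0; k < lim; k++) {
		/* Even if every remaining bit were set, num could not be exceeded. */
		if (w + bits - k * BITS_PER_LONG <= num)
			return false;

		w += __builtin_popcountl(bitmap[k]);

		/* Already above num: no need to look at the rest. */
		if (w > num)
			return true;
	}

	/* Count the partial tail word, if any. */
	if (bits % BITS_PER_LONG)
		w += __builtin_popcountl(bitmap[k] &
					 ((1UL << (bits % BITS_PER_LONG)) - 1));

	return w > num;
}

The kernel versions below follow the same structure, using hweight_long()
and BITMAP_LAST_WORD_MASK() instead of the builtins.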

Signed-off-by: Yury Norov 
---
 include/linux/bitmap.h | 33 ++
 lib/bitmap.c   | 63 ++
 2 files changed, 96 insertions(+)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 7dba0847510c..996041f771c8 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -51,6 +51,9 @@ struct device;
  *  bitmap_empty(src, nbits)Are all bits zero in *src?
  *  bitmap_full(src, nbits) Are all bits set in *src?
  *  bitmap_weight(src, nbits)   Hamming Weight: number set bits
+ *  bitmap_weight_eq(src, nbits, num)   Hamming Weight is equal to num
+ *  bitmap_weight_gt(src, nbits, num)   Hamming Weight is greater than 
num
+ *  bitmap_weight_le(src, nbits, num)   Hamming Weight is less than num
  *  bitmap_set(dst, pos, nbits) Set specified bit area
  *  bitmap_clear(dst, pos, nbits)   Clear specified bit area
  *  bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
@@ -162,6 +165,9 @@ int __bitmap_intersects(const unsigned long *bitmap1,
 int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
 int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
+bool __bitmap_weight_eq(const unsigned long *bitmap, unsigned int nbits, 
unsigned int num);
+bool __bitmap_weight_gt(const unsigned long *bitmap, unsigned int nbits, 
unsigned int num);
+bool __bitmap_weight_le(const unsigned long *bitmap, unsigned int nbits, 
unsigned int num);
 void __bitmap_set(unsigned long *map, unsigned int start, int len);
 void __bitmap_clear(unsigned long *map, unsigned int start, int len);
 
@@ -403,6 +409,33 @@ static __always_inline int bitmap_weight(const unsigned 
long *src, unsigned int
return __bitmap_weight(src, nbits);
 }
 
+static __always_inline bool bitmap_weight_eq(const unsigned long *src,
+   unsigned int nbits, unsigned int num)
+{
+   if (small_const_nbits(nbits))
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)) == num;
+
+   return __bitmap_weight_eq(src, nbits, num);
+}
+
+static __always_inline bool bitmap_weight_gt(const unsigned long *src,
+   unsigned int nbits, unsigned int num)
+{
+   if (small_const_nbits(nbits))
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)) > num;
+
+   return __bitmap_weight_gt(src, nbits, num);
+}
+
+static __always_inline bool bitmap_weight_le(const unsigned long *src,
+   unsigned int nbits, unsigned int num)
+{
+   if (small_const_nbits(nbits))
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)) < num;
+
+   return __bitmap_weight_le(src, nbits, num);
+}
+
 static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
unsigned int nbits)
 {
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 926408883456..72e7ab2d7bdd 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -348,6 +348,69 @@ int __bitmap_weight(const unsigned long *bitmap, unsigned 
int bits)
 }
 EXPORT_SYMBOL(__bitmap_weight);
 
+bool __bitmap_weight_eq(const unsigned long *bitmap, unsigned int bits, 
unsigned int num)
+{
+   unsigned int k, w, lim = bits / BITS_PER_LONG;
+
+   for (k = 0, w = 0; k < lim; k++) {
+   if (w + bits - k * BITS_PER_LONG < num)
+   return false;
+
+   w += hweight_long(bitmap[k]);
+
+   if (w > num)
+   return false;
+   }
+
+   if (bits % BITS_PER_LONG)
+   w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
+
+   return w == num;
+}
+EXPORT_SYMBOL(__bitmap_weight_eq);
+
+bool __bitmap_weight_gt(const unsigned long *bitmap, unsigned int bits, 
unsigned int num)
+{
+   unsigned int k, w, lim = bits / BITS_PER_LONG;
+
+   for (k = 0, w = 0; k < lim; k++) {
+   if (w + bits - k * BITS_PER_LONG <= num)
+   return false;
+
+   w += hweight_long(bitmap[k]);
+
+   if (w > num)
+ 

[PATCH 0/9] lib/bitmap: optimize bitmap_weight() usage

2021-11-27 Thread Yury Norov
In many cases people use bitmap_weight()-based functions like this:

	if (num_present_cpus() > 1)
		do_something();

This may take a considerable amount of time on many-CPU machines because
num_present_cpus() traverses every word of the underlying cpumask
unconditionally.

For many real cases we can do significantly better if we stop traversing
the mask as soon as the count of present CPUs exceeds 1:

	if (num_present_cpus_gt(1))
		do_something();

To implement this idea, the series adds bitmap_weight_{eq,gt,le}
functions together with corresponding wrappers in cpumask and nodemask.

Yury Norov (9):
  lib/bitmap: add bitmap_weight_{eq,gt,le}
  lib/bitmap: implement bitmap_{empty,full} with bitmap_weight_eq()
  all: replace bitmap_weight() with bitmap_{empty,full,eq,gt,le}
  tools: sync bitmap_weight() usage with the kernel
  lib/cpumask: add cpumask_weight_{eq,gt,le}
  lib/nodemask: add nodemask_weight_{eq,gt,le}
  lib/cpumask: add num_{possible,present,active}_cpus_{eq,gt,le}
  lib/nodemask: add num_node_state_eq()
  MAINTAINERS: add cpumask and nodemask files to BITMAP_API

 MAINTAINERS   |  4 ++
 arch/alpha/kernel/process.c   |  2 +-
 arch/arc/kernel/smp.c |  2 +-
 arch/arm/kernel/machine_kexec.c   |  2 +-
 arch/arm/mach-exynos/exynos.c |  2 +-
 arch/arm/mm/cache-b15-rac.c   |  2 +-
 arch/arm64/kernel/smp.c   |  2 +-
 arch/arm64/mm/context.c   |  2 +-
 arch/csky/mm/asid.c   |  2 +-
 arch/csky/mm/context.c|  2 +-
 arch/ia64/kernel/setup.c  |  2 +-
 arch/ia64/mm/tlb.c|  8 +--
 arch/mips/cavium-octeon/octeon-irq.c  |  4 +-
 arch/mips/kernel/crash.c  |  2 +-
 arch/mips/kernel/i8253.c  |  2 +-
 arch/mips/kernel/perf_event_mipsxx.c  |  4 +-
 arch/mips/kernel/rtlx-cmp.c   |  2 +-
 arch/mips/kernel/smp.c|  4 +-
 arch/mips/kernel/vpe-cmp.c|  2 +-
 .../loongson2ef/common/cs5536/cs5536_mfgpt.c  |  2 +-
 arch/mips/mm/context.c|  2 +-
 arch/mips/mm/tlbex.c  |  2 +-
 arch/nds32/kernel/perf_event_cpu.c|  4 +-
 arch/nios2/kernel/cpuinfo.c   |  2 +-
 arch/powerpc/kernel/smp.c |  2 +-
 arch/powerpc/kernel/watchdog.c|  4 +-
 arch/powerpc/platforms/85xx/smp.c |  2 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |  4 +-
 arch/powerpc/sysdev/mpic.c|  2 +-
 arch/powerpc/xmon/xmon.c  | 10 +--
 arch/riscv/kvm/vmid.c |  2 +-
 arch/s390/kernel/perf_cpum_cf.c   |  2 +-
 arch/sparc/kernel/mdesc.c |  6 +-
 arch/x86/events/amd/core.c|  2 +-
 arch/x86/kernel/alternative.c |  8 +--
 arch/x86/kernel/apic/apic.c   |  4 +-
 arch/x86/kernel/apic/apic_flat_64.c   |  2 +-
 arch/x86/kernel/apic/probe_32.c   |  2 +-
 arch/x86/kernel/cpu/mce/dev-mcelog.c  |  2 +-
 arch/x86/kernel/cpu/resctrl/rdtgroup.c| 18 +++---
 arch/x86/kernel/hpet.c|  2 +-
 arch/x86/kernel/i8253.c   |  2 +-
 arch/x86/kernel/kvm.c |  2 +-
 arch/x86/kernel/kvmclock.c|  2 +-
 arch/x86/kernel/smpboot.c |  4 +-
 arch/x86/kernel/tsc.c |  2 +-
 arch/x86/kvm/hyperv.c |  8 +--
 arch/x86/mm/amdtopology.c |  2 +-
 arch/x86/mm/mmio-mod.c|  2 +-
 arch/x86/mm/numa_emulation.c  |  4 +-
 arch/x86/platform/uv/uv_nmi.c |  2 +-
 arch/x86/xen/smp_pv.c |  2 +-
 arch/x86/xen/spinlock.c   |  2 +-
 drivers/acpi/numa/srat.c  |  2 +-
 drivers/clk/samsung/clk-exynos4.c |  2 +-
 drivers/clocksource/ingenic-timer.c   |  3 +-
 drivers/cpufreq/pcc-cpufreq.c |  2 +-
 drivers/cpufreq/qcom-cpufreq-hw.c |  2 +-
 drivers/cpufreq/scmi-cpufreq.c|  2 +-
 drivers/crypto/ccp/ccp-dev-v5.c   |  5 +-
 drivers/dma/mv_xor.c  |  5 +-
 drivers/firmware/psci/psci_checker.c  |  2 +-
 drivers/gpu/drm/i810/i810_drv.c   |  2 +-
 drivers/gpu/drm/i915/i915_pmu.c   |  2 +-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c  |  2 +-
 drivers/hv/channel_mgmt.c |  4 +-
 drivers/iio/adc/mxs-lradc-adc.c   |  3 +-
 drivers/iio/dummy/iio_simple_dummy_buffer.c   |  4 +-
 drivers/iio/industrialio-buffer.c |  2 +-
 drivers/iio/industrialio-trigge

[PATCH 3/9] all: replace bitmap_weight() with bitmap_{empty, full, eq, gt, le}

2021-11-27 Thread Yury Norov
bitmap_weight() counts all set bits in the bitmap unconditionally.
However, in some cases we can traverse only part of the bitmap, when all
we need to know is whether the number of set bits is greater than, less
than or equal to some number.

This patch replaces bitmap_weight() with one of
bitmap_{empty,full,eq,gt,le}, as appropriate.

In some places the driver code has been optimized further, where it was
trivial to do so.

Signed-off-by: Yury Norov 
---
 arch/nds32/kernel/perf_event_cpu.c |  4 +---
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |  4 ++--
 arch/x86/kvm/hyperv.c  |  8 
 drivers/crypto/ccp/ccp-dev-v5.c|  5 +
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c   |  2 +-
 drivers/iio/adc/mxs-lradc-adc.c|  3 +--
 drivers/iio/dummy/iio_simple_dummy_buffer.c|  4 ++--
 drivers/iio/industrialio-buffer.c  |  2 +-
 drivers/iio/industrialio-trigger.c |  2 +-
 drivers/memstick/core/ms_block.c   |  4 ++--
 drivers/net/dsa/b53/b53_common.c   |  2 +-
 drivers/net/ethernet/broadcom/bcmsysport.c |  6 +-
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c   |  4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c |  2 +-
 .../ethernet/marvell/octeontx2/nic/otx2_ethtool.c  |  2 +-
 .../ethernet/marvell/octeontx2/nic/otx2_flows.c|  8 
 .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx4/cmd.c   | 10 +++---
 drivers/net/ethernet/mellanox/mlx4/eq.c|  4 ++--
 drivers/net/ethernet/mellanox/mlx4/main.c  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c  |  3 +--
 drivers/net/ethernet/qlogic/qed/qed_rdma.c |  4 ++--
 drivers/net/ethernet/qlogic/qed/qed_roce.c |  2 +-
 drivers/perf/arm-cci.c |  2 +-
 drivers/perf/arm_pmu.c |  4 ++--
 drivers/perf/hisilicon/hisi_uncore_pmu.c   |  2 +-
 drivers/perf/thunderx2_pmu.c   |  3 +--
 drivers/perf/xgene_pmu.c   |  2 +-
 drivers/pwm/pwm-pca9685.c  |  2 +-
 drivers/staging/media/tegra-video/vi.c |  2 +-
 drivers/thermal/intel/intel_powerclamp.c   | 10 --
 fs/ocfs2/cluster/heartbeat.c   | 14 +++---
 33 files changed, 57 insertions(+), 75 deletions(-)

diff --git a/arch/nds32/kernel/perf_event_cpu.c 
b/arch/nds32/kernel/perf_event_cpu.c
index a78a879e7ef1..05a1cd258356 100644
--- a/arch/nds32/kernel/perf_event_cpu.c
+++ b/arch/nds32/kernel/perf_event_cpu.c
@@ -695,10 +695,8 @@ static void nds32_pmu_enable(struct pmu *pmu)
 {
struct nds32_pmu *nds32_pmu = to_nds32_pmu(pmu);
struct pmu_hw_events *hw_events = nds32_pmu->get_hw_events();
-   int enabled = bitmap_weight(hw_events->used_mask,
-   nds32_pmu->num_events);
 
-   if (enabled)
+   if (!bitmap_empty(hw_events->used_mask, nds32_pmu->num_events))
nds32_pmu->start(nds32_pmu);
 }
 
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c 
b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index b57b3db9a6a7..94e7e6b420e4 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2749,10 +2749,10 @@ static int __init_one_rdt_domain(struct rdt_domain *d, 
struct resctrl_schema *s,
cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
/*
 * Assign the u32 CBM to an unsigned long to ensure that
-* bitmap_weight() does not access out-of-bound memory.
+* bitmap_weight_le() does not access out-of-bound memory.
 */
tmp_cbm = cfg->new_ctrl;
-   if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
+   if (bitmap_weight_le(&tmp_cbm, r->cache.cbm_len, r->cache.min_cbm_bits) 
{
rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
return -ENOSPC;
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 5e19e6e4c2ce..8b72c896e0f1 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -90,7 +90,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic 
*synic,
 {
struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
-   int auto_eoi_old, auto_eoi_new;
+   bool auto_eoi_old, auto_eoi_new;
 
if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
return;
@@ -100,16 +100,16 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic 
*synic,
else
__clear_bit(vector, synic->vec_bitmap);
 
-   auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
+   auto_eoi_old = bitmap_empty(synic->auto_eoi_bitmap, 256);
 

[PATCH 4/9] tools: sync bitmap_weight() usage with the kernel

2021-11-27 Thread Yury Norov
bitmap_weight() counts all set bits in the bitmap unconditionally.
However, in some cases we can traverse only part of the bitmap, when all
we need to know is whether the number of set bits is greater than, less
than or equal to some number.

This patch adds bitmap_weight_{eq,gt,le}, reimplements bitmap_{empty,full}
and replaces bitmap_weight() where appropriate.

Signed-off-by: Yury Norov 
---
 tools/include/linux/bitmap.h | 42 +++--
 tools/lib/bitmap.c   | 60 
 tools/perf/builtin-c2c.c |  4 +--
 tools/perf/util/pmu.c|  2 +-
 4 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index ea97804d04d4..eb2831f7e5a7 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -12,6 +12,9 @@
unsigned long name[BITS_TO_LONGS(bits)]
 
 int __bitmap_weight(const unsigned long *bitmap, int bits);
+bool __bitmap_weight_eq(const unsigned long *bitmap, unsigned int nbits, 
unsigned int num);
+bool __bitmap_weight_gt(const unsigned long *bitmap, unsigned int nbits, 
unsigned int num);
+bool __bitmap_weight_le(const unsigned long *bitmap, unsigned int nbits, 
unsigned int num);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 const unsigned long *bitmap2, int bits);
 int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
@@ -45,27 +48,48 @@ static inline void bitmap_fill(unsigned long *dst, unsigned 
int nbits)
dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
 }
 
-static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
+static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
 {
if (small_const_nbits(nbits))
-   return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
+   return __bitmap_weight(src, nbits);
+}
 
-   return find_first_bit(src, nbits) == nbits;
+static __always_inline bool bitmap_weight_eq(const unsigned long *src,
+   unsigned int nbits, unsigned int num)
+{
+   if (small_const_nbits(nbits))
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)) == num;
+
+   return __bitmap_weight_eq(src, nbits, num);
 }
 
-static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
+static __always_inline bool bitmap_weight_gt(const unsigned long *src,
+   unsigned int nbits, unsigned int num)
 {
if (small_const_nbits(nbits))
-   return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)) > num;
 
-   return find_first_zero_bit(src, nbits) == nbits;
+   return __bitmap_weight_gt(src, nbits, num);
 }
 
-static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
+static __always_inline bool bitmap_weight_le(const unsigned long *src,
+   unsigned int nbits, unsigned int num)
 {
if (small_const_nbits(nbits))
-   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
-   return __bitmap_weight(src, nbits);
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)) < num;
+
+   return __bitmap_weight_le(src, nbits, num);
+}
+
+static __always_inline bool bitmap_empty(const unsigned long *src, unsigned 
int nbits)
+{
+   return bitmap_weight_eq(src, nbits, 0);
+}
+
+static __always_inline bool bitmap_full(const unsigned long *src, unsigned int 
nbits)
+{
+   return bitmap_weight_eq(src, nbits, nbits);
 }
 
 static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index db466ef7be9d..3aaf1767d237 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -18,6 +18,66 @@ int __bitmap_weight(const unsigned long *bitmap, int bits)
return w;
 }
 
+bool __bitmap_weight_eq(const unsigned long *bitmap, unsigned int bits, 
unsigned int num)
+{
+   unsigned int k, w, lim = bits / BITS_PER_LONG;
+
+   for (k = 0, w = 0; k < lim; k++) {
+   if (w + bits - k * BITS_PER_LONG < num)
+   return false;
+
+   w += hweight_long(bitmap[k]);
+
+   if (w > num)
+   return false;
+   }
+
+   if (bits % BITS_PER_LONG)
+   w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
+
+   return w == num;
+}
+
+bool __bitmap_weight_gt(const unsigned long *bitmap, unsigned int bits, 
unsigned int num)
+{
+   unsigned int k, w, lim = bits / BITS_PER_LONG;
+
+   for (k = 0, w = 0; k < lim; k++) {
+   if (w + bits - k * BITS_PER_LONG <= num)
+   return false;
+
+   w += hweight_long(bitmap[k]);
+
+   if (w > num)
+   return true;
+  

[PATCH 5/9] lib/cpumask: add cpumask_weight_{eq,gt,le}

2021-11-27 Thread Yury Norov
Add cpumask_weight_{eq,gt,le} and replace cpumask_weight() with
cpumask_empty() or one of the new cpumask_weight_{eq,gt,le} helpers where
appropriate. This allows the weight check to return earlier depending on
the condition.
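
The new include/linux/cpumask.h wrappers are not visible in the truncated
diff below; presumably they mirror the existing cpumask_weight() and
simply forward to the bitmap helpers, roughly:

static inline bool cpumask_weight_eq(const struct cpumask *srcp, unsigned int num)
{
	return bitmap_weight_eq(cpumask_bits(srcp), nr_cpumask_bits, num);
}

static inline bool cpumask_weight_gt(const struct cpumask *srcp, unsigned int num)
{
	return bitmap_weight_gt(cpumask_bits(srcp), nr_cpumask_bits, num);
}

static inline bool cpumask_weight_le(const struct cpumask *srcp, unsigned int num)
{
	return bitmap_weight_le(cpumask_bits(srcp), nr_cpumask_bits, num);
}

The nodemask counterparts in the next patch follow the same pattern.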

Signed-off-by: Yury Norov 
---
 arch/alpha/kernel/process.c  |  2 +-
 arch/ia64/kernel/setup.c |  2 +-
 arch/ia64/mm/tlb.c   |  2 +-
 arch/mips/cavium-octeon/octeon-irq.c |  4 +--
 arch/mips/kernel/crash.c |  2 +-
 arch/powerpc/kernel/smp.c|  2 +-
 arch/powerpc/kernel/watchdog.c   |  4 +--
 arch/powerpc/xmon/xmon.c |  4 +--
 arch/s390/kernel/perf_cpum_cf.c  |  2 +-
 arch/x86/kernel/cpu/resctrl/rdtgroup.c   | 14 +--
 arch/x86/kernel/smpboot.c|  4 +--
 arch/x86/mm/mmio-mod.c   |  2 +-
 arch/x86/platform/uv/uv_nmi.c|  2 +-
 drivers/cpufreq/qcom-cpufreq-hw.c|  2 +-
 drivers/cpufreq/scmi-cpufreq.c   |  2 +-
 drivers/firmware/psci/psci_checker.c |  2 +-
 drivers/gpu/drm/i915/i915_pmu.c  |  2 +-
 drivers/hv/channel_mgmt.c|  4 +--
 drivers/infiniband/hw/hfi1/affinity.c| 13 +-
 drivers/infiniband/hw/qib/qib_file_ops.c |  2 +-
 drivers/infiniband/hw/qib/qib_iba7322.c  |  2 +-
 drivers/infiniband/sw/siw/siw_main.c |  3 +--
 drivers/irqchip/irq-bcm6345-l1.c |  2 +-
 drivers/scsi/lpfc/lpfc_init.c|  2 +-
 drivers/soc/fsl/qbman/qman_test_stash.c  |  2 +-
 include/linux/cpumask.h  | 32 
 kernel/irq/affinity.c|  2 +-
 kernel/padata.c  |  2 +-
 kernel/rcu/tree_nocb.h   |  4 +--
 kernel/rcu/tree_plugin.h |  2 +-
 kernel/sched/core.c  | 10 
 kernel/sched/topology.c  |  4 +--
 kernel/time/clockevents.c|  2 +-
 kernel/time/clocksource.c|  2 +-
 mm/vmstat.c  |  4 +--
 35 files changed, 89 insertions(+), 59 deletions(-)

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 5f8527081da9..0d4bc60828bf 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -125,7 +125,7 @@ common_shutdown_1(void *generic_ptr)
/* Wait for the secondaries to halt. */
set_cpu_present(boot_cpuid, false);
set_cpu_possible(boot_cpuid, false);
-   while (cpumask_weight(cpu_present_mask))
+   while (!cpumask_empty(cpu_present_mask))
barrier();
 #endif
 
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 5010348fa21b..fd6301eafa9d 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -572,7 +572,7 @@ setup_arch (char **cmdline_p)
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
prefill_possible_map();
 #endif
-   per_cpu_scan_finalize((cpumask_weight(&early_cpu_possible_map) == 0 ?
+   per_cpu_scan_finalize((cpumask_empty(&early_cpu_possible_map) ?
32 : cpumask_weight(&early_cpu_possible_map)),
additional_cpus > 0 ? additional_cpus : 0);
 #endif /* CONFIG_ACPI_NUMA */
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index 135b5135cace..a5bce13ab047 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -332,7 +332,7 @@ __flush_tlb_range (struct vm_area_struct *vma, unsigned 
long start,
 
preempt_disable();
 #ifdef CONFIG_SMP
-   if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) {
+   if (mm != current->active_mm || !cpumask_weight_eq(mm_cpumask(mm), 1)) {
ia64_global_tlb_purge(mm, start, end, nbits);
preempt_enable();
return;
diff --git a/arch/mips/cavium-octeon/octeon-irq.c 
b/arch/mips/cavium-octeon/octeon-irq.c
index 844f882096e6..914871f15fb7 100644
--- a/arch/mips/cavium-octeon/octeon-irq.c
+++ b/arch/mips/cavium-octeon/octeon-irq.c
@@ -763,7 +763,7 @@ static void octeon_irq_cpu_offline_ciu(struct irq_data 
*data)
if (!cpumask_test_cpu(cpu, mask))
return;
 
-   if (cpumask_weight(mask) > 1) {
+   if (cpumask_weight_gt(mask, 1)) {
/*
 * It has multi CPU affinity, just remove this CPU
 * from the affinity set.
@@ -795,7 +795,7 @@ static int octeon_irq_ciu_set_affinity(struct irq_data 
*data,
 * This removes the need to do locking in the .ack/.eoi
 * functions.
 */
-   if (cpumask_weight(dest) != 1)
+   if (!cpumask_weight_eq(dest, 1))
return -EINVAL;
 
if (!enable_one)
diff --git a/arch/mips/kernel/crash.c b/arch/mips/kernel/crash.c
index 81845ba04835..4c35004754db 100644
--- a/arch/mips/kernel/crash.c
+++ b/arch/mips/kernel/crash.c
@@ -72,7 +72,7 @@ static void crash_kexec_prepare_cpus(void)
 */
pr_emerg("Sending IPI to other cpus...\n");
msecs = 100

[PATCH 6/9] lib/nodemask: add nodemask_weight_{eq,gt,le}

2021-11-27 Thread Yury Norov
Add nodemask_weight_{eq,gt,le} and replace nodemask_weight() where
appropriate. This allows nodemask_weight_*() to return earlier
depending on the condition.

Signed-off-by: Yury Norov 
---
 arch/x86/mm/amdtopology.c|  2 +-
 arch/x86/mm/numa_emulation.c |  4 ++--
 drivers/acpi/numa/srat.c |  2 +-
 include/linux/nodemask.h | 24 
 mm/mempolicy.c   |  2 +-
 5 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 058b2f36b3a6..b3ca7d23e4b0 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -154,7 +154,7 @@ int __init amd_numa_init(void)
node_set(nodeid, numa_nodes_parsed);
}
 
-   if (!nodes_weight(numa_nodes_parsed))
+   if (nodes_empty(numa_nodes_parsed))
return -ENOENT;
 
/*
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 1a02b791d273..9a9305367fdd 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -123,7 +123,7 @@ static int __init split_nodes_interleave(struct 
numa_meminfo *ei,
 * Continue to fill physical nodes with fake nodes until there is no
 * memory left on any of them.
 */
-   while (nodes_weight(physnode_mask)) {
+   while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
@@ -270,7 +270,7 @@ static int __init 
split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
 * Fill physical nodes with fake nodes of size until there is no memory
 * left on any of them.
 */
-   while (nodes_weight(physnode_mask)) {
+   while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 66a0142dc78c..c4f80d2d85bf 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -67,7 +67,7 @@ int acpi_map_pxm_to_node(int pxm)
node = pxm_to_node_map[pxm];
 
if (node == NUMA_NO_NODE) {
-   if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
+   if (nodes_weight_gt(nodes_found_map, MAX_NUMNODES + 1))
return NUMA_NO_NODE;
node = first_unset_node(nodes_found_map);
__acpi_map_pxm_to_node(pxm, node);
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 567c3ddba2c4..3801ec5b06f4 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -38,6 +38,9 @@
  * int nodes_empty(mask)   Is mask empty (no bits sets)?
  * int nodes_full(mask)Is mask full (all bits sets)?
  * int nodes_weight(mask)  Hamming weight - number of set bits
+ * bool nodes_weight_eq(src, nbits, num) Hamming Weight is equal to num
+ * bool nodes_weight_gt(src, nbits, num) Hamming Weight is greater than num
+ * bool nodes_weight_le(src, nbits, num) Hamming Weight is less than num
  *
  * void nodes_shift_right(dst, src, n) Shift right
  * void nodes_shift_left(dst, src, n)  Shift left
@@ -240,6 +243,27 @@ static inline int __nodes_weight(const nodemask_t *srcp, 
unsigned int nbits)
return bitmap_weight(srcp->bits, nbits);
 }
 
+#define nodes_weight_eq(nodemask, num) __nodes_weight_eq(&(nodemask), 
MAX_NUMNODES, (num))
+static inline int __nodes_weight_eq(const nodemask_t *srcp,
+   unsigned int nbits, unsigned int num)
+{
+   return bitmap_weight_eq(srcp->bits, nbits, num);
+}
+
+#define nodes_weight_gt(nodemask, num) __nodes_weight_gt(&(nodemask), 
MAX_NUMNODES, (num))
+static inline int __nodes_weight_gt(const nodemask_t *srcp,
+   unsigned int nbits, unsigned int num)
+{
+   return bitmap_weight_gt(srcp->bits, nbits, num);
+}
+
+#define nodes_weight_le(nodemask, num) __nodes_weight_le(&(nodemask), 
MAX_NUMNODES, (num))
+static inline int __nodes_weight_le(const nodemask_t *srcp,
+   unsigned int nbits, unsigned int num)
+{
+   return bitmap_weight_le(srcp->bits, nbits, num);
+}
+
 #define nodes_shift_right(dst, src, n) \
__nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
 static inline void __nodes_shift_right(nodemask_t *dstp,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b1fcdb4d25d6..4a48ce5b86cf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1154,7 +1154,7 @@ int do_migrate_pages(struct mm_struct *mm, const 
nodemask_t *from,
 *  [0-7] - > [3,4,5] moves only 0,1,2,6,7.
 */
 
-   if ((nodes_weight(*from) != nodes_weig

[PATCH 7/9] lib/cpumask: add num_{possible, present, active}_cpus_{eq, gt, le}

2021-11-27 Thread Yury Norov
Add num_{possible,present,active}_cpus_{eq,gt,le} and replace num_*_cpus()
with one of the new functions where appropriate. This allows num_*_cpus_*()
to return earlier depending on the condition.

Signed-off-by: Yury Norov 
---
 arch/arc/kernel/smp.c |  2 +-
 arch/arm/kernel/machine_kexec.c   |  2 +-
 arch/arm/mach-exynos/exynos.c |  2 +-
 arch/arm/mm/cache-b15-rac.c   |  2 +-
 arch/arm64/kernel/smp.c   |  2 +-
 arch/arm64/mm/context.c   |  2 +-
 arch/csky/mm/asid.c   |  2 +-
 arch/csky/mm/context.c|  2 +-
 arch/ia64/mm/tlb.c|  6 ++---
 arch/mips/kernel/i8253.c  |  2 +-
 arch/mips/kernel/perf_event_mipsxx.c  |  4 ++--
 arch/mips/kernel/rtlx-cmp.c   |  2 +-
 arch/mips/kernel/smp.c|  4 ++--
 arch/mips/kernel/vpe-cmp.c|  2 +-
 .../loongson2ef/common/cs5536/cs5536_mfgpt.c  |  2 +-
 arch/mips/mm/context.c|  2 +-
 arch/mips/mm/tlbex.c  |  2 +-
 arch/nios2/kernel/cpuinfo.c   |  2 +-
 arch/powerpc/platforms/85xx/smp.c |  2 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |  4 ++--
 arch/powerpc/sysdev/mpic.c|  2 +-
 arch/powerpc/xmon/xmon.c  |  6 ++---
 arch/riscv/kvm/vmid.c |  2 +-
 arch/sparc/kernel/mdesc.c |  6 ++---
 arch/x86/events/amd/core.c|  2 +-
 arch/x86/kernel/alternative.c |  8 +++
 arch/x86/kernel/apic/apic.c   |  4 ++--
 arch/x86/kernel/apic/apic_flat_64.c   |  2 +-
 arch/x86/kernel/apic/probe_32.c   |  2 +-
 arch/x86/kernel/cpu/mce/dev-mcelog.c  |  2 +-
 arch/x86/kernel/hpet.c|  2 +-
 arch/x86/kernel/i8253.c   |  2 +-
 arch/x86/kernel/kvm.c |  2 +-
 arch/x86/kernel/kvmclock.c|  2 +-
 arch/x86/kernel/tsc.c |  2 +-
 arch/x86/xen/smp_pv.c |  2 +-
 arch/x86/xen/spinlock.c   |  2 +-
 drivers/clk/samsung/clk-exynos4.c |  2 +-
 drivers/clocksource/ingenic-timer.c   |  3 +--
 drivers/cpufreq/pcc-cpufreq.c |  2 +-
 drivers/dma/mv_xor.c  |  5 ++--
 drivers/gpu/drm/i810/i810_drv.c   |  2 +-
 drivers/irqchip/irq-gic.c |  2 +-
 drivers/net/caif/caif_virtio.c|  2 +-
 .../cavium/liquidio/cn23xx_vf_device.c|  2 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c |  2 +-
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   |  2 +-
 drivers/net/wireless/ath/ath9k/hw.c   |  2 +-
 drivers/net/wireless/marvell/mwifiex/main.c   |  4 ++--
 drivers/net/wireless/st/cw1200/queue.c|  3 +--
 drivers/nvdimm/region.c   |  2 +-
 drivers/nvme/host/pci.c   |  2 +-
 drivers/perf/arm_pmu.c|  2 +-
 .../intel/speed_select_if/isst_if_common.c|  6 ++---
 drivers/soc/bcm/brcmstb/biuctrl.c |  2 +-
 drivers/soc/fsl/dpio/dpio-service.c   |  4 ++--
 drivers/spi/spi-dw-bt1.c  |  2 +-
 drivers/virt/acrn/hsm.c   |  2 +-
 fs/xfs/xfs_sysfs.c|  2 +-
 include/linux/cpumask.h   | 23 +++
 include/linux/kdb.h   |  2 +-
 kernel/debug/kdb/kdb_bt.c |  2 +-
 kernel/printk/printk.c|  2 +-
 kernel/reboot.c   |  4 ++--
 kernel/time/clockevents.c |  2 +-
 mm/percpu.c   |  6 ++---
 mm/slab.c |  2 +-
 67 files changed, 110 insertions(+), 90 deletions(-)

diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 78e6d069b1c1..d4f2765755c9 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -103,7 +103,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 * if platform didn't set the present map already, do it now
 * boot cpu is set to present already by init/main.c
 */
-   if (num_present_cpus() <= 1)
+   if (num_present_cpus_le(2))
init_cpu_present(cpu_possible_mask);
 }
 
diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index f567032a09c0..8875e2ee0083 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -44,7 +44,7 @@ int machine_kexec_prepare(struct kimage *image)
 * and implements CPU hotplug for the current HW. If not, we won't be
 * able to kexec reliably, so fail the prepare operation.
 */
-   if (num_possib

[PATCH 8/9] lib/nodemask: add num_node_state_eq()

2021-11-27 Thread Yury Norov
Add num_node_state_eq() and replace num_node_state() with it in
page_alloc_init().

Signed-off-by: Yury Norov 
---
 include/linux/nodemask.h | 5 +
 mm/page_alloc.c  | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 3801ec5b06f4..b68ee2a80164 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -455,6 +455,11 @@ static inline int num_node_state(enum node_states state)
return nodes_weight(node_states[state]);
 }
 
+static inline int num_node_state_eq(enum node_states state, unsigned int num)
+{
+   return nodes_weight_eq(node_states[state], num);
+}
+
 #define for_each_node_state(__node, __state) \
for_each_node_mask((__node), node_states[__state])
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 91c1105a9efe..81d55ffb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8323,7 +8323,7 @@ void __init page_alloc_init(void)
int ret;
 
 #ifdef CONFIG_NUMA
-   if (num_node_state(N_MEMORY) == 1)
+   if (num_node_state_eq(N_MEMORY, 1))
hashdist = 0;
 #endif
 
-- 
2.25.1



[PATCH 9/9] MAINTAINERS: add cpumask and nodemask files to BITMAP_API

2021-11-27 Thread Yury Norov
The cpumask and nodemask APIs are thin wrappers around the basic bitmap
API, and the corresponding files are not formally maintained. This patch
adds them to the BITMAP_API section, so that the bitmap folks have a
closer look at them.

Signed-off-by: Yury Norov 
---
 MAINTAINERS | 4 
 1 file changed, 4 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 00ad0cb5cb05..ceeffcd81fa4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3375,10 +3375,14 @@ R:  Andy Shevchenko 

 R: Rasmus Villemoes 
 S: Maintained
 F: include/linux/bitmap.h
+F: include/linux/cpumask.h
 F: include/linux/find.h
+F: include/linux/nodemask.h
 F: lib/bitmap.c
+F: lib/cpumask.c
 F: lib/find_bit.c
 F: lib/find_bit_benchmark.c
+F: lib/nodemask.c
 F: lib/test_bitmap.c
 F: tools/include/linux/bitmap.h
 F: tools/include/linux/find.h
-- 
2.25.1



Re: [PATCH 2/9] lib/bitmap: implement bitmap_{empty,full} with bitmap_weight_eq()

2021-11-27 Thread Yury Norov
On Sun, Nov 28, 2021 at 05:37:19AM +0100, Michał Mirosław wrote:
> On Sat, Nov 27, 2021 at 07:56:57PM -0800, Yury Norov wrote:
> > Now as we have bitmap_weight_eq(), switch bitmap_full() and
> > bitmap_empty() to using it.
> [...]
> > -static inline bool bitmap_empty(const unsigned long *src, unsigned nbits)
> > -{
> > -   if (small_const_nbits(nbits))
> > -   return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
> > -
> > -   return find_first_bit(src, nbits) == nbits;
> > -}
> [...]
> > +static __always_inline bool bitmap_empty(const unsigned long *src, 
> > unsigned int nbits)
> > +{
> > +   return bitmap_weight_eq(src, nbits, 0);
> > +}
> [..]
> 
> What's the speed difference? Have you benchmarked this?

bitmap_weight_eq() should be faster than find_first_bit(), but the
difference is a few cycles, so I didn't bother measuring it.

The new version simply looks better.


Re: [PATCH 7/9] lib/cpumask: add num_{possible,present,active}_cpus_{eq,gt,le}

2021-11-27 Thread Yury Norov
(restore CC list)

On Sun, Nov 28, 2021 at 05:56:51AM +0100, Michał Mirosław wrote:
> On Sat, Nov 27, 2021 at 07:57:02PM -0800, Yury Norov wrote:
> > Add num_{possible,present,active}_cpus_{eq,gt,le} and replace num_*_cpus()
> > with one of new functions where appropriate. This allows num_*_cpus_*()
> > to return earlier depending on the condition.
> [...]
> > @@ -3193,7 +3193,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
> >  
> > /* allocate pages */
> > j = 0;
> > -   for (unit = 0; unit < num_possible_cpus(); unit++) {
> > +   for (unit = 0; num_possible_cpus_gt(unit); unit++) {
> 
> This looks dubious.

Only this?

> The old version I could hope the compiler would call
> num_possible_cpus() only once if it's marked const or pure, but the
> alternative is going to count the bits every time making this a guaranteed
> O(n^2) even though the bitmap doesn't change.

num_possible_cpus() is neither const nor pure. This is O(n^2) both before and after.



Re: [PATCH 7/9] lib/cpumask: add num_{possible,present,active}_cpus_{eq,gt,le}

2021-11-28 Thread Yury Norov
On Sun, Nov 28, 2021 at 09:07:52AM -0800, Joe Perches wrote:
> On Sat, 2021-11-27 at 19:57 -0800, Yury Norov wrote:
> > Add num_{possible,present,active}_cpus_{eq,gt,le} and replace num_*_cpus()
> > with one of new functions where appropriate. This allows num_*_cpus_*()
> > to return earlier depending on the condition.
> []
> > diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
> []
> > @@ -103,7 +103,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> >  * if platform didn't set the present map already, do it now
> >  * boot cpu is set to present already by init/main.c
> >  */
> > -   if (num_present_cpus() <= 1)
> > +   if (num_present_cpus_le(2))
> > init_cpu_present(cpu_possible_mask);
> 
> ?  is this supposed to be 2 or 1

X <= 1 is the equivalent of X < 2.

> > diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c
> []
> > @@ -593,7 +593,7 @@ static int __init pcc_cpufreq_init(void)
> > return ret;
> > }
> >  
> > -   if (num_present_cpus() > 4) {
> > +   if (num_present_cpus_gt(4)) {
> > pcc_cpufreq_driver.flags |= CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING;
> > pr_err("%s: Too many CPUs, dynamic performance scaling 
> > disabled\n",
> >__func__);
> 
> It looks as if the present variants should be using the same values
> so the _le test above with 1 changed to 2 looks odd.
 


Re: [PATCH 7/9] lib/cpumask: add num_{possible,present,active}_cpus_{eq,gt,le}

2021-11-28 Thread Yury Norov
On Sun, Nov 28, 2021 at 12:54:00PM -0500, Dennis Zhou wrote:
> Hello,
> 
> On Sun, Nov 28, 2021 at 09:43:20AM -0800, Yury Norov wrote:
> > On Sun, Nov 28, 2021 at 09:07:52AM -0800, Joe Perches wrote:
> > > On Sat, 2021-11-27 at 19:57 -0800, Yury Norov wrote:
> > > > Add num_{possible,present,active}_cpus_{eq,gt,le} and replace 
> > > > num_*_cpus()
> > > > with one of new functions where appropriate. This allows num_*_cpus_*()
> > > > to return earlier depending on the condition.
> > > []
> > > > diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
> > > []
> > > > @@ -103,7 +103,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
> > > >  * if platform didn't set the present map already, do it now
> > > >  * boot cpu is set to present already by init/main.c
> > > >  */
> > > > -   if (num_present_cpus() <= 1)
> > > > +   if (num_present_cpus_le(2))
> > > > init_cpu_present(cpu_possible_mask);
> > > 
> > > ?  is this supposed to be 2 or 1
> > 
> > X <= 1 is the equivalent of X < 2.
> > 
> > > > diff --git a/drivers/cpufreq/pcc-cpufreq.c 
> > > > b/drivers/cpufreq/pcc-cpufreq.c
> > > []
> > > > @@ -593,7 +593,7 @@ static int __init pcc_cpufreq_init(void)
> > > > return ret;
> > > > }
> > > >  
> > > > -   if (num_present_cpus() > 4) {
> > > > +   if (num_present_cpus_gt(4)) {
> > > > pcc_cpufreq_driver.flags |= 
> > > > CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING;
> > > > pr_err("%s: Too many CPUs, dynamic performance scaling 
> > > > disabled\n",
> > > >__func__);
> > > 
> > > It looks as if the present variants should be using the same values
> > > so the _le test above with 1 changed to 2 looks odd.
> >  
> 
> I think the confusion comes from le meaning less than rather than lt.
> Given the general convention of: lt (<), le (<=), eg (=), ge (>=),
> gt (>), I'd consider renaming your le to lt.

Ok, makes sense. I'll rename in v2 and add <= and >= versions.


Re: [PATCH 0/9] lib/bitmap: optimize bitmap_weight() usage

2021-11-28 Thread Yury Norov
On Sun, Nov 28, 2021 at 09:08:41PM +1000, Nicholas Piggin wrote:
> Excerpts from Yury Norov's message of November 28, 2021 1:56 pm:
> > In many cases people use bitmap_weight()-based functions like this:
> > 
> > if (num_present_cpus() > 1)
> > do_something();
> > 
> > This may take considerable amount of time on many-cpus machines because
> > num_present_cpus() will traverse every word of underlying cpumask
> > unconditionally.
> > 
> > We can significantly improve on it for many real cases if stop traversing
> > the mask as soon as we count present cpus to any number greater than 1:
> > 
> > if (num_present_cpus_gt(1))
> > do_something();
> > 
> > To implement this idea, the series adds bitmap_weight_{eq,gt,le}
> > functions together with corresponding wrappers in cpumask and nodemask.
> 
> There would be no change to callers if you maintain counters like what
> is done for num_online_cpus() today. Maybe some fixes to arch code that
> does not use set_cpu_possible() etc APIs required, but AFAIKS it would
> be better to fix such cases anyway.

Thanks, Nick. I'll try to do this.


Re: [PATCH 0/9] lib/bitmap: optimize bitmap_weight() usage

2021-11-28 Thread Yury Norov
On Sun, Nov 28, 2021 at 07:03:41PM +0100, mirq-t...@rere.qmqm.pl wrote:
> On Sat, Nov 27, 2021 at 07:56:55PM -0800, Yury Norov wrote:
> > In many cases people use bitmap_weight()-based functions like this:
> > 
> > if (num_present_cpus() > 1)
> > do_something();
> > 
> > This may take considerable amount of time on many-cpus machines because
> > num_present_cpus() will traverse every word of underlying cpumask
> > unconditionally.
> > 
> > We can significantly improve on it for many real cases if stop traversing
> > the mask as soon as we count present cpus to any number greater than 1:
> > 
> > if (num_present_cpus_gt(1))
> > do_something();
> > 
> > To implement this idea, the series adds bitmap_weight_{eq,gt,le}
> > functions together with corresponding wrappers in cpumask and nodemask.
> 
> Having slept on it I have more structured thoughts:
> 
> First, I like substituting bitmap_empty/full where possible - I think
> the change stands on its own, so could be split and sent as is.

Ok, I can do it.

> I don't like the proposed API very much. One problem is that it hides
> the comparison operator and makes call sites less readable:
> 
>   bitmap_weight(...) > N
> 
> becomes:
> 
>   bitmap_weight_gt(..., N)
> 
> and:
>   bitmap_weight(...) <= N
> 
> becomes:
> 
>   bitmap_weight_lt(..., N+1)
> or:
>   !bitmap_weight_gt(..., N)
> 
> I'd rather see something resembling memcmp() API that's known enough
> to be easier to grasp. For above examples:
> 
>   bitmap_weight_cmp(..., N) > 0
>   bitmap_weight_cmp(..., N) <= 0
>   ...

bitmap_weight_cmp() cannot be efficient. Consider this example:

bitmap_weight_lt(1000 0000 0000 0000, 1) == false
                 ^
                 stop here

bitmap_weight_cmp(1000 0000 0000 0000, 1) == 0
                                    ^
                                    stop here

I agree that '_gt' is not as readable as '>', but the advantage of
'_gt' over '>' is proportional to the length of the bitmap, which is
why this API should exist.

> This would also make the implementation easier in not having to
> copy and paste the code three times. Could also use a simple
> optimization reducing code size:

In the next version I'll reduce code duplication like this:

bool bitmap_weight_eq(..., N);
bool bitmap_weight_ge(..., N);

#define bitmap_weight_gt(..., N)  bitmap_weight_ge(..., N + 1)
#define bitmap_weight_lt(..., N) !bitmap_weight_ge(..., N)
#define bitmap_weight_le(..., N) !bitmap_weight_gt(..., N)

Thanks,
Yury


Re: [PATCH 0/9] lib/bitmap: optimize bitmap_weight() usage

2021-12-01 Thread Yury Norov
On Mon, Nov 29, 2021 at 04:34:07PM +, Michał Mirosław wrote:
> Dnia 29 listopada 2021 06:38:39 UTC, Yury Norov  
> napisał/a:
> >On Sun, Nov 28, 2021 at 07:03:41PM +0100, mirq-t...@rere.qmqm.pl wrote:
> >> On Sat, Nov 27, 2021 at 07:56:55PM -0800, Yury Norov wrote:
> >> > In many cases people use bitmap_weight()-based functions like this:
> >> > 
> >> >  if (num_present_cpus() > 1)
> >> >  do_something();
> >> > 
> >> > This may take considerable amount of time on many-cpus machines because
> >> > num_present_cpus() will traverse every word of underlying cpumask
> >> > unconditionally.
> >> > 
> >> > We can significantly improve on it for many real cases if stop traversing
> >> > the mask as soon as we count present cpus to any number greater than 1:
> >> > 
> >> >  if (num_present_cpus_gt(1))
> >> >  do_something();
> >> > 
> >> > To implement this idea, the series adds bitmap_weight_{eq,gt,le}
> >> > functions together with corresponding wrappers in cpumask and nodemask.
> >> 
> >> Having slept on it I have more structured thoughts:
> >> 
> >> First, I like substituting bitmap_empty/full where possible - I think
> >> the change stands on its own, so could be split and sent as is.
> >
> >Ok, I can do it.
> >
> >> I don't like the proposed API very much. One problem is that it hides
> >> the comparison operator and makes call sites less readable:
> >> 
> >>bitmap_weight(...) > N
> >> 
> >> becomes:
> >> 
> >>bitmap_weight_gt(..., N)
> >> 
> >> and:
> >>bitmap_weight(...) <= N
> >> 
> >> becomes:
> >> 
> >>bitmap_weight_lt(..., N+1)
> >> or:
> >>!bitmap_weight_gt(..., N)
> >> 
> >> I'd rather see something resembling memcmp() API that's known enough
> >> to be easier to grasp. For above examples:
> >> 
> >>bitmap_weight_cmp(..., N) > 0
> >>bitmap_weight_cmp(..., N) <= 0
> >>...
> >
> >bitmap_weight_cmp() cannot be efficient. Consider this example:
> >
> >bitmap_weight_lt(1000 0000 0000 0000, 1) == false
> >                 ^
> >                 stop here
> >
> >bitmap_weight_cmp(1000 0000 0000 0000, 1) == 0
> >                                    ^
> >                                    stop here
> >
> >I agree that '_gt' is less verbose than '>', but the advantage of 
> >'_gt' over '>' is proportional to length of bitmap, and it means
> >that this API should exist.
> 
> Thank you for the example. Indeed, for less-than to be efficient here you 
> would need to replace
>  bitmap_weight_cmp(..., N) < 0
> with
>  bitmap_weight_cmp(..., N-1) <= 0

Indeed, thanks for pointing to it.
 
> It would still be more readable, I think.

To be honest, I'm not sure that
	bitmap_weight_cmp(..., N-1) <= 0
would be an obvious replacement for the original
	bitmap_weight(...) < N
compared to
	bitmap_weight_lt(..., N)

I think the best thing I can do is to add bitmap_weight_cmp() as you
suggested and turn lt and the others into wrappers around it. This
will let people choose the most suitable form in each case.
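
Something along these lines, as a sketch only and not necessarily the
exact v2 code:

/* Returns the sign of (bitmap weight - num). */
int bitmap_weight_cmp(const unsigned long *bitmap, unsigned int nbits, int num);

#define bitmap_weight_eq(bitmap, nbits, num) \
	(bitmap_weight_cmp((bitmap), (nbits), (num)) == 0)
#define bitmap_weight_gt(bitmap, nbits, num) \
	(bitmap_weight_cmp((bitmap), (nbits), (num)) > 0)
#define bitmap_weight_ge(bitmap, nbits, num) \
	(bitmap_weight_cmp((bitmap), (nbits), (num) - 1) > 0)
#define bitmap_weight_lt(bitmap, nbits, num) \
	(!bitmap_weight_ge((bitmap), (nbits), (num)))
#define bitmap_weight_le(bitmap, nbits, num) \
	(!bitmap_weight_gt((bitmap), (nbits), (num)))

The comparison still bails out early once the partial count exceeds num,
or once the remaining bits cannot reach it; only the == 0 case may need
to scan the whole bitmap.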

I also think that for v2 it would be better to drop the conversion for
short bitmaps, except for switching to bitmap_empty(), because there
readability wins over performance, if there are no objections.

Thanks,
Yury


Re: [PATCH 2/9] lib/bitmap: implement bitmap_{empty, full} with bitmap_weight_eq()

2021-12-14 Thread Yury Norov
On Sun, Nov 28, 2021 at 10:10 AM Michał Mirosław
 wrote:
>
> On Sat, Nov 27, 2021 at 07:56:57PM -0800, Yury Norov wrote:
> > Now as we have bitmap_weight_eq(), switch bitmap_full() and
> > bitmap_empty() to using it.
> >
> > Signed-off-by: Yury Norov 
> > ---
> >  include/linux/bitmap.h | 26 ++
> >  1 file changed, 10 insertions(+), 16 deletions(-)
> >
> > diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> > index 996041f771c8..2d951e4dc814 100644
> > --- a/include/linux/bitmap.h
> > +++ b/include/linux/bitmap.h
> > @@ -386,22 +386,6 @@ static inline int bitmap_subset(const unsigned long 
> > *src1,
> >   return __bitmap_subset(src1, src2, nbits);
> >  }
> >
> > -static inline bool bitmap_empty(const unsigned long *src, unsigned nbits)
> > -{
> > - if (small_const_nbits(nbits))
> > - return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
> > -
> > - return find_first_bit(src, nbits) == nbits;
> > -}
>
> Since this is supposed to be an optimization, I would go all the way and
> replace this with the trivial implementation instead:
>
> bool bitmap_empty(long *bits, size_t nbits)
> {
> for (; nbits >= BITS_PER_LONG; ++bits, nbits -= BITS_PER_LONG)
> if (*bits)
> return false;
>
> if (nbits && *bits & BITMAP_LAST_WORD_MASK(nbits))
> return false;
>
> return true;
> }

This is basically what the current implementation does, based on find_first_bit().

I think that for long bitmaps the most time-consuming operation is moving
data to L1, and for short bitmaps the difference between the approaches is
barely measurable.

But hweight_long() on each iteration can't be more efficient than the
current version. So I'll drop this patch for v2 and keep things unchanged.


Re: [PATCH 2/9] lib/bitmap: implement bitmap_{empty, full} with bitmap_weight_eq()

2021-12-16 Thread Yury Norov
On Wed, Dec 15, 2021 at 12:41 AM David Laight  wrote:
>
> From: Yury Norov
> > Sent: 14 December 2021 19:43
> ...
> >
> > I think that for long bitmaps the most time consuming operation is moving
> > data to L1, and for short bitmaps the difference between approaches is
> > barely measurable.
> >
> > But hweght_long on each iteration can't be more effective than the current
> > version. So, I'll drop this patch for v2 and keep things unchanged.
>
> Actually do bitmap_full/empty() calls make any sense at all?
> The result is stale since bitmaps are designed to do locked operations.
> If you have a lock covering the bitmap then you should be using
> something that uses non-locked accesses.
> Rightly or wrongly that isn't the bitmap api.

Are you talking about __{set,clear}_bit()?
include/asm-generic/bitops/non-atomic.h


[PATCH 01/17] all: don't use bitmap_weight() where possible

2021-12-18 Thread Yury Norov
Don't call bitmap_weight() if the following code can get by
without it.

Signed-off-by: Yury Norov 
---
 drivers/net/dsa/b53/b53_common.c   | 6 +-
 drivers/net/ethernet/broadcom/bcmsysport.c | 6 +-
 drivers/thermal/intel/intel_powerclamp.c   | 9 +++--
 3 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 3867f3d4545f..9a10d80125d9 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1620,12 +1620,8 @@ static int b53_arl_read(struct b53_device *dev, u64 mac,
return 0;
}
 
-   if (bitmap_weight(free_bins, dev->num_arl_bins) == 0)
-   return -ENOSPC;
-
*idx = find_first_bit(free_bins, dev->num_arl_bins);
-
-   return -ENOENT;
+   return *idx >= dev->num_arl_bins ? -ENOSPC : -ENOENT;
 }
 
 static int b53_arl_op(struct b53_device *dev, int op, int port,
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index 40933bf5a710..241696fdc6c7 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2177,13 +2177,9 @@ static int bcm_sysport_rule_set(struct bcm_sysport_priv 
*priv,
if (nfc->fs.ring_cookie != RX_CLS_FLOW_WAKE)
return -EOPNOTSUPP;
 
-   /* All filters are already in use, we cannot match more rules */
-   if (bitmap_weight(priv->filters, RXCHK_BRCM_TAG_MAX) ==
-   RXCHK_BRCM_TAG_MAX)
-   return -ENOSPC;
-
index = find_first_zero_bit(priv->filters, RXCHK_BRCM_TAG_MAX);
if (index >= RXCHK_BRCM_TAG_MAX)
+   /* All filters are already in use, we cannot match more rules */
return -ENOSPC;
 
/* Location is the classification ID, and index is the position
diff --git a/drivers/thermal/intel/intel_powerclamp.c 
b/drivers/thermal/intel/intel_powerclamp.c
index 14256421d98c..c841ab37e7c6 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -556,12 +556,9 @@ static void end_power_clamp(void)
 * stop faster.
 */
clamping = false;
-   if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
-   for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
-   pr_debug("clamping worker for cpu %d alive, destroy\n",
-i);
-   stop_power_clamp_worker(i);
-   }
+   for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
+   pr_debug("clamping worker for cpu %d alive, destroy\n", i);
+   stop_power_clamp_worker(i);
}
 }
 
-- 
2.30.2



[PATCH v2 00/17] lib/bitmap: optimize bitmap_weight() usage

2021-12-18 Thread Yury Norov
In many cases people use bitmap_weight()-based functions to compare
the result against a number or expression:

	if (cpumask_weight(...) > 1)
		do_something();

This may take a considerable amount of time on many-CPU machines because
cpumask_weight(...) traverses every word of the underlying cpumask
unconditionally.

For many real cases we can do significantly better if we stop traversing
the mask as soon as the count of set CPUs exceeds 1:

	if (cpumask_weight_gt(..., 1))
		do_something();

To implement this idea, the series adds a bitmap_weight_cmp() function
and bitmap_weight_{eq,gt,ge,lt,le} macros on top of it, together with
corresponding wrappers in cpumask and nodemask.

There are 3 cpumasks whose weight is counted frequently: possible,
present and active. They are all read-mostly, and to optimize counting
the number of set bits in them, this series adds atomic counters,
similarly to the online cpumask.
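
The counters are maintained the same way __num_online_cpus already is:
the helper that updates the mask also adjusts an atomic counter, so the
corresponding num_*_cpus() read becomes O(1). A rough sketch of the idea,
illustrative only and not the exact code of patches 13-15:

/* __cpu_possible_mask and the cpumask helpers come from linux/cpumask.h. */
static atomic_t __num_possible_cpus = ATOMIC_INIT(0);

void set_cpu_possible(unsigned int cpu, bool possible)
{
	if (possible) {
		if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask))
			atomic_inc(&__num_possible_cpus);
	} else {
		if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask))
			atomic_dec(&__num_possible_cpus);
	}
}

static inline unsigned int num_possible_cpus(void)
{
	return atomic_read(&__num_possible_cpus);
}

The present and active counters follow the same pattern.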

v1: https://lkml.org/lkml/2021/11/27/339
v2:
  - add bitmap_weight_cmp();
  - fix bitmap_weight_le semantics and provide full set of {eq,gt,ge,lt,le}
as wrappers around bitmap_weight_cmp();
  - don't touch small bitmaps (less than 32 bits) - optimization works
only for large bitmaps;
  - move bitmap_weight() == 0 -> bitmap_empty() conversion to a separate
patch, ditto cpumask_weight() and nodes_weight;
  - add counters for possible, present and active cpus;
  - drop bitmap_empty() where possible;
  - various fixes around bit counting that caught my eye.

Yury Norov (17):
  all: don't use bitmap_weight() where possible
  drivers: rename num_*_cpus variables
  fix open-coded for_each_set_bit()
  all: replace bitmap_weight with bitmap_empty where appropriate
  all: replace cpumask_weight with cpumask_empty where appropriate
  all: replace nodes_weight with nodes_empty where appropriate
  lib/bitmap: add bitmap_weight_{cmp,eq,gt,ge,lt,le} functions
  all: replace bitmap_weight with bitmap_weight_{eq,gt,ge,lt,le} where
appropriate
  lib/cpumask: add cpumask_weight_{eq,gt,ge,lt,le}
  lib/nodemask: add nodemask_weight_{eq,gt,ge,lt,le}
  lib/nodemask: add num_node_state_eq()
  kernel/cpu.c: fix init_cpu_online
  kernel/cpu: add num_possible_cpus counter
  kernel/cpu: add num_present_cpu counter
  kernel/cpu: add num_active_cpu counter
  tools/bitmap: sync bitmap_weight
  MAINTAINERS: add cpumask and nodemask files to BITMAP_API

 MAINTAINERS   |   4 +
 arch/alpha/kernel/process.c   |   2 +-
 arch/ia64/kernel/setup.c  |   2 +-
 arch/ia64/mm/tlb.c|   2 +-
 arch/mips/cavium-octeon/octeon-irq.c  |   4 +-
 arch/mips/kernel/crash.c  |   2 +-
 arch/nds32/kernel/perf_event_cpu.c|   2 +-
 arch/powerpc/kernel/smp.c |   2 +-
 arch/powerpc/kernel/watchdog.c|   2 +-
 arch/powerpc/xmon/xmon.c  |   4 +-
 arch/s390/kernel/perf_cpum_cf.c   |   2 +-
 arch/x86/kernel/cpu/resctrl/rdtgroup.c|  16 +--
 arch/x86/kernel/smpboot.c |   4 +-
 arch/x86/kvm/hyperv.c |   8 +-
 arch/x86/mm/amdtopology.c |   2 +-
 arch/x86/mm/mmio-mod.c|   2 +-
 arch/x86/mm/numa_emulation.c  |   4 +-
 arch/x86/platform/uv/uv_nmi.c |   2 +-
 drivers/acpi/numa/srat.c  |   2 +-
 drivers/cpufreq/qcom-cpufreq-hw.c |   2 +-
 drivers/cpufreq/scmi-cpufreq.c|   2 +-
 drivers/firmware/psci/psci_checker.c  |   2 +-
 drivers/gpu/drm/i915/i915_pmu.c   |   2 +-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c  |   2 +-
 drivers/hv/channel_mgmt.c |   4 +-
 drivers/iio/dummy/iio_simple_dummy_buffer.c   |   4 +-
 drivers/iio/industrialio-trigger.c|   2 +-
 drivers/infiniband/hw/hfi1/affinity.c |  13 +-
 drivers/infiniband/hw/qib/qib_file_ops.c  |   2 +-
 drivers/infiniband/hw/qib/qib_iba7322.c   |   2 +-
 drivers/irqchip/irq-bcm6345-l1.c  |   2 +-
 drivers/leds/trigger/ledtrig-cpu.c|   6 +-
 drivers/memstick/core/ms_block.c  |   4 +-
 drivers/net/dsa/b53/b53_common.c  |   6 +-
 drivers/net/ethernet/broadcom/bcmsysport.c|   6 +-
 .../net/ethernet/intel/ice/ice_virtchnl_pf.c  |   4 +-
 .../net/ethernet/intel/ixgbe/ixgbe_sriov.c|   2 +-
 .../marvell/octeontx2/nic/otx2_ethtool.c  |   2 +-
 .../marvell/octeontx2/nic/otx2_flows.c|   8 +-
 .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |   2 +-
 drivers/net/ethernet/mellanox/mlx4/cmd.c  |  33 ++---
 drivers/net/ethernet/mellanox/mlx4/eq.c   |   4 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c   |   4 +-
 drivers/net/ethernet/mellanox/mlx4/main.c |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_rdma.c|   4 +-
 drivers/net/ethernet/qlogic/qed/qed

[PATCH 02/17] drivers: rename num_*_cpus variables

2021-12-18 Thread Yury Norov
Some drivers declare local variables named num_active_cpus and
num_present_cpus, even though the kernel already provides macros with
those names in linux/cpumask.h, and the drivers include cpumask.h.

The following patches turn num_*_cpus() into real functions, which
would cause build failures in these drivers. Rename the local variables
to avoid the clash.

Signed-off-by: Yury Norov 
---
 drivers/leds/trigger/ledtrig-cpu.c | 6 +++---
 drivers/scsi/storvsc_drv.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/leds/trigger/ledtrig-cpu.c 
b/drivers/leds/trigger/ledtrig-cpu.c
index 8af4f9bb9cde..767e9749ca41 100644
--- a/drivers/leds/trigger/ledtrig-cpu.c
+++ b/drivers/leds/trigger/ledtrig-cpu.c
@@ -39,7 +39,7 @@ struct led_trigger_cpu {
 static DEFINE_PER_CPU(struct led_trigger_cpu, cpu_trig);
 
 static struct led_trigger *trig_cpu_all;
-static atomic_t num_active_cpus = ATOMIC_INIT(0);
+static atomic_t _active_cpus = ATOMIC_INIT(0);
 
 /**
  * ledtrig_cpu - emit a CPU event as a trigger
@@ -79,8 +79,8 @@ void ledtrig_cpu(enum cpu_led_event ledevt)
 
/* Update trigger state */
trig->is_active = is_active;
-   atomic_add(is_active ? 1 : -1, &num_active_cpus);
-   active_cpus = atomic_read(&num_active_cpus);
+   atomic_add(is_active ? 1 : -1, &_active_cpus);
+   active_cpus = atomic_read(&_active_cpus);
total_cpus = num_present_cpus();
 
led_trigger_event(trig->_trig,
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 20595c0ba0ae..705dd4ebde98 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1950,7 +1950,7 @@ static int storvsc_probe(struct hv_device *device,
 {
int ret;
int num_cpus = num_online_cpus();
-   int num_present_cpus = num_present_cpus();
+   int present_cpus = num_present_cpus();
struct Scsi_Host *host;
struct hv_host_device *host_dev;
bool dev_is_ide = ((dev_id->driver_data == IDE_GUID) ? true : false);
@@ -2060,7 +2060,7 @@ static int storvsc_probe(struct hv_device *device,
 * Set the number of HW queues we are supporting.
 */
if (!dev_is_ide) {
-   if (storvsc_max_hw_queues > num_present_cpus) {
+   if (storvsc_max_hw_queues > present_cpus) {
storvsc_max_hw_queues = 0;
storvsc_log(device, STORVSC_LOGGING_WARN,
"Resetting invalid storvsc_max_hw_queues value 
to default.\n");
@@ -2068,7 +2068,7 @@ static int storvsc_probe(struct hv_device *device,
if (storvsc_max_hw_queues)
host->nr_hw_queues = storvsc_max_hw_queues;
else
-   host->nr_hw_queues = num_present_cpus;
+   host->nr_hw_queues = present_cpus;
}
 
/*
-- 
2.30.2



[PATCH 03/17] fix open-coded for_each_set_bit()

2021-12-18 Thread Yury Norov
The Mellanox mlx4 driver open-codes a for_each_set_bit() loop over its
active ports. Replace the open-coded loop with the dedicated iterator.

Signed-off-by: Yury Norov 
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index e10b7b04b894..c56d2194cbfc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1994,21 +1994,16 @@ static void mlx4_allocate_port_vpps(struct mlx4_dev 
*dev, int port)
 
 static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave)
 {
-   int port, err;
+   int p, port, err;
struct mlx4_vport_state *vp_admin;
struct mlx4_vport_oper_state *vp_oper;
struct mlx4_slave_state *slave_state =
&priv->mfunc.master.slave_state[slave];
struct mlx4_active_ports actv_ports = mlx4_get_active_ports(
&priv->dev, slave);
-   int min_port = find_first_bit(actv_ports.ports,
- priv->dev.caps.num_ports) + 1;
-   int max_port = min_port - 1 +
-   bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports);
 
-   for (port = min_port; port <= max_port; port++) {
-   if (!test_bit(port - 1, actv_ports.ports))
-   continue;
+   for_each_set_bit(p, actv_ports.ports, priv->dev.caps.num_ports) {
+   port = p + 1;
priv->mfunc.master.vf_oper[slave].smi_enabled[port] =
priv->mfunc.master.vf_admin[slave].enable_smi[port];
vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
@@ -2063,19 +2058,13 @@ static int mlx4_master_activate_admin_state(struct 
mlx4_priv *priv, int slave)
 
 static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int 
slave)
 {
-   int port;
+   int p, port;
struct mlx4_vport_oper_state *vp_oper;
struct mlx4_active_ports actv_ports = mlx4_get_active_ports(
&priv->dev, slave);
-   int min_port = find_first_bit(actv_ports.ports,
- priv->dev.caps.num_ports) + 1;
-   int max_port = min_port - 1 +
-   bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports);
 
-
-   for (port = min_port; port <= max_port; port++) {
-   if (!test_bit(port - 1, actv_ports.ports))
-   continue;
+   for_each_set_bit(p, actv_ports.ports, priv->dev.caps.num_ports) {
+   port = p + 1;
priv->mfunc.master.vf_oper[slave].smi_enabled[port] =
MLX4_VF_SMI_DISABLED;
vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
-- 
2.30.2



[PATCH 04/17] all: replace bitmap_weight with bitmap_empty where appropriate

2021-12-18 Thread Yury Norov
In many cases, kernel code calls bitmap_weight() only to check whether
any bit of a given bitmap is set. It's better to use bitmap_empty() in
that case because bitmap_empty() stops traversing the bitmap as soon as
it finds the first set bit, while bitmap_weight() counts all bits
unconditionally.
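
As an illustrative sketch of the conversion pattern (the caller here is
invented, not taken from this series):

    DECLARE_BITMAP(mask, 1024);

    /* before: unconditionally counts all 16 words of the bitmap */
    if (bitmap_weight(mask, 1024) == 0)
        return;

    /* after: returns as soon as a non-zero word is found */
    if (bitmap_empty(mask, 1024))
        return;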

Signed-off-by: Yury Norov 
---
 arch/nds32/kernel/perf_event_cpu.c  | 2 +-
 arch/x86/kvm/hyperv.c   | 8 
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c| 2 +-
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c| 4 ++--
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c | 4 ++--
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c| 2 +-
 drivers/net/ethernet/qlogic/qed/qed_rdma.c  | 4 ++--
 drivers/net/ethernet/qlogic/qed/qed_roce.c  | 2 +-
 drivers/perf/arm-cci.c  | 2 +-
 drivers/perf/arm_pmu.c  | 4 ++--
 drivers/perf/hisilicon/hisi_uncore_pmu.c| 2 +-
 drivers/perf/xgene_pmu.c| 2 +-
 tools/perf/builtin-c2c.c| 4 ++--
 13 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/arch/nds32/kernel/perf_event_cpu.c 
b/arch/nds32/kernel/perf_event_cpu.c
index a78a879e7ef1..ea44e9ecb5c7 100644
--- a/arch/nds32/kernel/perf_event_cpu.c
+++ b/arch/nds32/kernel/perf_event_cpu.c
@@ -695,7 +695,7 @@ static void nds32_pmu_enable(struct pmu *pmu)
 {
struct nds32_pmu *nds32_pmu = to_nds32_pmu(pmu);
struct pmu_hw_events *hw_events = nds32_pmu->get_hw_events();
-   int enabled = bitmap_weight(hw_events->used_mask,
+   bool enabled = !bitmap_empty(hw_events->used_mask,
nds32_pmu->num_events);
 
if (enabled)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 6e38a7d22e97..2c3400dea4b3 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -90,7 +90,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic 
*synic,
 {
struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
-   int auto_eoi_old, auto_eoi_new;
+   bool auto_eoi_old, auto_eoi_new;
 
if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
return;
@@ -100,16 +100,16 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic 
*synic,
else
__clear_bit(vector, synic->vec_bitmap);
 
-   auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
+   auto_eoi_old = bitmap_empty(synic->auto_eoi_bitmap, 256);
 
if (synic_has_vector_auto_eoi(synic, vector))
__set_bit(vector, synic->auto_eoi_bitmap);
else
__clear_bit(vector, synic->auto_eoi_bitmap);
 
-   auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
+   auto_eoi_new = bitmap_empty(synic->auto_eoi_bitmap, 256);
 
-   if (!!auto_eoi_old == !!auto_eoi_new)
+   if (auto_eoi_old == auto_eoi_new)
return;
 
down_write(&vcpu->kvm->arch.apicv_update_lock);
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c 
b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
index d7fa2c49e741..56a3063545ec 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
@@ -68,7 +68,7 @@ static int smp_request_block(struct mdp5_smp *smp,
uint8_t reserved;
 
/* we shouldn't be requesting blocks for an in-use client: */
-   WARN_ON(bitmap_weight(cs, cnt) > 0);
+   WARN_ON(!bitmap_empty(cs, cnt));
 
reserved = smp->reserved[cid];
 
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c 
b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index 61b2db3342ed..ac0fe04df2e0 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -267,8 +267,8 @@ ice_set_pfe_link(struct ice_vf *vf, struct 
virtchnl_pf_event *pfe,
  */
 static bool ice_vf_has_no_qs_ena(struct ice_vf *vf)
 {
-   return (!bitmap_weight(vf->rxq_ena, ICE_MAX_RSS_QS_PER_VF) &&
-   !bitmap_weight(vf->txq_ena, ICE_MAX_RSS_QS_PER_VF));
+   return (bitmap_empty(vf->rxq_ena, ICE_MAX_RSS_QS_PER_VF) &&
+   bitmap_empty(vf->txq_ena, ICE_MAX_RSS_QS_PER_VF));
 }
 
 /**
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c 
b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
index 77a13fb555fb..80b2d64b4136 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
@@ -353,7 +353,7 @@ int otx2_add_macfilter(struct net_device *netdev, const u8 
*mac)
 {
struct otx2_nic *pf = netdev_priv(netdev);
 
-   if (bitmap_weight(&pf->flow_cfg->dmacflt_bmap,
+   if (!bitmap_empty(&pf->flow_cfg

[PATCH 05/17] all: replace cpumask_weight with cpumask_empty where appropriate

2021-12-18 Thread Yury Norov
In many cases, kernel code calls cpumask_weight() only to check whether
any bit of a given cpumask is set. We can do it more efficiently with
cpumask_empty() because cpumask_empty() stops traversing the cpumask as
soon as it finds the first set bit, while cpumask_weight() counts all
bits unconditionally.

Signed-off-by: Yury Norov 
---
 arch/alpha/kernel/process.c|  2 +-
 arch/ia64/kernel/setup.c   |  2 +-
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 14 +++---
 arch/x86/mm/mmio-mod.c |  2 +-
 arch/x86/platform/uv/uv_nmi.c  |  2 +-
 drivers/cpufreq/qcom-cpufreq-hw.c  |  2 +-
 drivers/cpufreq/scmi-cpufreq.c |  2 +-
 drivers/gpu/drm/i915/i915_pmu.c|  2 +-
 drivers/infiniband/hw/hfi1/affinity.c  |  4 ++--
 drivers/irqchip/irq-bcm6345-l1.c   |  2 +-
 kernel/irq/affinity.c  |  2 +-
 kernel/padata.c|  2 +-
 kernel/rcu/tree_nocb.h |  4 ++--
 kernel/rcu/tree_plugin.h   |  2 +-
 kernel/sched/core.c|  2 +-
 kernel/sched/topology.c|  2 +-
 kernel/time/clocksource.c  |  2 +-
 mm/vmstat.c|  4 ++--
 18 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index f4759e4ee4a9..a4415ad44982 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -125,7 +125,7 @@ common_shutdown_1(void *generic_ptr)
/* Wait for the secondaries to halt. */
set_cpu_present(boot_cpuid, false);
set_cpu_possible(boot_cpuid, false);
-   while (cpumask_weight(cpu_present_mask))
+   while (!cpumask_empty(cpu_present_mask))
barrier();
 #endif
 
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 5010348fa21b..fd6301eafa9d 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -572,7 +572,7 @@ setup_arch (char **cmdline_p)
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
prefill_possible_map();
 #endif
-   per_cpu_scan_finalize((cpumask_weight(&early_cpu_possible_map) == 0 ?
+   per_cpu_scan_finalize((cpumask_empty(&early_cpu_possible_map) ?
32 : cpumask_weight(&early_cpu_possible_map)),
additional_cpus > 0 ? additional_cpus : 0);
 #endif /* CONFIG_ACPI_NUMA */
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c 
b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index b57b3db9a6a7..e23ff03290b8 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, 
cpumask_var_t newmask,
 
/* Check whether cpus belong to parent ctrl group */
cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
-   if (cpumask_weight(tmpmask)) {
+   if (!cpumask_empty(tmpmask)) {
rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to 
parent\n");
return -EINVAL;
}
 
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
-   if (cpumask_weight(tmpmask)) {
+   if (!cpumask_empty(tmpmask)) {
/* Give any dropped cpus to parent rdtgroup */
cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
update_closid_rmid(tmpmask, prgrp);
@@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, 
cpumask_var_t newmask,
 * and update per-cpu rmid
 */
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
-   if (cpumask_weight(tmpmask)) {
+   if (!cpumask_empty(tmpmask)) {
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
if (crgrp == rdtgrp)
@@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, 
cpumask_var_t newmask,
 
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
-   if (cpumask_weight(tmpmask)) {
+   if (!cpumask_empty(tmpmask)) {
/* Can't drop from default group */
if (rdtgrp == &rdtgroup_default) {
rdt_last_cmd_puts("Can't drop CPUs from default 
group\n");
@@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, 
cpumask_var_t newmask,
 * and update per-cpu closid/rmid.
 */
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
-   if (cpumask_weight(tmpmask)) {
+   if (!cpumask_empty(tmpmask)) {
list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
if (r == rdtgrp)
continue;
cpumask_and(tmpmask1, &r->cpu_mask,

[PATCH 06/17] all: replace nodes_weight with nodes_empty where appropriate

2021-12-18 Thread Yury Norov
Kernel code calls nodes_weight() to check whether any bit of a given
nodemask is set. We can do it more efficiently with nodes_empty()
because nodes_empty() stops traversing the nodemask as soon as it finds
the first set bit, while nodes_weight() counts all bits unconditionally.

Signed-off-by: Yury Norov 
---
 arch/x86/mm/amdtopology.c| 2 +-
 arch/x86/mm/numa_emulation.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 058b2f36b3a6..b3ca7d23e4b0 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -154,7 +154,7 @@ int __init amd_numa_init(void)
node_set(nodeid, numa_nodes_parsed);
}
 
-   if (!nodes_weight(numa_nodes_parsed))
+   if (nodes_empty(numa_nodes_parsed))
return -ENOENT;
 
/*
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 1a02b791d273..9a9305367fdd 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -123,7 +123,7 @@ static int __init split_nodes_interleave(struct 
numa_meminfo *ei,
 * Continue to fill physical nodes with fake nodes until there is no
 * memory left on any of them.
 */
-   while (nodes_weight(physnode_mask)) {
+   while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
@@ -270,7 +270,7 @@ static int __init 
split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
 * Fill physical nodes with fake nodes of size until there is no memory
 * left on any of them.
 */
-   while (nodes_weight(physnode_mask)) {
+   while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
-- 
2.30.2



[PATCH 07/17] lib/bitmap: add bitmap_weight_{cmp, eq, gt, ge, lt, le} functions

2021-12-18 Thread Yury Norov
Many kernel users call bitmap_weight() to compare the result against
some number or expression:

if (bitmap_weight(...) > 1)
do_something();

It works OK, but can be significantly improved for large bitmaps: if
the first few words already count more set bits than the given number,
we can stop counting and return immediately.

The same idea works in the other direction: if the number of set bits
counted so far is so small that it would stay below the required number
even if all remaining bits were set, we can stop counting early.

This patch adds a new bitmap_weight_cmp(), as suggested by Michał
Mirosław, and a family of eq, gt, ge, lt and le wrappers to allow this
optimization. The following patches apply the new functions where
appropriate.
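
A hypothetical call-site conversion using the new helpers (the caller
is invented for illustration):

    /* before: counts every set bit, then compares */
    if (bitmap_weight(map, nbits) > 1)
        do_something();

    /* after: can stop scanning once the running count exceeds 1 */
    if (bitmap_weight_gt(map, nbits, 1))
        do_something();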

Suggested-by: "Michał Mirosław"  (for 
bitmap_weight_cmp)
Signed-off-by: Yury Norov 
---
 include/linux/bitmap.h | 80 ++
 lib/bitmap.c   | 21 +++
 2 files changed, 101 insertions(+)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 7dba0847510c..708e57b32362 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -51,6 +51,12 @@ struct device;
  *  bitmap_empty(src, nbits)Are all bits zero in *src?
  *  bitmap_full(src, nbits) Are all bits set in *src?
  *  bitmap_weight(src, nbits)   Hamming Weight: number set bits
+ *  bitmap_weight_cmp(src, nbits)   compare Hamming Weight with a 
number
+ *  bitmap_weight_eq(src, nbits, num)   Hamming Weight == num
+ *  bitmap_weight_gt(src, nbits, num)   Hamming Weight >  num
+ *  bitmap_weight_ge(src, nbits, num)   Hamming Weight >= num
+ *  bitmap_weight_lt(src, nbits, num)   Hamming Weight <  num
+ *  bitmap_weight_le(src, nbits, num)   Hamming Weight <= num
  *  bitmap_set(dst, pos, nbits) Set specified bit area
  *  bitmap_clear(dst, pos, nbits)   Clear specified bit area
  *  bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
@@ -162,6 +168,7 @@ int __bitmap_intersects(const unsigned long *bitmap1,
 int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
 int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
+int __bitmap_weight_cmp(const unsigned long *bitmap, unsigned int bits, int 
num);
 void __bitmap_set(unsigned long *map, unsigned int start, int len);
 void __bitmap_clear(unsigned long *map, unsigned int start, int len);
 
@@ -403,6 +410,79 @@ static __always_inline int bitmap_weight(const unsigned 
long *src, unsigned int
return __bitmap_weight(src, nbits);
 }
 
+/**
+ * bitmap_weight_cmp - compares number of set bits in @src with @num.
+ * @src:   source bitmap
+ * @nbits: length of bitmap in bits
+ * @num:   number to compare with
+ *
+ * As opposite to bitmap_weight() this function doesn't necessarily
+ * traverse full bitmap and may return earlier.
+ *
+ * Returns zero if weight of @src is equal to @num;
+ *negative number if weight of @src is less than @num;
+ *positive number if weight of @src is greater than @num;
+ *
+ * NOTES
+ *
+ * Because number of set bits cannot decrease while counting, when user
+ * wants to know if the number of set bits in the bitmap is less than
+ * @num, calling
+ * bitmap_weight_cmp(..., @num) < 0
+ * is potentially less effective than
+ * bitmap_weight_cmp(..., @num - 1) <= 0
+ *
+ * Consider an example:
+ * bitmap_weight_cmp(1000   , 1) < 0
+ * ^
+ * stop here
+ *
+ * bitmap_weight_cmp(1000   , 0) <= 0
+ *  ^
+ *  stop here
+ */
+static __always_inline
+int bitmap_weight_cmp(const unsigned long *src, unsigned int nbits, int num)
+{
+   if (num > (int)nbits || num < 0)
+   return -num;
+
+   if (small_const_nbits(nbits))
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)) - num;
+
+   return __bitmap_weight_cmp(src, nbits, num);
+}
+
+static __always_inline
+bool bitmap_weight_eq(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num) == 0;
+}
+
+static __always_inline
+bool bitmap_weight_gt(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num) > 0;
+}
+
+static __always_inline
+bool bitmap_weight_ge(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num - 1) > 0;
+}
+
+static __always_inline
+bool bitmap_weight_lt(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num - 1) <= 0;
+}
+
+static __always_inline
+bool bitmap_weight_le(const unsig

[PATCH 08/17] all: replace bitmap_weight with bitmap_weight_{eq, gt, ge, lt, le} where appropriate

2021-12-18 Thread Yury Norov
Kernel code calls bitmap_weight() to compare the weight of a bitmap
with a given number. We can do it more efficiently with
bitmap_weight_{eq, ...} because the conditional bitmap_weight may stop
traversing the bitmap earlier, as soon as the condition is met.

This patch replaces bitmap_weight with the conditional versions where
possible, except for small bitmaps whose size is not configurable and
is known at compile time. In that case the conditional version of
bitmap_weight would bring no benefit due to the small_const_nbits()
optimization, while readability may suffer.
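
For example (an invented snippet, only to illustrate the exception for
small fixed-size bitmaps):

    DECLARE_BITMAP(flags, 16);    /* small_const_nbits(16) is true */

    /*
     * Here bitmap_weight() compiles down to a single hweight_long()
     * on one word, so bitmap_weight_gt(flags, 16, 1) would be no
     * faster and only harder to read -- such call sites are left
     * untouched.
     */
    if (bitmap_weight(flags, 16) > 1)
        do_something();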

Signed-off-by: Yury Norov 
---
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |  2 +-
 drivers/iio/dummy/iio_simple_dummy_buffer.c|  4 ++--
 drivers/iio/industrialio-trigger.c |  2 +-
 drivers/memstick/core/ms_block.c   |  4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c |  2 +-
 .../net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c  |  2 +-
 .../net/ethernet/marvell/octeontx2/nic/otx2_flows.c|  4 ++--
 drivers/net/ethernet/mellanox/mlx4/cmd.c   | 10 +++---
 drivers/net/ethernet/mellanox/mlx4/eq.c|  4 ++--
 drivers/net/ethernet/mellanox/mlx4/fw.c|  4 ++--
 drivers/net/ethernet/mellanox/mlx4/main.c  |  2 +-
 drivers/perf/thunderx2_pmu.c   |  4 ++--
 drivers/staging/media/tegra-video/vi.c |  2 +-
 13 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c 
b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index e23ff03290b8..9d42e592c1cf 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2752,7 +2752,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, 
struct resctrl_schema *s,
 * bitmap_weight() does not access out-of-bound memory.
 */
tmp_cbm = cfg->new_ctrl;
-   if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
+   if (bitmap_weight_lt(&tmp_cbm, r->cache.cbm_len, 
r->cache.min_cbm_bits)) {
rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
return -ENOSPC;
}
diff --git a/drivers/iio/dummy/iio_simple_dummy_buffer.c 
b/drivers/iio/dummy/iio_simple_dummy_buffer.c
index 59aa60d4ca37..cd2470ddf82b 100644
--- a/drivers/iio/dummy/iio_simple_dummy_buffer.c
+++ b/drivers/iio/dummy/iio_simple_dummy_buffer.c
@@ -72,8 +72,8 @@ static irqreturn_t iio_simple_dummy_trigger_h(int irq, void 
*p)
int i, j;
 
for (i = 0, j = 0;
-i < bitmap_weight(indio_dev->active_scan_mask,
-  indio_dev->masklength);
+bitmap_weight_gt(indio_dev->active_scan_mask,
+  indio_dev->masklength, i);
 i++, j++) {
j = find_next_bit(indio_dev->active_scan_mask,
  indio_dev->masklength, j);
diff --git a/drivers/iio/industrialio-trigger.c 
b/drivers/iio/industrialio-trigger.c
index f504ed351b3e..98c54022fecf 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -331,7 +331,7 @@ int iio_trigger_detach_poll_func(struct iio_trigger *trig,
 {
struct iio_dev_opaque *iio_dev_opaque = 
to_iio_dev_opaque(pf->indio_dev);
bool no_other_users =
-   bitmap_weight(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER) == 
1;
+   bitmap_weight_eq(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER, 
1);
int ret = 0;
 
if (trig->ops && trig->ops->set_trigger_state && no_other_users) {
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 0cda6c6baefc..5cdd987e78f7 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -155,8 +155,8 @@ static int msb_validate_used_block_bitmap(struct msb_data 
*msb)
for (i = 0; i < msb->zone_count; i++)
total_free_blocks += msb->free_block_count[i];
 
-   if (msb->block_count - bitmap_weight(msb->used_blocks_bitmap,
-   msb->block_count) == total_free_blocks)
+   if (bitmap_weight_eq(msb->used_blocks_bitmap, msb->block_count,
+   msb->block_count - total_free_blocks))
return 0;
 
pr_err("BUG: free block counts don't match the bitmap");
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 214a38de3f41..35297d8a488b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -246,7 +246,7 @@ int ixgbe_disable_sriov(struct ixgbe_adapter *adapter)
 #endif
 
/* Disable VMDq flag so d

[PATCH 09/17] lib/cpumask: add cpumask_weight_{eq,gt,ge,lt,le}

2021-12-18 Thread Yury Norov
Kernel code calls cpumask_weight() to compare the weight of a cpumask
with a given number. We can do it more efficiently with
cpumask_weight_{eq, ...} because the conditional cpumask_weight may stop
traversing the cpumask earlier, as soon as the condition is met.

Signed-off-by: Yury Norov 
---
 arch/ia64/mm/tlb.c   |  2 +-
 arch/mips/cavium-octeon/octeon-irq.c |  4 +-
 arch/mips/kernel/crash.c |  2 +-
 arch/powerpc/kernel/smp.c|  2 +-
 arch/powerpc/kernel/watchdog.c   |  2 +-
 arch/powerpc/xmon/xmon.c |  4 +-
 arch/s390/kernel/perf_cpum_cf.c  |  2 +-
 arch/x86/kernel/smpboot.c|  4 +-
 drivers/firmware/psci/psci_checker.c |  2 +-
 drivers/hv/channel_mgmt.c|  4 +-
 drivers/infiniband/hw/hfi1/affinity.c|  9 ++---
 drivers/infiniband/hw/qib/qib_file_ops.c |  2 +-
 drivers/infiniband/hw/qib/qib_iba7322.c  |  2 +-
 drivers/scsi/lpfc/lpfc_init.c|  2 +-
 drivers/soc/fsl/qbman/qman_test_stash.c  |  2 +-
 include/linux/cpumask.h  | 50 
 kernel/sched/core.c  |  8 ++--
 kernel/sched/topology.c  |  2 +-
 kernel/time/clockevents.c|  2 +-
 19 files changed, 78 insertions(+), 29 deletions(-)

diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index 135b5135cace..a5bce13ab047 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -332,7 +332,7 @@ __flush_tlb_range (struct vm_area_struct *vma, unsigned 
long start,
 
preempt_disable();
 #ifdef CONFIG_SMP
-   if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) {
+   if (mm != current->active_mm || !cpumask_weight_eq(mm_cpumask(mm), 1)) {
ia64_global_tlb_purge(mm, start, end, nbits);
preempt_enable();
return;
diff --git a/arch/mips/cavium-octeon/octeon-irq.c 
b/arch/mips/cavium-octeon/octeon-irq.c
index 844f882096e6..914871f15fb7 100644
--- a/arch/mips/cavium-octeon/octeon-irq.c
+++ b/arch/mips/cavium-octeon/octeon-irq.c
@@ -763,7 +763,7 @@ static void octeon_irq_cpu_offline_ciu(struct irq_data 
*data)
if (!cpumask_test_cpu(cpu, mask))
return;
 
-   if (cpumask_weight(mask) > 1) {
+   if (cpumask_weight_gt(mask, 1)) {
/*
 * It has multi CPU affinity, just remove this CPU
 * from the affinity set.
@@ -795,7 +795,7 @@ static int octeon_irq_ciu_set_affinity(struct irq_data 
*data,
 * This removes the need to do locking in the .ack/.eoi
 * functions.
 */
-   if (cpumask_weight(dest) != 1)
+   if (!cpumask_weight_eq(dest, 1))
return -EINVAL;
 
if (!enable_one)
diff --git a/arch/mips/kernel/crash.c b/arch/mips/kernel/crash.c
index 81845ba04835..5b690d52491f 100644
--- a/arch/mips/kernel/crash.c
+++ b/arch/mips/kernel/crash.c
@@ -72,7 +72,7 @@ static void crash_kexec_prepare_cpus(void)
 */
pr_emerg("Sending IPI to other cpus...\n");
msecs = 1;
-   while ((cpumask_weight(&cpus_in_crash) < ncpus) && (--msecs > 0)) {
+   while (cpumask_weight_lt(&cpus_in_crash, ncpus) && (--msecs > 0)) {
cpu_relax();
mdelay(1);
}
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c338f9d8ab37..00da2064ddf3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1655,7 +1655,7 @@ void start_secondary(void *unused)
if (has_big_cores)
sibling_mask = cpu_smallcore_mask;
 
-   if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+   if (cpumask_weight_gt(mask, cpumask_weight(sibling_mask(cpu))))
shared_caches = true;
}
 
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index bfc27496fe7e..62937a077de7 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -483,7 +483,7 @@ static void start_watchdog(void *arg)
 
wd_smp_lock(&flags);
cpumask_set_cpu(cpu, &wd_cpus_enabled);
-   if (cpumask_weight(&wd_cpus_enabled) == 1) {
+   if (cpumask_weight_eq(&wd_cpus_enabled, 1)) {
cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
wd_smp_last_reset_tb = get_tb();
}
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index f9ae0b398260..b9e9d0b20a7b 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -469,7 +469,7 @@ static bool wait_for_other_cpus(int ncpus)
 
/* We wait for 2s, which is a metric "little while" */
for (timeout = 2; timeout != 0; --timeout) {
-   if (cpumask_weight(&cpus_in_xmon) >= ncpus)
+   if (cpumask_weight_ge(&cpus_in_xmon, ncpus))
r

[PATCH 10/17] lib/nodemask: add nodemask_weight_{eq,gt,ge,lt,le}

2021-12-18 Thread Yury Norov
Kernel code calls nodes_weight() to compare the weight of a nodemask
with a given number. We can do it more efficiently with
nodes_weight_{eq, ...} because the conditional nodes_weight may stop
traversing the nodemask earlier, as soon as the condition is met.

Signed-off-by: Yury Norov 
---
 drivers/acpi/numa/srat.c |  2 +-
 include/linux/nodemask.h | 35 +++
 mm/mempolicy.c   |  2 +-
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 66a0142dc78c..484b9307f8cc 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -67,7 +67,7 @@ int acpi_map_pxm_to_node(int pxm)
node = pxm_to_node_map[pxm];
 
if (node == NUMA_NO_NODE) {
-   if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
+   if (nodes_weight_ge(nodes_found_map, MAX_NUMNODES))
return NUMA_NO_NODE;
node = first_unset_node(nodes_found_map);
__acpi_map_pxm_to_node(pxm, node);
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 567c3ddba2c4..197598e075e9 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -38,6 +38,11 @@
  * int nodes_empty(mask)   Is mask empty (no bits sets)?
  * int nodes_full(mask)Is mask full (all bits sets)?
  * int nodes_weight(mask)  Hamming weight - number of set bits
+ * bool nodes_weight_eq(src, nbits, num) Hamming Weight is equal to num
+ * bool nodes_weight_gt(src, nbits, num) Hamming Weight is greater than num
+ * bool nodes_weight_ge(src, nbits, num) Hamming Weight is greater than or 
equal to num
+ * bool nodes_weight_lt(src, nbits, num) Hamming Weight is less than num
+ * bool nodes_weight_le(src, nbits, num) Hamming Weight is less than or equal 
to num
  *
  * void nodes_shift_right(dst, src, n) Shift right
  * void nodes_shift_left(dst, src, n)  Shift left
@@ -240,6 +245,36 @@ static inline int __nodes_weight(const nodemask_t *srcp, 
unsigned int nbits)
return bitmap_weight(srcp->bits, nbits);
 }
 
+#define nodes_weight_eq(nodemask, num) __nodes_weight_eq(&(nodemask), 
MAX_NUMNODES, (num))
+static inline int __nodes_weight_eq(const nodemask_t *srcp, unsigned int 
nbits, int num)
+{
+   return bitmap_weight_eq(srcp->bits, nbits, num);
+}
+
+#define nodes_weight_gt(nodemask, num) __nodes_weight_gt(&(nodemask), 
MAX_NUMNODES, (num))
+static inline int __nodes_weight_gt(const nodemask_t *srcp, unsigned int 
nbits, int num)
+{
+   return bitmap_weight_gt(srcp->bits, nbits, num);
+}
+
+#define nodes_weight_ge(nodemask, num) __nodes_weight_ge(&(nodemask), 
MAX_NUMNODES, (num))
+static inline int __nodes_weight_ge(const nodemask_t *srcp, unsigned int 
nbits, int num)
+{
+   return bitmap_weight_ge(srcp->bits, nbits, num);
+}
+
+#define nodes_weight_lt(nodemask, num) __nodes_weight_lt(&(nodemask), 
MAX_NUMNODES, (num))
+static inline int __nodes_weight_lt(const nodemask_t *srcp, unsigned int 
nbits, int num)
+{
+   return bitmap_weight_lt(srcp->bits, nbits, num);
+}
+
+#define nodes_weight_le(nodemask, num) __nodes_weight_le(&(nodemask), 
MAX_NUMNODES, (num))
+static inline int __nodes_weight_le(const nodemask_t *srcp, unsigned int 
nbits, int num)
+{
+   return bitmap_weight_le(srcp->bits, nbits, num);
+}
+
 #define nodes_shift_right(dst, src, n) \
__nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
 static inline void __nodes_shift_right(nodemask_t *dstp,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a86590b2507d..27817cf2f2a0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1157,7 +1157,7 @@ int do_migrate_pages(struct mm_struct *mm, const 
nodemask_t *from,
 *  [0-7] - > [3,4,5] moves only 0,1,2,6,7.
 */
 
-   if ((nodes_weight(*from) != nodes_weight(*to)) &&
+   if (!nodes_weight_eq(*from, nodes_weight(*to)) &&
(node_isset(s, *to)))
continue;
 
-- 
2.30.2



[PATCH 11/17] lib/nodemask: add num_node_state_eq()

2021-12-18 Thread Yury Norov
Kernel code calls num_node_state() to compare the number of nodes in a
given state with a given number. The underlying code calls
bitmap_weight(); we can do it more efficiently with num_node_state_eq()
because the conditional nodes_weight may stop traversing the nodemask
earlier, as soon as the condition is met.

Signed-off-by: Yury Norov 
---
 include/linux/nodemask.h | 5 +
 mm/page_alloc.c  | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 197598e075e9..c5014dbf3cce 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -466,6 +466,11 @@ static inline int num_node_state(enum node_states state)
return nodes_weight(node_states[state]);
 }
 
+static inline int num_node_state_eq(enum node_states state, int num)
+{
+   return nodes_weight_eq(node_states[state], num);
+}
+
 #define for_each_node_state(__node, __state) \
for_each_node_mask((__node), node_states[__state])
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index edfd6c81af82..71f5652828b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8323,7 +8323,7 @@ void __init page_alloc_init(void)
int ret;
 
 #ifdef CONFIG_NUMA
-   if (num_node_state(N_MEMORY) == 1)
+   if (num_node_state_eq(N_MEMORY, 1))
hashdist = 0;
 #endif
 
-- 
2.30.2



[PATCH 12/17] kernel/cpu.c: fix init_cpu_online

2021-12-18 Thread Yury Norov
cpu_online_mask has an associated counter of online CPUs, which must
also be initialized in init_cpu_online().

Fixes: 0c09ab96fc82010 (cpu/hotplug: Cache number of online CPUs)
Signed-off-by: Yury Norov 
---
 kernel/cpu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 407a2568f35e..cd7605204d4d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2616,6 +2616,7 @@ void init_cpu_possible(const struct cpumask *src)
 void init_cpu_online(const struct cpumask *src)
 {
cpumask_copy(&__cpu_online_mask, src);
+   atomic_set(&__num_online_cpus, cpumask_weight(cpu_online_mask));
 }
 
 void set_cpu_online(unsigned int cpu, bool online)
-- 
2.30.2



[PATCH 13/17] kernel/cpu: add num_possible_cpus counter

2021-12-18 Thread Yury Norov
Similarly to the online cpus, the cpu_possible_mask is actively used
in the kernel. This patch adds a counter for possible cpus, so that
users calling num_possible_cpus() get the result immediately, instead
of computing bitmap_weight() on the underlying mask.
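
As the kernel-doc below notes, the counter is only a momentary snapshot
with respect to CPU hotplug. A hedged usage sketch (the caller and the
setup helper are invented for illustration):

    /* fine for a one-off, advisory read */
    pr_info("%u possible CPUs\n", num_possible_cpus());

    /* hold the hotplug lock if the value must stay stable across a
     * sequence of operations on hotplug-affected masks
     */
    unsigned int nr;

    cpus_read_lock();
    nr = num_online_cpus();
    setup_per_cpu_things(nr);    /* hypothetical helper */
    cpus_read_unlock();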

Suggested-by: Nicholas Piggin 
Signed-off-by: Yury Norov 
---
 include/linux/cpumask.h | 30 --
 kernel/cpu.c| 22 ++
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1906e3225737..0be2504d8e4c 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -99,6 +99,7 @@ extern struct cpumask __cpu_dying_mask;
 #define cpu_dying_mask((const struct cpumask *)&__cpu_dying_mask)
 
 extern atomic_t __num_online_cpus;
+extern atomic_t __num_possible_cpus;
 
 extern cpumask_t cpus_booted_once_mask;
 
@@ -870,19 +871,8 @@ void init_cpu_present(const struct cpumask *src);
 void init_cpu_possible(const struct cpumask *src);
 void init_cpu_online(const struct cpumask *src);
 
-static inline void reset_cpu_possible_mask(void)
-{
-   bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS);
-}
-
-static inline void
-set_cpu_possible(unsigned int cpu, bool possible)
-{
-   if (possible)
-   cpumask_set_cpu(cpu, &__cpu_possible_mask);
-   else
-   cpumask_clear_cpu(cpu, &__cpu_possible_mask);
-}
+void set_cpu_possible(unsigned int cpu, bool possible);
+void reset_cpu_possible_mask(void);
 
 static inline void
 set_cpu_present(unsigned int cpu, bool present)
@@ -962,7 +952,19 @@ static inline unsigned int num_online_cpus(void)
 {
return atomic_read(&__num_online_cpus);
 }
-#define num_possible_cpus()cpumask_weight(cpu_possible_mask)
+
+/**
+ * num_possible_cpus() - Read the number of possible CPUs
+ *
+ * Despite the fact that __num_possible_cpus is of type atomic_t, this
+ * interface gives only a momentary snapshot and is not protected against
+ * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
+ * region.
+ */
+static inline unsigned int num_possible_cpus(void)
+{
+   return atomic_read(&__num_possible_cpus);
+}
 #define num_present_cpus() cpumask_weight(cpu_present_mask)
 #define num_active_cpus()  cpumask_weight(cpu_active_mask)
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index cd7605204d4d..a0a815911173 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2583,10 +2583,13 @@ EXPORT_SYMBOL(cpu_all_bits);
 #ifdef CONFIG_INIT_ALL_POSSIBLE
 struct cpumask __cpu_possible_mask __read_mostly
= {CPU_BITS_ALL};
+atomic_t __num_possible_cpus __read_mostly = ATOMIC_INIT(NR_CPUS);
 #else
 struct cpumask __cpu_possible_mask __read_mostly;
+atomic_t __num_possible_cpus __read_mostly;
 #endif
 EXPORT_SYMBOL(__cpu_possible_mask);
+EXPORT_SYMBOL(__num_possible_cpus);
 
 struct cpumask __cpu_online_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_online_mask);
@@ -2611,6 +2614,7 @@ void init_cpu_present(const struct cpumask *src)
 void init_cpu_possible(const struct cpumask *src)
 {
cpumask_copy(&__cpu_possible_mask, src);
+   atomic_set(&__num_possible_cpus, cpumask_weight(cpu_possible_mask));
 }
 
 void init_cpu_online(const struct cpumask *src)
@@ -2640,6 +2644,24 @@ void set_cpu_online(unsigned int cpu, bool online)
}
 }
 
+void reset_cpu_possible_mask(void)
+{
+   bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS);
+   atomic_set(&__num_possible_cpus, 0);
+}
+
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+   if (possible) {
+   if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask))
+   atomic_inc(&__num_possible_cpus);
+   } else {
+   if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask))
+   atomic_dec(&__num_possible_cpus);
+   }
+}
+EXPORT_SYMBOL(set_cpu_possible);
+
 /*
  * Activate the first processor.
  */
-- 
2.30.2



[PATCH 14/17] kernel/cpu: add num_present_cpu counter

2021-12-18 Thread Yury Norov
Similarly to the online cpus, the cpu_present_mask is actively used
in the kernel. This patch adds a counter for present cpus, so that
users calling num_present_cpus() get the result immediately, instead
of computing bitmap_weight() on the mask.

Suggested-by: Nicholas Piggin 
Signed-off-by: Yury Norov 
---
 include/linux/cpumask.h | 25 +++--
 kernel/cpu.c| 16 
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 0be2504d8e4c..c2a9d15e2cbd 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -100,6 +100,7 @@ extern struct cpumask __cpu_dying_mask;
 
 extern atomic_t __num_online_cpus;
 extern atomic_t __num_possible_cpus;
+extern atomic_t __num_present_cpus;
 
 extern cpumask_t cpus_booted_once_mask;
 
@@ -873,15 +874,7 @@ void init_cpu_online(const struct cpumask *src);
 
 void set_cpu_possible(unsigned int cpu, bool possible);
 void reset_cpu_possible_mask(void);
-
-static inline void
-set_cpu_present(unsigned int cpu, bool present)
-{
-   if (present)
-   cpumask_set_cpu(cpu, &__cpu_present_mask);
-   else
-   cpumask_clear_cpu(cpu, &__cpu_present_mask);
-}
+void set_cpu_present(unsigned int cpu, bool present);
 
 void set_cpu_online(unsigned int cpu, bool online);
 
@@ -965,7 +958,19 @@ static inline unsigned int num_possible_cpus(void)
 {
return atomic_read(&__num_possible_cpus);
 }
-#define num_present_cpus() cpumask_weight(cpu_present_mask)
+
+/**
+ * num_present_cpus() - Read the number of present CPUs
+ *
+ * Despite the fact that __num_present_cpus is of type atomic_t, this
+ * interface gives only a momentary snapshot and is not protected against
+ * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
+ * region.
+ */
+static inline unsigned int num_present_cpus(void)
+{
+   return atomic_read(&__num_present_cpus);
+}
 #define num_active_cpus()  cpumask_weight(cpu_active_mask)
 
 static inline bool cpu_online(unsigned int cpu)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a0a815911173..1f7ea1bdde1a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2597,6 +2597,9 @@ EXPORT_SYMBOL(__cpu_online_mask);
 struct cpumask __cpu_present_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_present_mask);
 
+atomic_t __num_present_cpus __read_mostly;
+EXPORT_SYMBOL(__num_present_cpus);
+
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
@@ -2609,6 +2612,7 @@ EXPORT_SYMBOL(__num_online_cpus);
 void init_cpu_present(const struct cpumask *src)
 {
cpumask_copy(&__cpu_present_mask, src);
+   atomic_set(&__num_present_cpus, cpumask_weight(cpu_present_mask));
 }
 
 void init_cpu_possible(const struct cpumask *src)
@@ -2662,6 +2666,18 @@ void set_cpu_possible(unsigned int cpu, bool possible)
 }
 EXPORT_SYMBOL(set_cpu_possible);
 
+void set_cpu_present(unsigned int cpu, bool present)
+{
+   if (present) {
+   if (!cpumask_test_and_set_cpu(cpu, &__cpu_present_mask))
+   atomic_inc(&__num_present_cpus);
+   } else {
+   if (cpumask_test_and_clear_cpu(cpu, &__cpu_present_mask))
+   atomic_dec(&__num_present_cpus);
+   }
+}
+EXPORT_SYMBOL(set_cpu_present);
+
 /*
  * Activate the first processor.
  */
-- 
2.30.2



[PATCH 15/17] kernel/cpu: add num_active_cpu counter

2021-12-18 Thread Yury Norov
Similarly to the online cpus, the cpu_active_mask is actively used
in the kernel. This patch adds a counter for active cpus, so that
users calling num_active_cpus() get the result immediately, instead
of computing bitmap_weight() on the mask.

Suggested-by: Nicholas Piggin 
Signed-off-by: Yury Norov 
---
 include/linux/cpumask.h | 26 +++---
 kernel/cpu.c| 15 +++
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c2a9d15e2cbd..0add872898f8 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -101,6 +101,7 @@ extern struct cpumask __cpu_dying_mask;
 extern atomic_t __num_online_cpus;
 extern atomic_t __num_possible_cpus;
 extern atomic_t __num_present_cpus;
+extern atomic_t __num_active_cpus;
 
 extern cpumask_t cpus_booted_once_mask;
 
@@ -875,17 +876,8 @@ void init_cpu_online(const struct cpumask *src);
 void set_cpu_possible(unsigned int cpu, bool possible);
 void reset_cpu_possible_mask(void);
 void set_cpu_present(unsigned int cpu, bool present);
-
 void set_cpu_online(unsigned int cpu, bool online);
-
-static inline void
-set_cpu_active(unsigned int cpu, bool active)
-{
-   if (active)
-   cpumask_set_cpu(cpu, &__cpu_active_mask);
-   else
-   cpumask_clear_cpu(cpu, &__cpu_active_mask);
-}
+void set_cpu_active(unsigned int cpu, bool active);
 
 static inline void
 set_cpu_dying(unsigned int cpu, bool dying)
@@ -971,7 +963,19 @@ static inline unsigned int num_present_cpus(void)
 {
return atomic_read(&__num_present_cpus);
 }
-#define num_active_cpus()  cpumask_weight(cpu_active_mask)
+
+/**
+ * num_active_cpus() - Read the number of active CPUs
+ *
+ * Despite the fact that __num_active_cpus is of type atomic_t, this
+ * interface gives only a momentary snapshot and is not protected against
+ * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
+ * region.
+ */
+static inline unsigned int num_active_cpus(void)
+{
+   return atomic_read(&__num_active_cpus);
+}
 
 static inline bool cpu_online(unsigned int cpu)
 {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1f7ea1bdde1a..62b411d88810 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2603,6 +2603,9 @@ EXPORT_SYMBOL(__num_present_cpus);
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
+atomic_t __num_active_cpus __read_mostly;
+EXPORT_SYMBOL(__num_active_cpus);
+
 struct cpumask __cpu_dying_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_dying_mask);
 
@@ -2678,6 +2681,18 @@ void set_cpu_present(unsigned int cpu, bool present)
 }
 EXPORT_SYMBOL(set_cpu_present);
 
+void set_cpu_active(unsigned int cpu, bool active)
+{
+   if (active) {
+   if (!cpumask_test_and_set_cpu(cpu, &__cpu_active_mask))
+   atomic_inc(&__num_active_cpus);
+   } else {
+   if (cpumask_test_and_clear_cpu(cpu, &__cpu_active_mask))
+   atomic_dec(&__num_active_cpus);
+   }
+}
+EXPORT_SYMBOL(set_cpu_active);
+
 /*
  * Activate the first processor.
  */
-- 
2.30.2



[PATCH 16/17] tools/bitmap: sync bitmap_weight

2021-12-18 Thread Yury Norov
Pull bitmap_weight_{cmp,eq,gt,ge,lt,le} from the main kernel tree and
use them where applicable.
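
As a worked example of the two early exits in __bitmap_weight_cmp()
below (numbers invented): with bits = 256 and num = 200, if only 30 set
bits have been seen after two of the four 64-bit words, then
30 + 128 < 200, so even a fully-set remainder cannot reach num and the
function returns a negative value without reading the rest. Conversely,
with num = 5 the loop stops as soon as the running count exceeds 5 and
returns a positive value.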

Signed-off-by: Yury Norov 
---
 tools/include/linux/bitmap.h | 44 
 tools/lib/bitmap.c   | 20 
 tools/perf/util/pmu.c|  2 +-
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index ea97804d04d4..e8ae9a85d555 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -12,6 +12,8 @@
unsigned long name[BITS_TO_LONGS(bits)]
 
 int __bitmap_weight(const unsigned long *bitmap, int bits);
+int __bitmap_weight_cmp(const unsigned long *bitmap, unsigned int bits,
+unsigned int num);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 const unsigned long *bitmap2, int bits);
 int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
@@ -68,6 +70,48 @@ static inline int bitmap_weight(const unsigned long *src, 
unsigned int nbits)
return __bitmap_weight(src, nbits);
 }
 
+static __always_inline
+int bitmap_weight_cmp(const unsigned long *src, unsigned int nbits, int num)
+{
+   if (num > (int)nbits || num < 0)
+   return -num;
+
+   if (small_const_nbits(nbits))
+   return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)) - num;
+
+   return __bitmap_weight_cmp(src, nbits, num);
+}
+
+static __always_inline
+bool bitmap_weight_eq(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num) == 0;
+}
+
+static __always_inline
+bool bitmap_weight_gt(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num) > 0;
+}
+
+static __always_inline
+bool bitmap_weight_ge(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num - 1) > 0;
+}
+
+static __always_inline
+bool bitmap_weight_lt(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num - 1) <= 0;
+}
+
+static __always_inline
+bool bitmap_weight_le(const unsigned long *src, unsigned int nbits, int num)
+{
+   return bitmap_weight_cmp(src, nbits, num) <= 0;
+}
+
 static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
 const unsigned long *src2, unsigned int nbits)
 {
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index db466ef7be9d..06e58fee8523 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -18,6 +18,26 @@ int __bitmap_weight(const unsigned long *bitmap, int bits)
return w;
 }
 
+int __bitmap_weight_cmp(const unsigned long *bitmap, unsigned int bits, int 
num)
+{
+   unsigned int k, w, lim = bits / BITS_PER_LONG;
+
+   for (k = 0, w = 0; k < lim; k++) {
+   if (w + bits - k * BITS_PER_LONG < num)
+   goto out;
+
+   w += hweight_long(bitmap[k]);
+
+   if (w > num)
+   goto out;
+   }
+
+   if (bits % BITS_PER_LONG)
+   w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
+out:
+   return w - num;
+}
+
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 const unsigned long *bitmap2, int bits)
 {
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 6ae58406f4fc..015ee1321c7c 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1314,7 +1314,7 @@ static int pmu_config_term(const char *pmu_name,
 */
if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) {
if (term->no_value &&
-   bitmap_weight(format->bits, PERF_PMU_FORMAT_BITS) > 1) {
+   bitmap_weight_gt(format->bits, PERF_PMU_FORMAT_BITS, 1)) {
if (err) {
parse_events_error__handle(err, term->err_val,
   strdup("no value assigned for term"),
-- 
2.30.2



[PATCH 17/17] MAINTAINERS: add cpumask and nodemask files to BITMAP_API

2021-12-18 Thread Yury Norov
The cpumask and nodemask APIs are thin wrappers around the basic bitmap
API, and the corresponding files are not formally maintained. This
patch adds them to the BITMAP_API section, so that the bitmap
maintainers keep a closer eye on them.

Signed-off-by: Yury Norov 
---
 MAINTAINERS | 4 
 1 file changed, 4 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 5964e047bc04..ecd41988c871 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3392,10 +3392,14 @@ R:  Andy Shevchenko 

 R: Rasmus Villemoes 
 S: Maintained
 F: include/linux/bitmap.h
+F: include/linux/cpumask.h
 F: include/linux/find.h
+F: include/linux/nodemask.h
 F: lib/bitmap.c
+F: lib/cpumask.c
 F: lib/find_bit.c
 F: lib/find_bit_benchmark.c
+F: lib/nodemask.c
 F: lib/test_bitmap.c
 F: tools/include/linux/bitmap.h
 F: tools/include/linux/find.h
-- 
2.30.2



Re: [PATCH 01/17] all: don't use bitmap_weight() where possible

2021-12-18 Thread Yury Norov
On Sat, Dec 18, 2021 at 2:16 PM Michał Mirosław  wrote:
>
> On Sat, Dec 18, 2021 at 01:19:57PM -0800, Yury Norov wrote:
> > Don't call bitmap_weight() if the following code can get by
> > without it.
> >
> > Signed-off-by: Yury Norov 
> > ---
> >  drivers/net/dsa/b53/b53_common.c   | 6 +-
> >  drivers/net/ethernet/broadcom/bcmsysport.c | 6 +-
> >  drivers/thermal/intel/intel_powerclamp.c   | 9 +++--
> >  3 files changed, 5 insertions(+), 16 deletions(-)
> [...]
>
> Looks good,

Does it mean Acked-by, Reviewed-by, or something else?

> but I think this needs to be split per subsystem.

What you're asking for goes against the documented rules:

Documentation/process/submitting-patches.rst:

Separate each **logical change** into a separate patch.

For example, if your changes include both bug fixes and performance
enhancements for a single driver, separate those changes into two
or more patches.  If your changes include an API update, and a new
driver which uses that new API, separate those into two patches.

On the other hand, if you make a single change to numerous files,
group those changes into a single patch.  Thus a single logical change
is contained within a single patch.

This is not a dead rule; see for example 96d4f267e40f9 ("Remove
'type' argument from access_ok() function").

Or this: https://lkml.org/lkml/2021/6/14/1736

Thanks,
Yury


[PATCH 39/54] arch/powerpc: replace cpumask_weight with cpumask_weight_{eq, ...} where appropriate

2022-01-23 Thread Yury Norov
PowerPC code uses cpumask_weight() to compare the weight of a cpumask
with a given number. We can do it more efficiently with
cpumask_weight_{eq, ...} because the conditional cpumask_weight may stop
traversing the cpumask earlier, as soon as the condition is met.

Signed-off-by: Yury Norov 
---
 arch/powerpc/kernel/smp.c  | 2 +-
 arch/powerpc/kernel/watchdog.c | 2 +-
 arch/powerpc/xmon/xmon.c   | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b7fd6a72aa76..8bff748df402 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1656,7 +1656,7 @@ void start_secondary(void *unused)
if (has_big_cores)
sibling_mask = cpu_smallcore_mask;
 
-   if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+   if (cpumask_weight_gt(mask, cpumask_weight(sibling_mask(cpu))))
shared_caches = true;
}
 
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index bfc27496fe7e..62937a077de7 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -483,7 +483,7 @@ static void start_watchdog(void *arg)
 
wd_smp_lock(&flags);
cpumask_set_cpu(cpu, &wd_cpus_enabled);
-   if (cpumask_weight(&wd_cpus_enabled) == 1) {
+   if (cpumask_weight_eq(&wd_cpus_enabled, 1)) {
cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
wd_smp_last_reset_tb = get_tb();
}
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index fd72753e8ad5..b423812e94e0 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -469,7 +469,7 @@ static bool wait_for_other_cpus(int ncpus)
 
/* We wait for 2s, which is a metric "little while" */
for (timeout = 2; timeout != 0; --timeout) {
-   if (cpumask_weight(&cpus_in_xmon) >= ncpus)
+   if (cpumask_weight_ge(&cpus_in_xmon, ncpus))
return true;
udelay(100);
barrier();
@@ -1338,7 +1338,7 @@ static int cpu_cmd(void)
case 'S':
case 't':
cpumask_copy(&xmon_batch_cpus, &cpus_in_xmon);
-   if (cpumask_weight(&xmon_batch_cpus) <= 1) {
+   if (cpumask_weight_le(&xmon_batch_cpus, 1)) {
printf("There are no other cpus in 
xmon\n");
break;
}
-- 
2.30.2



[PATCH 46/54] soc: replace cpumask_weight with cpumask_weight_lt

2022-01-23 Thread Yury Norov
qman_test_stash() calls cpumask_weight() to compare the weight of a
cpumask with a given number. We can do it more efficiently with
cpumask_weight_lt because the conditional cpumask_weight may stop
traversing the cpumask earlier, as soon as the condition is met.

Signed-off-by: Yury Norov 
---
 drivers/soc/fsl/qbman/qman_test_stash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qbman/qman_test_stash.c 
b/drivers/soc/fsl/qbman/qman_test_stash.c
index b7e8e5ec884c..28b08568a349 100644
--- a/drivers/soc/fsl/qbman/qman_test_stash.c
+++ b/drivers/soc/fsl/qbman/qman_test_stash.c
@@ -561,7 +561,7 @@ int qman_test_stash(void)
 {
int err;
 
-   if (cpumask_weight(cpu_online_mask) < 2) {
+   if (cpumask_weight_lt(cpu_online_mask, 2)) {
pr_info("%s(): skip - only 1 CPU\n", __func__);
return 0;
}
-- 
2.30.2



[PATCH 38/49] arch/powerpc: replace cpumask_weight with cpumask_weight_{eq, ...} where appropriate

2022-02-10 Thread Yury Norov
PowerPC code uses cpumask_weight() to compare the weight of a cpumask
with a given number. We can do it more efficiently with
cpumask_weight_{eq, ...} because the conditional cpumask_weight may stop
traversing the cpumask earlier, as soon as the condition is (or cannot
be) met.

Signed-off-by: Yury Norov 
---
 arch/powerpc/kernel/smp.c  | 2 +-
 arch/powerpc/kernel/watchdog.c | 2 +-
 arch/powerpc/xmon/xmon.c   | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b7fd6a72aa76..8bff748df402 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1656,7 +1656,7 @@ void start_secondary(void *unused)
if (has_big_cores)
sibling_mask = cpu_smallcore_mask;
 
-   if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+   if (cpumask_weight_gt(mask, cpumask_weight(sibling_mask(cpu))))
shared_caches = true;
}
 
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index bfc27496fe7e..62937a077de7 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -483,7 +483,7 @@ static void start_watchdog(void *arg)
 
wd_smp_lock(&flags);
cpumask_set_cpu(cpu, &wd_cpus_enabled);
-   if (cpumask_weight(&wd_cpus_enabled) == 1) {
+   if (cpumask_weight_eq(&wd_cpus_enabled, 1)) {
cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
wd_smp_last_reset_tb = get_tb();
}
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index fd72753e8ad5..b423812e94e0 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -469,7 +469,7 @@ static bool wait_for_other_cpus(int ncpus)
 
/* We wait for 2s, which is a metric "little while" */
for (timeout = 2; timeout != 0; --timeout) {
-   if (cpumask_weight(&cpus_in_xmon) >= ncpus)
+   if (cpumask_weight_ge(&cpus_in_xmon, ncpus))
return true;
udelay(100);
barrier();
@@ -1338,7 +1338,7 @@ static int cpu_cmd(void)
case 'S':
case 't':
cpumask_copy(&xmon_batch_cpus, &cpus_in_xmon);
-   if (cpumask_weight(&xmon_batch_cpus) <= 1) {
+   if (cpumask_weight_le(&xmon_batch_cpus, 1)) {
printf("There are no other cpus in 
xmon\n");
break;
}
-- 
2.32.0



[PATCH 43/49] soc/qman: replace cpumask_weight with cpumask_weight_lt

2022-02-10 Thread Yury Norov
qman_test_stash() calls cpumask_weight() to compare the weight of a
cpumask with a given number. We can do it more efficiently with
cpumask_weight_lt because the conditional cpumask_weight may stop
traversing the cpumask earlier, as soon as the condition is (or cannot
be) met.

Signed-off-by: Yury Norov 
---
 drivers/soc/fsl/qbman/qman_test_stash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qbman/qman_test_stash.c 
b/drivers/soc/fsl/qbman/qman_test_stash.c
index b7e8e5ec884c..28b08568a349 100644
--- a/drivers/soc/fsl/qbman/qman_test_stash.c
+++ b/drivers/soc/fsl/qbman/qman_test_stash.c
@@ -561,7 +561,7 @@ int qman_test_stash(void)
 {
int err;
 
-   if (cpumask_weight(cpu_online_mask) < 2) {
+   if (cpumask_weight_lt(cpu_online_mask, 2)) {
pr_info("%s(): skip - only 1 CPU\n", __func__);
return 0;
}
-- 
2.32.0



Re: [PATCH] ptrace: Add compat PTRACE_{G,S}ETSIGMASK handlers

2017-07-10 Thread Yury Norov
On Thu, Jun 29, 2017 at 05:26:37PM +0100, James Morse wrote:
> compat_ptrace_request() lacks handlers for PTRACE_{G,S}ETSIGMASK,
> instead using those in ptrace_request(). The compat variant should
> read a compat_sigset_t from userspace instead of ptrace_request()s
> sigset_t.
> 
> While compat_sigset_t is the same size as sigset_t, it is defined as
> 2xu32, instead of a single u64. On a big-endian CPU this means that
> compat_sigset_t is passed to user-space using middle-endianness,
> where the least-significant u32 is written most significant byte
> first.
> 
> If ptrace_request()s code is used userspace will read the most
> significant u32 where it expected the least significant.
> 
> Instead of duplicating ptrace_request()s code as a special case in
> the arch code, handle it here.
 
Hi James,

I tested arm64/ilp32 on top of this patch, and everything is fine.
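
For anyone following along, here is a minimal userspace sketch of the layout
problem being fixed, using simplified stand-in types and a made-up bitmask
(not the real kernel structures):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-ins: sigset_t is one u64, compat_sigset_t is 2 x u32. */
struct native_set { uint64_t sig; };
struct compat_set { uint32_t sig[2]; };

int main(void)
{
	/* Only bits 0..31 set, i.e. the least-significant half. */
	struct native_set n = { .sig = 0x00000000deadbeefULL };
	struct compat_set c;

	/* Reusing the native handler amounts to a raw 8-byte copy. */
	memcpy(&c, &n, sizeof(c));

	/*
	 * Little-endian: sig[0] == 0xdeadbeef, as compat userspace expects.
	 * Big-endian:    sig[0] == 0 and sig[1] == 0xdeadbeef - the halves
	 * land swapped, which is what the dedicated compat handlers avoid
	 * by converting explicitly.
	 */
	printf("sig[0]=%#" PRIx32 " sig[1]=%#" PRIx32 "\n", c.sig[0], c.sig[1]);
	return 0;
}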

Yury

Acked-by: Yury Norov 

> CC: Yury Norov 
> CC: Andrey Vagin 
> Reported-by: Zhou Chengming 
> Signed-off-by: James Morse 
> Fixes: 29000caecbe87 ("ptrace: add ability to get/set signal-blocked mask")
> ---
> LTP test case here:
> https://lists.linux.it/pipermail/ltp/2017-June/004932.html
> 
>  kernel/ptrace.c | 52 
>  1 file changed, 40 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 8d2c10714530..a5bebb6713e8 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -843,6 +843,22 @@ static int ptrace_regset(struct task_struct *task, int 
> req, unsigned int type,
>  EXPORT_SYMBOL_GPL(task_user_regset_view);
>  #endif
>  
> +static int ptrace_setsigmask(struct task_struct *child, sigset_t *new_set)
> +{
> + sigdelsetmask(new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
> +
> + /*
> +  * Every thread does recalc_sigpending() after resume, so
> +  * retarget_shared_pending() and recalc_sigpending() are not
> +  * called here.
> +  */
> + spin_lock_irq(&child->sighand->siglock);
> + child->blocked = *new_set;
> + spin_unlock_irq(&child->sighand->siglock);
> +
> + return 0;
> +}
> +
>  int ptrace_request(struct task_struct *child, long request,
>  unsigned long addr, unsigned long data)
>  {
> @@ -914,18 +930,7 @@ int ptrace_request(struct task_struct *child, long 
> request,
>   break;
>   }
>  
> - sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
> -
> - /*
> -  * Every thread does recalc_sigpending() after resume, so
> -  * retarget_shared_pending() and recalc_sigpending() are not
> -  * called here.
> -  */
> - spin_lock_irq(&child->sighand->siglock);
> - child->blocked = new_set;
> - spin_unlock_irq(&child->sighand->siglock);
> -
> - ret = 0;
> + ret = ptrace_setsigmask(child, &new_set);
>   break;
>   }
>  
> @@ -1149,7 +1154,9 @@ int compat_ptrace_request(struct task_struct *child, 
> compat_long_t request,
> compat_ulong_t addr, compat_ulong_t data)
>  {
>   compat_ulong_t __user *datap = compat_ptr(data);
> + compat_sigset_t set32;
>   compat_ulong_t word;
> + sigset_t new_set;
>   siginfo_t siginfo;
>   int ret;
>  
> @@ -1189,6 +1196,27 @@ int compat_ptrace_request(struct task_struct *child, 
> compat_long_t request,
>   else
>   ret = ptrace_setsiginfo(child, &siginfo);
>   break;
> + case PTRACE_GETSIGMASK:
> + if (addr != sizeof(compat_sigset_t))
> + return -EINVAL;
> +
> + sigset_to_compat(&set32, &child->blocked);
> +
> + if (copy_to_user(datap, &set32, sizeof(set32)))
> + return -EFAULT;
> +
> + ret = 0;
> + break;
> + case PTRACE_SETSIGMASK:
> + if (addr != sizeof(compat_sigset_t))
> + return -EINVAL;
> +
> + if (copy_from_user(&set32, datap, sizeof(compat_sigset_t)))
> + return -EFAULT;
> +
> + sigset_from_compat(&new_set, &set32);
> + ret = ptrace_setsigmask(child, &new_set);
> + break;
>  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
>   case PTRACE_GETREGSET:
>   case PTRACE_SETREGSET:
> -- 
> 2.11.0


Re: [PATCH] ptrace: Add compat PTRACE_{G,S}ETSIGMASK handlers

2017-10-13 Thread Yury Norov
Hi James, all,

(add linux-...@vger.kernel.org as it is user-visible,
Catalin Marinas and Arnd Bergmann )

On Thu, Jun 29, 2017 at 05:26:37PM +0100, James Morse wrote:
> compat_ptrace_request() lacks handlers for PTRACE_{G,S}ETSIGMASK,
> instead using those in ptrace_request(). The compat variant should
> read a compat_sigset_t from userspace instead of ptrace_request()s
> sigset_t.
> 
> While compat_sigset_t is the same size as sigset_t, it is defined as
> 2xu32, instead of a single u64. On a big-endian CPU this means that
> compat_sigset_t is passed to user-space using middle-endianness,
> where the least-significant u32 is written most significant byte
> first.
> 
> If ptrace_request()s code is used userspace will read the most
> significant u32 where it expected the least significant.
> 
> Instead of duplicating ptrace_request()s code as a special case in
> the arch code, handle it here.
> 
> CC: Yury Norov 
> CC: Andrey Vagin 
> Reported-by: Zhou Chengming 
> Signed-off-by: James Morse 
> Fixes: 29000caecbe87 ("ptrace: add ability to get/set signal-blocked mask")
> ---
> LTP test case here:
> https://lists.linux.it/pipermail/ltp/2017-June/004932.html

This patch relies on sigset_{to,from}_compat(), which was recently proposed
for removal from the kernel. That change is in linux-next, and it breaks the
build of the kernel with this patch applied. Below is the updated version.

I'd like to ask here again: do we need this change? The patch is correct,
but it changes the ptrace API for compat big-endian architectures. That
would normally stop us from pulling it, but there are seemingly no users of
the API in the wild, so it will break nothing.

The problem was originally reported by Zhou Chengming for BE arm64/ilp32.
I would like to see arm64/ilp32 working correctly in this case, and
developers of other new architectures probably would too.

Regarding arm64/ilp32, we have agreed ABI, and 4.12 and 4.13 kernels
have this change:
https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/log/?h=staging/ilp32-4.12
https://github.com/norov/linux/tree/ilp32-4.13

So I see 3 ways to proceed with this:
1. Drop the patch and remove it from arm64/ilp32;
2. Apply the patch as is;
3. Introduce a new config option like ARCH_PTRACE_COMPAT_BE_SWAP_SIGMASK,
   enable it by default and disable it explicitly for existing compat BE
   architectures.

I would choose 2 or 3 depending on what maintainers of existing
architectures think.

Yury

Signed-off-by: Yury Norov 
---
 kernel/ptrace.c | 52 
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 84b1367935e4..1af47a33768e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -880,6 +880,22 @@ static int ptrace_regset(struct task_struct *task, int 
req, unsigned int type,
 EXPORT_SYMBOL_GPL(task_user_regset_view);
 #endif
 
+static int ptrace_setsigmask(struct task_struct *child, sigset_t *new_set)
+{
+   sigdelsetmask(new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+   /*
+* Every thread does recalc_sigpending() after resume, so
+* retarget_shared_pending() and recalc_sigpending() are not
+* called here.
+*/
+   spin_lock_irq(&child->sighand->siglock);
+   child->blocked = *new_set;
+   spin_unlock_irq(&child->sighand->siglock);
+
+   return 0;
+}
+
 int ptrace_request(struct task_struct *child, long request,
   unsigned long addr, unsigned long data)
 {
@@ -951,18 +967,7 @@ int ptrace_request(struct task_struct *child, long request,
break;
}
 
-   sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
-   /*
-* Every thread does recalc_sigpending() after resume, so
-* retarget_shared_pending() and recalc_sigpending() are not
-* called here.
-*/
-   spin_lock_irq(&child->sighand->siglock);
-   child->blocked = new_set;
-   spin_unlock_irq(&child->sighand->siglock);
-
-   ret = 0;
+   ret = ptrace_setsigmask(child, &new_set);
break;
}
 
@@ -1192,6 +1197,7 @@ int compat_ptrace_request(struct task_struct *child, 
compat_long_t request,
 {
compat_ulong_t __user *datap = compat_ptr(data);
compat_ulong_t word;
+   sigset_t new_set;
siginfo_t siginfo;
int ret;
 
@@ -1233,6 +1239,28 @@ int compat_ptrace_request(struct task_struct *child, 
compat_long_t request,
else
ret = ptrace_setsiginfo(child, &siginfo);
break;
+   case PTRACE_GETSIGMASK:
+   if (addr != sizeof(compat_sigset_t))
+   return -EINVAL;
+
+   ret

[PATCH 0/2] smp: don't kick CPUs running idle or nohz_full tasks

2018-03-25 Thread Yury Norov
kick_all_cpus_sync() is used to broadcast IPIs to all online CPUs to force
them to sync caches, TLBs etc. It is called only 3 times - from mm/slab,
arm64 and powerpc code.

With the framework introduced in patch b8c17e6664c46 ("rcu: Maintain special
bits at bottom of ->dynticks counter") we can delay synchronization work
for CPUs in extended quiescent state (idle or nohz_full userspace).

As Paul E. McKenney wrote: 

--

Currently, IPIs are used to force other CPUs to invalidate their TLBs
in response to a kernel virtual-memory mapping change.  This works, but 
degrades both battery lifetime (for idle CPUs) and real-time response
(for nohz_full CPUs), and in addition results in unnecessary IPIs due to
the fact that CPUs executing in usermode are unaffected by stale kernel
mappings.  It would be better to cause a CPU executing in usermode to
wait until it is entering kernel mode to do the flush, first to avoid
interrupting usemode tasks and second to handle multiple flush requests
with a single flush in the case of a long-running user task.

--

For mm/slab and arm64 it looks safe to delay synchronization. This is done
in patch #2 by introducing the kick_active_cpus_sync() function. For powerpc
I'm not sure, and I'd like to ask the powerpc people: is it safe to do the
same for that code as well? If so, we can drop kick_all_cpus_sync() completely.

Yury Norov (2):
  rcu: declare rcu_eqs_special_set() in public header
  smp: introduce kick_active_cpus_sync()

 arch/arm64/kernel/insn.c |  2 +-
 include/linux/rcutree.h  |  1 +
 include/linux/smp.h  |  2 ++
 kernel/smp.c | 24 
 mm/slab.c|  2 +-
 5 files changed, 29 insertions(+), 2 deletions(-)

-- 
2.14.1



[PATCH 1/2] rcu: declare rcu_eqs_special_set() in public header

2018-03-25 Thread Yury Norov
rcu_eqs_special_set() is declared only in the internal header
kernel/rcu/tree.h and stubbed in include/linux/rcutiny.h.

This patch declares rcu_eqs_special_set() in include/linux/rcutree.h, so
it can be used in non-RCU kernel code.

Signed-off-by: Yury Norov 
---
 include/linux/rcutree.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index fd996cdf1833..448f20f27396 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -74,6 +74,7 @@ static inline void synchronize_rcu_bh_expedited(void)
 void rcu_barrier(void);
 void rcu_barrier_bh(void);
 void rcu_barrier_sched(void);
+bool rcu_eqs_special_set(int cpu);
 unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
 unsigned long get_state_synchronize_sched(void);
-- 
2.14.1



[PATCH 2/2] smp: introduce kick_active_cpus_sync()

2018-03-25 Thread Yury Norov
kick_all_cpus_sync() forces all CPUs to sync caches by sending a broadcast IPI.
If a CPU is in extended quiescent state (idle task or nohz_full userspace), this
work may be done at the exit of that state. Delaying synchronization helps to
save power if the CPU is idle and decreases latency for real-time tasks.

This patch introduces kick_active_cpus_sync() and uses it in mm/slab and arm64
code to delay synchronization.

For task isolation (https://lkml.org/lkml/2017/11/3/589), an IPI to the CPU
running an isolated task would be fatal, as it breaks isolation. Delaying the
synchronization work helps to maintain the isolated state.

I've tested it with the test from the task isolation series on ThunderX2 for
more than 10 hours (10k giga-ticks) without breaking isolation.

Signed-off-by: Yury Norov 
---
 arch/arm64/kernel/insn.c |  2 +-
 include/linux/smp.h  |  2 ++
 kernel/smp.c | 24 
 mm/slab.c|  2 +-
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 2718a77da165..9d7c492e920e 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -291,7 +291,7 @@ int __kprobes aarch64_insn_patch_text(void *addrs[], u32 
insns[], int cnt)
 * synchronization.
 */
ret = aarch64_insn_patch_text_nosync(addrs[0], 
insns[0]);
-   kick_all_cpus_sync();
+   kick_active_cpus_sync();
return ret;
}
}
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9fb239e12b82..27215e22240d 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -105,6 +105,7 @@ int smp_call_function_any(const struct cpumask *mask,
  smp_call_func_t func, void *info, int wait);
 
 void kick_all_cpus_sync(void);
+void kick_active_cpus_sync(void);
 void wake_up_all_idle_cpus(void);
 
 /*
@@ -161,6 +162,7 @@ smp_call_function_any(const struct cpumask *mask, 
smp_call_func_t func,
 }
 
 static inline void kick_all_cpus_sync(void) {  }
+static inline void kick_active_cpus_sync(void) {  }
 static inline void wake_up_all_idle_cpus(void) {  }
 
 #ifdef CONFIG_UP_LATE_INIT
diff --git a/kernel/smp.c b/kernel/smp.c
index 084c8b3a2681..0358d6673850 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -724,6 +724,30 @@ void kick_all_cpus_sync(void)
 }
 EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
 
+/**
+ * kick_active_cpus_sync - Force CPUs that are not in extended
+ * quiescent state (idle or nohz_full userspace) sync by sending
+ * IPI. Extended quiescent state CPUs will sync at the exit of
+ * that state.
+ */
+void kick_active_cpus_sync(void)
+{
+   int cpu;
+   struct cpumask kernel_cpus;
+
+   smp_mb();
+
+   cpumask_clear(&kernel_cpus);
+   preempt_disable();
+   for_each_online_cpu(cpu) {
+   if (!rcu_eqs_special_set(cpu))
+   cpumask_set_cpu(cpu, &kernel_cpus);
+   }
+   smp_call_function_many(&kernel_cpus, do_nothing, NULL, 1);
+   preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_active_cpus_sync);
+
 /**
  * wake_up_all_idle_cpus - break all cpus out of idle
  * wake_up_all_idle_cpus try to break all cpus which is in idle state even
diff --git a/mm/slab.c b/mm/slab.c
index 324446621b3e..678d5dbd6f46 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3856,7 +3856,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, 
int limit,
 * cpus, so skip the IPIs.
 */
if (prev)
-   kick_all_cpus_sync();
+   kick_active_cpus_sync();
 
check_irq_on();
cachep->batchcount = batchcount;
-- 
2.14.1



Re: [PATCH 1/2] rcu: declare rcu_eqs_special_set() in public header

2018-03-25 Thread Yury Norov
On Sun, Mar 25, 2018 at 12:12:43PM -0700, Paul E. McKenney wrote:
> On Sun, Mar 25, 2018 at 08:50:03PM +0300, Yury Norov wrote:
> > rcu_eqs_special_set() is declared only in internal header
> > kernel/rcu/tree.h and stubbed in include/linux/rcutiny.h.
> > 
> > This patch declares rcu_eqs_special_set() in include/linux/rcutree.h, so
> > it can be used in non-rcu kernel code.
> > 
> > Signed-off-by: Yury Norov 
> > ---
> >  include/linux/rcutree.h | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
> > index fd996cdf1833..448f20f27396 100644
> > --- a/include/linux/rcutree.h
> > +++ b/include/linux/rcutree.h
> > @@ -74,6 +74,7 @@ static inline void synchronize_rcu_bh_expedited(void)
> >  void rcu_barrier(void);
> >  void rcu_barrier_bh(void);
> >  void rcu_barrier_sched(void);
> > +bool rcu_eqs_special_set(int cpu);
> >  unsigned long get_state_synchronize_rcu(void);
> >  void cond_synchronize_rcu(unsigned long oldstate);
> >  unsigned long get_state_synchronize_sched(void);
> 
> Good point, a bit hard to use otherwise.  ;-)
> 
> I removed the declaration from rcutree.h and updated the commit log as
> follows.  Does it look OK?
 
Of course.

Thanks,
Yury
 
> 
> 
> commit 4497105b718a819072d48a675916d9d200b5327f
> Author: Yury Norov 
> Date:   Sun Mar 25 20:50:03 2018 +0300
> 
> rcu: Declare rcu_eqs_special_set() in public header
> 
> Because rcu_eqs_special_set() is declared only in internal header
> kernel/rcu/tree.h and stubbed in include/linux/rcutiny.h, it is
> inaccessible outside of the RCU implementation.  This patch therefore
> moves the  rcu_eqs_special_set() declaration to include/linux/rcutree.h,
> which allows it to be used in non-rcu kernel code.
> 
> Signed-off-by: Yury Norov 
> Signed-off-by: Paul E. McKenney 
> 
> diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
> index fd996cdf1833..448f20f27396 100644
> --- a/include/linux/rcutree.h
> +++ b/include/linux/rcutree.h
> @@ -74,6 +74,7 @@ static inline void synchronize_rcu_bh_expedited(void)
>  void rcu_barrier(void);
>  void rcu_barrier_bh(void);
>  void rcu_barrier_sched(void);
> +bool rcu_eqs_special_set(int cpu);
>  unsigned long get_state_synchronize_rcu(void);
>  void cond_synchronize_rcu(unsigned long oldstate);
>  unsigned long get_state_synchronize_sched(void);
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index 59ad0e23c722..d5f617aaa744 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -415,7 +415,6 @@ extern struct rcu_state rcu_preempt_state;
>  #endif /* #ifdef CONFIG_PREEMPT_RCU */
>  
>  int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
> -bool rcu_eqs_special_set(int cpu);
>  
>  #ifdef CONFIG_RCU_BOOST
>  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);


Re: [PATCH 2/2] smp: introduce kick_active_cpus_sync()

2018-03-25 Thread Yury Norov
On Sun, Mar 25, 2018 at 12:23:28PM -0700, Paul E. McKenney wrote:
> On Sun, Mar 25, 2018 at 08:50:04PM +0300, Yury Norov wrote:
> > kick_all_cpus_sync() forces all CPUs to sync caches by sending broadcast 
> > IPI.
> > If CPU is in extended quiescent state (idle task or nohz_full userspace), 
> > this
> > work may be done at the exit of this state. Delaying synchronization helps 
> > to
> > save power if CPU is in idle state and decrease latency for real-time tasks.
> > 
> > This patch introduces kick_active_cpus_sync() and uses it in mm/slab and 
> > arm64
> > code to delay syncronization.
> > 
> > For task isolation (https://lkml.org/lkml/2017/11/3/589), IPI to the CPU 
> > running
> > isolated task would be fatal, as it breaks isolation. The approach with 
> > delaying
> > of synchronization work helps to maintain isolated state.
> > 
> > I've tested it with test from task isolation series on ThunderX2 for more 
> > than
> > 10 hours (10k giga-ticks) without breaking isolation.
> > 
> > Signed-off-by: Yury Norov 
> > ---
> >  arch/arm64/kernel/insn.c |  2 +-
> >  include/linux/smp.h  |  2 ++
> >  kernel/smp.c | 24 
> >  mm/slab.c|  2 +-
> >  4 files changed, 28 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
> > index 2718a77da165..9d7c492e920e 100644
> > --- a/arch/arm64/kernel/insn.c
> > +++ b/arch/arm64/kernel/insn.c
> > @@ -291,7 +291,7 @@ int __kprobes aarch64_insn_patch_text(void *addrs[], 
> > u32 insns[], int cnt)
> >  * synchronization.
> >  */
> > ret = aarch64_insn_patch_text_nosync(addrs[0], 
> > insns[0]);
> > -   kick_all_cpus_sync();
> > +   kick_active_cpus_sync();
> > return ret;
> > }
> > }
> > diff --git a/include/linux/smp.h b/include/linux/smp.h
> > index 9fb239e12b82..27215e22240d 100644
> > --- a/include/linux/smp.h
> > +++ b/include/linux/smp.h
> > @@ -105,6 +105,7 @@ int smp_call_function_any(const struct cpumask *mask,
> >   smp_call_func_t func, void *info, int wait);
> > 
> >  void kick_all_cpus_sync(void);
> > +void kick_active_cpus_sync(void);
> >  void wake_up_all_idle_cpus(void);
> > 
> >  /*
> > @@ -161,6 +162,7 @@ smp_call_function_any(const struct cpumask *mask, 
> > smp_call_func_t func,
> >  }
> > 
> >  static inline void kick_all_cpus_sync(void) {  }
> > +static inline void kick_active_cpus_sync(void) {  }
> >  static inline void wake_up_all_idle_cpus(void) {  }
> > 
> >  #ifdef CONFIG_UP_LATE_INIT
> > diff --git a/kernel/smp.c b/kernel/smp.c
> > index 084c8b3a2681..0358d6673850 100644
> > --- a/kernel/smp.c
> > +++ b/kernel/smp.c
> > @@ -724,6 +724,30 @@ void kick_all_cpus_sync(void)
> >  }
> >  EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
> > 
> > +/**
> > + * kick_active_cpus_sync - Force CPUs that are not in extended
> > + * quiescent state (idle or nohz_full userspace) sync by sending
> > + * IPI. Extended quiescent state CPUs will sync at the exit of
> > + * that state.
> > + */
> > +void kick_active_cpus_sync(void)
> > +{
> > +   int cpu;
> > +   struct cpumask kernel_cpus;
> > +
> > +   smp_mb();
> > +
> > +   cpumask_clear(&kernel_cpus);
> > +   preempt_disable();
> > +   for_each_online_cpu(cpu) {
> > +   if (!rcu_eqs_special_set(cpu))
> 
> If we get here, the CPU is not in a quiescent state, so we therefore
> must IPI it, correct?
> 
> But don't you also need to define rcu_eqs_special_exit() so that RCU
> can invoke it when it next leaves its quiescent state?  Or are you able
> to ignore the CPU in that case?  (If you are able to ignore the CPU in
> that case, I could give you a lower-cost function to get your job done.)
> 
>   Thanx, Paul

What's actually needed for synchronization is issuing a memory barrier on the
target CPUs before we start executing kernel code.

smp_mb() is implicitly called in the smp_call_function*() path for that. In
the rcu_eqs_special_set() -> rcu_dynticks_eqs_exit() path,
smp_mb__after_atomic() is called just before rcu_eqs_special_exit().

So I think rcu_eqs_special_exit() may be left untouched. The empty
rcu_eqs_special_exit() in the new RCU path corresponds to the empty
do_nothing() in the old IPI path.

Or my understanding of 

Re: [PATCH 2/2] smp: introduce kick_active_cpus_sync()

2018-03-28 Thread Yury Norov
On Tue, Mar 27, 2018 at 11:21:17AM +0100, Will Deacon wrote:
> On Sun, Mar 25, 2018 at 08:50:04PM +0300, Yury Norov wrote:
> > kick_all_cpus_sync() forces all CPUs to sync caches by sending broadcast 
> > IPI.
> > If CPU is in extended quiescent state (idle task or nohz_full userspace), 
> > this
> > work may be done at the exit of this state. Delaying synchronization helps 
> > to
> > save power if CPU is in idle state and decrease latency for real-time tasks.
> > 
> > This patch introduces kick_active_cpus_sync() and uses it in mm/slab and 
> > arm64
> > code to delay syncronization.
> > 
> > For task isolation (https://lkml.org/lkml/2017/11/3/589), IPI to the CPU 
> > running
> > isolated task would be fatal, as it breaks isolation. The approach with 
> > delaying
> > of synchronization work helps to maintain isolated state.
> > 
> > I've tested it with test from task isolation series on ThunderX2 for more 
> > than
> > 10 hours (10k giga-ticks) without breaking isolation.
> > 
> > Signed-off-by: Yury Norov 
> > ---
> >  arch/arm64/kernel/insn.c |  2 +-
> >  include/linux/smp.h  |  2 ++
> >  kernel/smp.c | 24 
> >  mm/slab.c|  2 +-
> >  4 files changed, 28 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
> > index 2718a77da165..9d7c492e920e 100644
> > --- a/arch/arm64/kernel/insn.c
> > +++ b/arch/arm64/kernel/insn.c
> > @@ -291,7 +291,7 @@ int __kprobes aarch64_insn_patch_text(void *addrs[], 
> > u32 insns[], int cnt)
> >  * synchronization.
> >  */
> > ret = aarch64_insn_patch_text_nosync(addrs[0], 
> > insns[0]);
> > -   kick_all_cpus_sync();
> > +   kick_active_cpus_sync();
> > return ret;
> > }
> > }
> 
> I think this means that runtime modifications to the kernel text might not
> be picked up by CPUs coming out of idle. Shouldn't we add an ISB on that
> path to avoid executing stale instructions?

Thanks, Will, for the hint. I'll do that.

Yury


Re: [PATCH 2/2] smp: introduce kick_active_cpus_sync()

2018-03-28 Thread Yury Norov
On Mon, Mar 26, 2018 at 02:57:35PM -0400, Steven Rostedt wrote:
> On Mon, 26 Mar 2018 10:53:13 +0200
> Andrea Parri  wrote:
> 
> > > --- a/kernel/smp.c
> > > +++ b/kernel/smp.c
> > > @@ -724,6 +724,30 @@ void kick_all_cpus_sync(void)
> > >  }
> > >  EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
> > >  
> > > +/**
> > > + * kick_active_cpus_sync - Force CPUs that are not in extended
> > > + * quiescent state (idle or nohz_full userspace) sync by sending
> > > + * IPI. Extended quiescent state CPUs will sync at the exit of
> > > + * that state.
> > > + */
> > > +void kick_active_cpus_sync(void)
> > > +{
> > > + int cpu;
> > > + struct cpumask kernel_cpus;
> > > +
> > > + smp_mb();  
> > 
> > (A general remark only:)
> > 
> > checkpatch.pl should have warned about the fact that this barrier is
> > missing an accompanying comment (which accesses are being "ordered",
> > what is the pairing barrier, etc.).
> 
> He could have simply copied the comment above the smp_mb() for
> kick_all_cpus_sync():
> 
>   /* Make sure the change is visible before we kick the cpus */
> 
> The kick itself is pretty much a synchronization primitive.
> 
> That is, you make some changes and then you need all CPUs to see it,
> and you call: kick_active_cpus_synch(), which is the barrier to make
> sure you previous changes are seen on all CPUS before you proceed
> further. Note, the matching barrier is implicit in the IPI itself.
>
>  -- Steve

I know that I should have copied the comment from kick_all_cpus_sync(), but I
don't like copy-pasting in general, and as Steven said, this smp_mb() is
already inside a synchronization routine, so we may hope that users of
kick_*_cpus_sync() will explain better what they need it for...
 
> 
> > 
> > Moreover if, as your reply above suggested, your patch is relying on
> > "implicit barriers" (something I would not recommend) then even more
> > so you should comment on these requirements.
> > 
> > This could: (a) force you to reason about the memory ordering stuff,
> > (b) easy the task of reviewing and adopting your patch, (c) easy the
> > task of preserving those requirements (as implementations changes).
> > 
> >   Andrea

I need v2 anyway, and I will add comments to address all questions in this
thread.

I also hope that we'll agree that it's safe to delay synchronization for
powerpc as well, and if so, we will have no users of kick_all_cpus_sync()
and can drop it.

(It looks that way, because a nohz_full userspace CPU cannot have pending
IPIs, but I'd like to get confirmation from the powerpc people.)

Would it make sense to rename kick_all_cpus_sync() to smp_mb_sync(), which
would stand for 'synchronous memory barrier on all online CPUs'?

Yury


Re: [PATCH 2/2] smp: introduce kick_active_cpus_sync()

2018-03-28 Thread Yury Norov
On Mon, Mar 26, 2018 at 05:45:55AM -0700, Paul E. McKenney wrote:
> On Sun, Mar 25, 2018 at 11:11:54PM +0300, Yury Norov wrote:
> > On Sun, Mar 25, 2018 at 12:23:28PM -0700, Paul E. McKenney wrote:
> > > On Sun, Mar 25, 2018 at 08:50:04PM +0300, Yury Norov wrote:
> > > > kick_all_cpus_sync() forces all CPUs to sync caches by sending 
> > > > broadcast IPI.
> > > > If CPU is in extended quiescent state (idle task or nohz_full 
> > > > userspace), this
> > > > work may be done at the exit of this state. Delaying synchronization 
> > > > helps to
> > > > save power if CPU is in idle state and decrease latency for real-time 
> > > > tasks.
> > > > 
> > > > This patch introduces kick_active_cpus_sync() and uses it in mm/slab 
> > > > and arm64
> > > > code to delay syncronization.
> > > > 
> > > > For task isolation (https://lkml.org/lkml/2017/11/3/589), IPI to the 
> > > > CPU running
> > > > isolated task would be fatal, as it breaks isolation. The approach with 
> > > > delaying
> > > > of synchronization work helps to maintain isolated state.
> > > > 
> > > > I've tested it with test from task isolation series on ThunderX2 for 
> > > > more than
> > > > 10 hours (10k giga-ticks) without breaking isolation.
> > > > 
> > > > Signed-off-by: Yury Norov 
> > > > ---
> > > >  arch/arm64/kernel/insn.c |  2 +-
> > > >  include/linux/smp.h  |  2 ++
> > > >  kernel/smp.c | 24 
> > > >  mm/slab.c|  2 +-
> > > >  4 files changed, 28 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
> > > > index 2718a77da165..9d7c492e920e 100644
> > > > --- a/arch/arm64/kernel/insn.c
> > > > +++ b/arch/arm64/kernel/insn.c
> > > > @@ -291,7 +291,7 @@ int __kprobes aarch64_insn_patch_text(void 
> > > > *addrs[], u32 insns[], int cnt)
> > > >  * synchronization.
> > > >  */
> > > > ret = aarch64_insn_patch_text_nosync(addrs[0], 
> > > > insns[0]);
> > > > -   kick_all_cpus_sync();
> > > > +   kick_active_cpus_sync();
> > > > return ret;
> > > > }
> > > > }
> > > > diff --git a/include/linux/smp.h b/include/linux/smp.h
> > > > index 9fb239e12b82..27215e22240d 100644
> > > > --- a/include/linux/smp.h
> > > > +++ b/include/linux/smp.h
> > > > @@ -105,6 +105,7 @@ int smp_call_function_any(const struct cpumask 
> > > > *mask,
> > > >   smp_call_func_t func, void *info, int wait);
> > > > 
> > > >  void kick_all_cpus_sync(void);
> > > > +void kick_active_cpus_sync(void);
> > > >  void wake_up_all_idle_cpus(void);
> > > > 
> > > >  /*
> > > > @@ -161,6 +162,7 @@ smp_call_function_any(const struct cpumask *mask, 
> > > > smp_call_func_t func,
> > > >  }
> > > > 
> > > >  static inline void kick_all_cpus_sync(void) {  }
> > > > +static inline void kick_active_cpus_sync(void) {  }
> > > >  static inline void wake_up_all_idle_cpus(void) {  }
> > > > 
> > > >  #ifdef CONFIG_UP_LATE_INIT
> > > > diff --git a/kernel/smp.c b/kernel/smp.c
> > > > index 084c8b3a2681..0358d6673850 100644
> > > > --- a/kernel/smp.c
> > > > +++ b/kernel/smp.c
> > > > @@ -724,6 +724,30 @@ void kick_all_cpus_sync(void)
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
> > > > 
> > > > +/**
> > > > + * kick_active_cpus_sync - Force CPUs that are not in extended
> > > > + * quiescent state (idle or nohz_full userspace) sync by sending
> > > > + * IPI. Extended quiescent state CPUs will sync at the exit of
> > > > + * that state.
> > > > + */
> > > > +void kick_active_cpus_sync(void)
> > > > +{
> > > > +   int cpu;
> > > > +   struct cpumask kernel_cpus;
> > > > +
> > > > +   smp_mb();
> > > > +
> > > > +   cpumask_clear(&kernel_cpus);
> > > > +   preempt_d

Re: [PATCH 2/2] smp: introduce kick_active_cpus_sync()

2018-03-28 Thread Yury Norov
On Wed, Mar 28, 2018 at 06:56:17AM -0700, Paul E. McKenney wrote:
> On Wed, Mar 28, 2018 at 04:36:05PM +0300, Yury Norov wrote:
> > On Mon, Mar 26, 2018 at 05:45:55AM -0700, Paul E. McKenney wrote:
> > > On Sun, Mar 25, 2018 at 11:11:54PM +0300, Yury Norov wrote:
> > > > On Sun, Mar 25, 2018 at 12:23:28PM -0700, Paul E. McKenney wrote:
> > > > > On Sun, Mar 25, 2018 at 08:50:04PM +0300, Yury Norov wrote:
> > > > > > kick_all_cpus_sync() forces all CPUs to sync caches by sending 
> > > > > > broadcast IPI.
> > > > > > If CPU is in extended quiescent state (idle task or nohz_full 
> > > > > > userspace), this
> > > > > > work may be done at the exit of this state. Delaying 
> > > > > > synchronization helps to
> > > > > > save power if CPU is in idle state and decrease latency for 
> > > > > > real-time tasks.
> > > > > > 
> > > > > > This patch introduces kick_active_cpus_sync() and uses it in 
> > > > > > mm/slab and arm64
> > > > > > code to delay syncronization.
> > > > > > 
> > > > > > For task isolation (https://lkml.org/lkml/2017/11/3/589), IPI to 
> > > > > > the CPU running
> > > > > > isolated task would be fatal, as it breaks isolation. The approach 
> > > > > > with delaying
> > > > > > of synchronization work helps to maintain isolated state.
> > > > > > 
> > > > > > I've tested it with test from task isolation series on ThunderX2 
> > > > > > for more than
> > > > > > 10 hours (10k giga-ticks) without breaking isolation.
> > > > > > 
> > > > > > Signed-off-by: Yury Norov 
> > > > > > ---
> > > > > >  arch/arm64/kernel/insn.c |  2 +-
> > > > > >  include/linux/smp.h  |  2 ++
> > > > > >  kernel/smp.c | 24 
> > > > > >  mm/slab.c|  2 +-
> > > > > >  4 files changed, 28 insertions(+), 2 deletions(-)
> > > > > > 
> > > > > > diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
> > > > > > index 2718a77da165..9d7c492e920e 100644
> > > > > > --- a/arch/arm64/kernel/insn.c
> > > > > > +++ b/arch/arm64/kernel/insn.c
> > > > > > @@ -291,7 +291,7 @@ int __kprobes aarch64_insn_patch_text(void 
> > > > > > *addrs[], u32 insns[], int cnt)
> > > > > >  * synchronization.
> > > > > >  */
> > > > > > ret = aarch64_insn_patch_text_nosync(addrs[0], 
> > > > > > insns[0]);
> > > > > > -   kick_all_cpus_sync();
> > > > > > +   kick_active_cpus_sync();
> > > > > > return ret;
> > > > > > }
> > > > > > }
> > > > > > diff --git a/include/linux/smp.h b/include/linux/smp.h
> > > > > > index 9fb239e12b82..27215e22240d 100644
> > > > > > --- a/include/linux/smp.h
> > > > > > +++ b/include/linux/smp.h
> > > > > > @@ -105,6 +105,7 @@ int smp_call_function_any(const struct cpumask 
> > > > > > *mask,
> > > > > >   smp_call_func_t func, void *info, int wait);
> > > > > > 
> > > > > >  void kick_all_cpus_sync(void);
> > > > > > +void kick_active_cpus_sync(void);
> > > > > >  void wake_up_all_idle_cpus(void);
> > > > > > 
> > > > > >  /*
> > > > > > @@ -161,6 +162,7 @@ smp_call_function_any(const struct cpumask 
> > > > > > *mask, smp_call_func_t func,
> > > > > >  }
> > > > > > 
> > > > > >  static inline void kick_all_cpus_sync(void) {  }
> > > > > > +static inline void kick_active_cpus_sync(void) {  }
> > > > > >  static inline void wake_up_all_idle_cpus(void) {  }
> > > > > > 
> > > > > >  #ifdef CONFIG_UP_LATE_INIT
> > > > > > diff --git a/kernel/smp.c b/kernel/smp.c
> > > > > > index 084c8b3a2681..0358d6673850 100644
> > > > > &

Re: [PATCH 2/2] smp: introduce kick_active_cpus_sync()

2018-04-01 Thread Yury Norov
On Tue, Mar 27, 2018 at 11:21:17AM +0100, Will Deacon wrote:
> On Sun, Mar 25, 2018 at 08:50:04PM +0300, Yury Norov wrote:
> > kick_all_cpus_sync() forces all CPUs to sync caches by sending broadcast 
> > IPI.
> > If CPU is in extended quiescent state (idle task or nohz_full userspace), 
> > this
> > work may be done at the exit of this state. Delaying synchronization helps 
> > to
> > save power if CPU is in idle state and decrease latency for real-time tasks.
> > 
> > This patch introduces kick_active_cpus_sync() and uses it in mm/slab and 
> > arm64
> > code to delay syncronization.
> > 
> > For task isolation (https://lkml.org/lkml/2017/11/3/589), IPI to the CPU 
> > running
> > isolated task would be fatal, as it breaks isolation. The approach with 
> > delaying
> > of synchronization work helps to maintain isolated state.
> > 
> > I've tested it with test from task isolation series on ThunderX2 for more 
> > than
> > 10 hours (10k giga-ticks) without breaking isolation.
> > 
> > Signed-off-by: Yury Norov 
> > ---
> >  arch/arm64/kernel/insn.c |  2 +-
> >  include/linux/smp.h  |  2 ++
> >  kernel/smp.c | 24 
> >  mm/slab.c|  2 +-
> >  4 files changed, 28 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
> > index 2718a77da165..9d7c492e920e 100644
> > --- a/arch/arm64/kernel/insn.c
> > +++ b/arch/arm64/kernel/insn.c
> > @@ -291,7 +291,7 @@ int __kprobes aarch64_insn_patch_text(void *addrs[], 
> > u32 insns[], int cnt)
> >  * synchronization.
> >  */
> > ret = aarch64_insn_patch_text_nosync(addrs[0], 
> > insns[0]);
> > -   kick_all_cpus_sync();
> > +   kick_active_cpus_sync();
> > return ret;
> > }
> >     }
> 
> I think this means that runtime modifications to the kernel text might not
> be picked up by CPUs coming out of idle. Shouldn't we add an ISB on that
> path to avoid executing stale instructions?
> 
> Will

commit 153ae9d5667e7baab4d48c48e8ec30fbcbd86f1e
Author: Yury Norov 
Date:   Sat Mar 31 15:05:23 2018 +0300

Hi Will, Paul,

On my system there are 3 paths that go through rcu_dynticks_eqs_exit(),
and so require an isb().

The first path starts at gic_handle_irq() on the secondary_start_kernel stack.
gic_handle_irq() already issues isb(), so nothing needs to be done there.

The second path starts at the el0_svc entry; and the third path is the exit
from do_idle() on the secondary_start_kernel stack.

For the do_idle() path there is the arch_cpu_idle_exit() hook, which is not
used by arm64 at the moment, so I picked it. And for el0_svc, I've introduced
an isb_if_eqs macro and call it at the beginning of el0_svc_naked.

I've tested it on a ThunderX2 machine, and it works for me.

Below are my call traces and the patch for them. If you are OK with it, I
think I'm ready to submit v2 (but maybe split this patch for better
readability).

Yury

[  585.412095] Call trace:
[  585.412097] [] dump_backtrace+0x0/0x380
[  585.412099] [] show_stack+0x14/0x20
[  585.412101] [] dump_stack+0x98/0xbc
[  585.412104] [] rcu_dynticks_eqs_exit+0x68/0x70
[  585.412105] [] rcu_irq_enter+0x48/0x50
[  585.412106] [] irq_enter+0xc/0x70
[  585.412108] [] __handle_domain_irq+0x3c/0x120
[  585.412109] [] gic_handle_irq+0xc4/0x180
[  585.412110] Exception stack(0xfc001130fe20 to 0xfc001130ff60)
[  585.412112] fe20: 00a0  0001 

[  585.412113] fe40: 028f6f0b 0020 0013cd6f53963b31 

[  585.412144] fe60: 0002 fc001130fed0 0b80 
3400
[  585.412146] fe80:  0001  
01db
[  585.412147] fea0: fc0008247a78 03ff86dc61f8 0014 
fc0008fc
[  585.412149] fec0: fc00090143e8 fc0009014000 fc0008fc94a0 

[  585.412150] fee0:  fe8f46bb1700  

[  585.412152] ff00:  fc001130ff60 fc0008085034 
fc001130ff60
[  585.412153] ff20: fc0008085038 00400149 fc0009014000 
fc0008fc94a0
[  585.412155] ff40:   fc001130ff60 
fc0008085038
[  585.412156] [] el1_irq+0xb0/0x124
[  585.412158] [] arch_cpu_idle+0x10/0x18
[  585.412159] [] do_idle+0x10c/0x1d8
[  585.412160] [] cpu_startup_entry+0x24/0x28
[  585.412162] [] secondary_start_kernel+0x15c/0x1a0
[  585.412164] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 
4.14.0-isolation-160735-g59b71c1-dirty #18

[  585.412058] Call trace:
[  585.412060] []

Re: [PATCH 2/2] smp: introduce kick_active_cpus_sync()

2018-04-03 Thread Yury Norov
Hi Mark,

Thank you for the review.

On Tue, Apr 03, 2018 at 02:48:32PM +0100, Mark Rutland wrote:
> Hi Yury,
> 
> On Sun, Apr 01, 2018 at 02:11:08PM +0300, Yury Norov wrote:
> > +/*
> > + * Flush I-cache if CPU is in extended quiescent state
> > + */
> 
> This comment is misleading. An ISB doesn't touch the I-cache; it forces
> a context synchronization event.
> 
> > +   .macro  isb_if_eqs
> > +#ifndef CONFIG_TINY_RCU
> > +   bl  rcu_is_watching
> > +   tst w0, #0xff
> > +   b.ne1f
> 
> The TST+B.NE can be a CBNZ:
> 
>   bl  rcu_is_watching
>   cbnzx0, 1f
>   isb
> 1:
> 
> > +   /* Pairs with aarch64_insn_patch_text for EQS CPUs. */
> > +   isb
> > +1:
> > +#endif
> > +   .endm
> > +
> >  el0_sync_invalid:
> > inv_entry 0, BAD_SYNC
> >  ENDPROC(el0_sync_invalid)
> > @@ -840,8 +861,10 @@ el0_svc:
> > mov wsc_nr, #__NR_syscalls
> >  el0_svc_naked: // compat entry point
> > stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and 
> > syscall number
> > +   isb_if_eqs
> > enable_dbg_and_irq
> > -   ct_user_exit 1
> > +   ct_user_exit
> 
> I don't think this is safe. here we issue the ISB *before* exiting a
> quiesecent state, so I think we can race with another CPU that calls
> kick_all_active_cpus_sync, e.g.
> 
>   CPU0CPU1
> 
>   ISB
>   patch_some_text()
>   kick_all_active_cpus_sync()
>   ct_user_exit
> 
>   // not synchronized!
>   use_of_patched_text()
> 
> ... and therefore the ISB has no effect, which could be disasterous.
> 
> I believe we need the ISB *after* we transition into a non-quiescent
> state, so that we can't possibly miss a context synchronization event.
 
I decided to put the isb() at entry because there's a chance that patched
code will be executed before the quiescent state is exited. But after some
head-scratching, I think it's safe. I'll do as you suggested here.

Thanks,
Yury


[PATCH v2 0/2] smp: don't kick CPUs running idle or nohz_full tasks

2018-04-05 Thread Yury Norov
kick_all_cpus_sync() is used to broadcast IPIs to all online CPUs to force
them to synchronize caches, TLBs etc. It is called only 3 times - from
mm/slab, arm64 and powerpc code.

We can delay the synchronization work for CPUs in extended quiescent state
(idle or nohz_full userspace).

As Paul E. McKenney wrote: 

--

Currently, IPIs are used to force other CPUs to invalidate their TLBs
in response to a kernel virtual-memory mapping change.  This works, but 
degrades both battery lifetime (for idle CPUs) and real-time response
(for nohz_full CPUs), and in addition results in unnecessary IPIs due to
the fact that CPUs executing in usermode are unaffected by stale kernel
mappings.  It would be better to cause a CPU executing in usermode to
wait until it is entering kernel mode to do the flush, first to avoid
interrupting usemode tasks and second to handle multiple flush requests
with a single flush in the case of a long-running user task.

--

v2 is a big rework to address the comments on v1:
 - the rcu_eqs_special_set() declaration in a public header is dropped; it
   is not used in the new implementation. Still, I hope Paul will pick it
   up in his tree;
 - for arm64, a few isb()s are added to ensure kernel text synchronization
   (patches 1-4);
 - rcu_get_eqs_cpus() is introduced and used to mask EQS CPUs before
   generating broadcast IPIs;
 - RCU_DYNTICK_CTRL_MASK is not touched because a memory barrier is
   implicitly issued in the EQS exit path;
 - powerpc is not an exception anymore. I think it's safe to delay
   synchronization for it as well, and I didn't get comments from the ppc
   community.
v1:
  https://lkml.org/lkml/2018/3/25/109

Based on next-20180405

Yury Norov (5):
  arm64: entry: isb in el1_irq
  arm64: entry: introduce restore_syscall_args macro
  arm64: ISB early at exit from extended quiescent state
  rcu: arm64: add rcu_dynticks_eqs_exit_sync()
  smp: Lazy synchronization for EQS CPUs in kick_all_cpus_sync()

 arch/arm64/kernel/Makefile  |  2 ++
 arch/arm64/kernel/entry.S   | 52 +++--
 arch/arm64/kernel/process.c |  7 ++
 arch/arm64/kernel/rcu.c |  8 +++
 include/linux/rcutiny.h |  2 ++
 include/linux/rcutree.h |  1 +
 kernel/rcu/tiny.c   |  9 
 kernel/rcu/tree.c   | 27 +++
 kernel/smp.c| 21 +++---
 9 files changed, 105 insertions(+), 24 deletions(-)
 create mode 100644 arch/arm64/kernel/rcu.c

-- 
2.14.1



[PATCH 1/5] arm64: entry: isb in el1_irq

2018-04-05 Thread Yury Norov
The kernel text patching framework relies on an IPI to ensure that other
SMP cores observe the change. The target core calls isb() in the IPI
handler path, but not at the beginning of the el1_irq entry. There's a
chance that the modified instruction will be fetched before the isb(), and
so will not be observed.

This patch inserts an isb early in the el1_irq entry to avoid that.

Signed-off-by: Yury Norov 
---
 arch/arm64/kernel/entry.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ec2ee720e33e..9c06b4b80060 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -593,6 +593,7 @@ ENDPROC(el1_sync)
 
.align  6
 el1_irq:
+   isb // pairs with 
aarch64_insn_patch_text
kernel_entry 1
enable_da_f
 #ifdef CONFIG_TRACE_IRQFLAGS
-- 
2.14.1



[PATCH 2/5] arm64: entry: introduce restore_syscall_args macro

2018-04-05 Thread Yury Norov
Syscall arguments are passed in registers x0..x7. If assembler
code has to call C functions before passing control to the syscall
handler, it should restore the original state of those registers
after the call.

Currently, restoring the syscall arguments is open-coded in el0_svc_naked
and __sys_trace. This patch introduces the restore_syscall_args macro and
uses it in both places.

Also, the 'syscall = 0' parameter is removed from ct_user_exit so that
el0_svc_naked calls restore_syscall_args explicitly. This is needed
because the following patch of the series adds another call to a C
function in el0_svc_naked, and restoring the syscall args is no longer
solely a matter of ct_user_exit.

Signed-off-by: Yury Norov 
---
 arch/arm64/kernel/entry.S | 37 +
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 9c06b4b80060..c8d9ec363ddd 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -37,22 +37,29 @@
 #include 
 
 /*
- * Context tracking subsystem.  Used to instrument transitions
- * between user and kernel mode.
+ * Save/restore needed during syscalls.  Restore syscall arguments from
+ * the values already saved on stack during kernel_entry.
  */
-   .macro ct_user_exit, syscall = 0
-#ifdef CONFIG_CONTEXT_TRACKING
-   bl  context_tracking_user_exit
-   .if \syscall == 1
-   /*
-* Save/restore needed during syscalls.  Restore syscall arguments from
-* the values already saved on stack during kernel_entry.
-*/
+   .macro restore_syscall_args
ldp x0, x1, [sp]
ldp x2, x3, [sp, #S_X2]
ldp x4, x5, [sp, #S_X4]
ldp x6, x7, [sp, #S_X6]
-   .endif
+   .endm
+
+   .macro el0_svc_restore_syscall_args
+#if defined(CONFIG_CONTEXT_TRACKING)
+   restore_syscall_args
+#endif
+   .endm
+
+/*
+ * Context tracking subsystem.  Used to instrument transitions
+ * between user and kernel mode.
+ */
+   .macro ct_user_exit
+#ifdef CONFIG_CONTEXT_TRACKING
+   bl  context_tracking_user_exit
 #endif
.endm
 
@@ -943,7 +950,8 @@ alternative_else_nop_endif
 el0_svc_naked: // compat entry point
stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and 
syscall number
enable_daif
-   ct_user_exit 1
+   ct_user_exit
+   el0_svc_restore_syscall_args
 
tst x16, #_TIF_SYSCALL_WORK // check for syscall hooks
b.ne__sys_trace
@@ -976,10 +984,7 @@ __sys_trace:
mov x1, sp  // pointer to regs
cmp wscno, wsc_nr   // check upper syscall limit
b.hs__ni_sys_trace
-   ldp x0, x1, [sp]// restore the syscall args
-   ldp x2, x3, [sp, #S_X2]
-   ldp x4, x5, [sp, #S_X4]
-   ldp x6, x7, [sp, #S_X6]
+   restore_syscall_args
ldr x16, [stbl, xscno, lsl #3]  // address in the syscall table
blr x16 // call sys_* routine
 
-- 
2.14.1



[PATCH 3/5] arm64: early ISB at exit from extended quiescent state

2018-04-05 Thread Yury Norov
This series enables delaying kernel memory synchronization for CPUs
running in extended quiescent state (EQS) until the exit from that state.

ARM64 uses the IPI mechanism to notify all cores in an SMP system that
kernel text has changed; the IPI handler calls isb() to synchronize.

If we no longer deliver IPIs to EQS CPUs, we should add an ISB early
in the EQS exit path.

There are 2 such paths. One starts in the do_idle() loop, and the other
at the el0_svc entry. For do_idle(), isb() is added in the
arch_cpu_idle_exit() hook. And for the SVC handler, isb is called in
el0_svc_naked.

Suggested-by: Will Deacon 
Signed-off-by: Yury Norov 
---
 arch/arm64/kernel/entry.S   | 16 +++-
 arch/arm64/kernel/process.c |  7 +++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index c8d9ec363ddd..b1e1c19b4432 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -48,7 +48,7 @@
.endm
 
.macro el0_svc_restore_syscall_args
-#if defined(CONFIG_CONTEXT_TRACKING)
+#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_CONTEXT_TRACKING)
restore_syscall_args
 #endif
.endm
@@ -483,6 +483,19 @@ __bad_stack:
ASM_BUG()
.endm
 
+/*
+ * If CPU is in extended quiescent state we need isb to ensure that
+ * possible change of kernel text is visible by the core.
+ */
+   .macro  isb_if_eqs
+#ifndef CONFIG_TINY_RCU
+   bl  rcu_is_watching
+   cbnzx0, 1f
+   isb // pairs with 
aarch64_insn_patch_text
+1:
+#endif
+   .endm
+
 el0_sync_invalid:
inv_entry 0, BAD_SYNC
 ENDPROC(el0_sync_invalid)
@@ -949,6 +962,7 @@ alternative_else_nop_endif
 
 el0_svc_naked: // compat entry point
stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and 
syscall number
+   isb_if_eqs
enable_daif
ct_user_exit
el0_svc_restore_syscall_args
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f08a2ed9db0d..74cad496b07b 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -88,6 +88,13 @@ void arch_cpu_idle(void)
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 }
 
+void arch_cpu_idle_exit(void)
+{
+   /* Pairs with aarch64_insn_patch_text() for EQS CPUs. */
+   if (!rcu_is_watching())
+   isb();
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 void arch_cpu_idle_dead(void)
 {
-- 
2.14.1


