Hi all, It seems that inlining the _find_next_bit() helper makes find_next_bit() and find_next_zero_bit() 2 times faster at the scenario of finding all set/cleared bits of randomly initialised bitmap.
For another typical scenario of traversing sparse bitmap there is also measurable improvement observed, about 15%. The increasing of text size of find_bit.o module is 40 bytes for arm64 - from 252 to 292 bytes - is looking acceptable. This patch also contains test module. Measured on ThunderX machine. Tests for other architectures are very appreciated. Before: [ 96.856195] Start testing find_bit() with random-filled bitmap [ 96.868322] find_next_bit: 34529 cycles, 16304 iterations [ 96.879525] find_next_zero_bit: 35771 cycles, 16465 iterations [ 96.891409] find_last_bit: 17444 cycles, 16304 iterations [ 96.914445] find_first_bit: 1219671 cycles, 16305 iterations [ 96.925802] Start testing find_bit() with sparse bitmap [ 96.936308] find_next_bit: 301 cycles, 66 iterations [ 96.946981] find_next_zero_bit: 70897 cycles, 32703 iterations [ 96.958694] find_last_bit: 286 cycles, 66 iterations [ 96.968710] find_first_bit: 5260 cycles, 66 iterations After: [ 169.464229] Start testing find_bit() with random-filled bitmap [ 169.476191] find_next_bit: 17520 cycles, 16336 iterations [ 169.487210] find_next_zero_bit: 17622 cycles, 16433 iterations [ 169.499111] find_last_bit: 19272 cycles, 16335 iterations [ 169.519735] find_first_bit: 978657 cycles, 16337 iterations [ 169.530912] Start testing find_bit() with sparse bitmap [ 169.541414] find_next_bit: 252 cycles, 66 iterations [ 169.551726] find_next_zero_bit: 34554 cycles, 32703 iterations [ 169.563436] find_last_bit: 294 cycles, 66 iterations [ 169.573439] find_first_bit: 3964 cycles, 66 iterations CC: Alexey Dobriyan <adobri...@gmail.com> CC: Andrew Morton <a...@linux-foundation.org> CC: Clement Courbet <cour...@google.com> CC: Matthew Wilcox <mawil...@microsoft.com> CC: Rasmus Villemoes <li...@rasmusvillemoes.dk> Signed-off-by: Yury Norov <yno...@caviumnetworks.com> --- lib/Kconfig.debug | 9 ++++ lib/Makefile | 1 + lib/find_bit.c | 2 +- lib/test_find_bit.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/find_bit.c | 2 +- 5 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 lib/test_find_bit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dfdad67d8f6c..138034cc68a3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1838,6 +1838,15 @@ config TEST_BPF If unsure, say N. +config TEST_FIND_BIT + tristate "Test find_bit functions" + default n + help + This builds the "test_find_bit" module that measure find_*_bit() + functions performance. + + If unsure, say N. + config TEST_FIRMWARE tristate "Test firmware loading via userspace interface" default n diff --git a/lib/Makefile b/lib/Makefile index dafa79613fb4..edb792b42c86 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -45,6 +45,7 @@ obj-y += hexdump.o obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o +obj-$(CONFIG_TEST_FIND_BIT) += test_find_bit.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o diff --git a/lib/find_bit.c b/lib/find_bit.c index 6ed74f78380c..9b0c89f3fd3a 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -28,7 +28,7 @@ * find_next_zero_bit. The difference is the "invert" argument, which * is XORed with each fetched word before searching it for one bits. */ -static unsigned long _find_next_bit(const unsigned long *addr, +static inline unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start, unsigned long invert) { unsigned long tmp; diff --git a/lib/test_find_bit.c b/lib/test_find_bit.c new file mode 100644 index 000000000000..8eaf10cae214 --- /dev/null +++ b/lib/test_find_bit.c @@ -0,0 +1,141 @@ +/* + * Test for find_*_bit functions. + * + * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* + * find_bit functions are widely used in kernel, so the successful boot + * is good enough test for correctness. + * + * This test is focused on performance of traversing bitmaps. Two typical + * scenarios are reproduced: + * - randomly filled bitmap with approximately equal number of set and + * cleared bits; + * - sparse bitmap with few set bits at random positions. + */ + +#include <linux/bitops.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/random.h> + +#define BITMAP_LEN (4096UL * 8) +#define SPARSE (500) + +static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata; + +/* + * This is Schlemiel the Painter's algorithm. + * Should be called after all other tests for the same data + * because it sets all bits of bitmap to 1. + */ +static int __init test_find_first_bit(void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < len; cnt++) { + i = find_first_bit(bitmap, len); + __clear_bit(i, bitmap); + } + cycles = get_cycles() - cycles; + pr_err("find_first_bit: %ld cycles, %ld iterations\n", cycles, cnt); + + return 0; +} + +static int __init test_find_next_bit(const void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < BITMAP_LEN; cnt++) + i = find_next_bit(bitmap, BITMAP_LEN, i) + 1; + cycles = get_cycles() - cycles; + pr_err("find_next_bit: %ld cycles, %ld iterations\n", cycles, cnt); + + return 0; +} + +static int __init test_find_next_zero_bit(const void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < BITMAP_LEN; cnt++) + i = find_next_zero_bit(bitmap, len, i) + 1; + cycles = get_cycles() - cycles; + pr_err("find_next_zero_bit: %ld cycles, %ld iterations\n", cycles, cnt); + + return 0; +} + +static int __init test_find_last_bit(const void *bitmap, unsigned long len) +{ + unsigned long l, cnt = 0; + cycles_t cycles; + + cycles = get_cycles(); + do { + cnt++; + l = find_last_bit(bitmap, len); + if (l >= len) + break; + len = l; + } while (len); + cycles = get_cycles() - cycles; + pr_err("find_last_bit: %ld cycles, %ld iterations\n", cycles, cnt); + + return 0; +} + +static int __init find_bit_test(void) +{ + unsigned long nbits = BITMAP_LEN / SPARSE; + + pr_err("Start testing find_bit() with random-filled bitmap\n"); + + get_random_bytes(bitmap, sizeof(bitmap)); + + test_find_next_bit(bitmap, BITMAP_LEN); + test_find_next_zero_bit(bitmap, BITMAP_LEN); + test_find_last_bit(bitmap, BITMAP_LEN); + test_find_first_bit(bitmap, BITMAP_LEN); + + pr_err("Start testing find_bit() with sparse bitmap\n"); + + bitmap_zero(bitmap, BITMAP_LEN); + + while (nbits--) + __set_bit(prandom_u32() % BITMAP_LEN, bitmap); + + test_find_next_bit(bitmap, BITMAP_LEN); + test_find_next_zero_bit(bitmap, BITMAP_LEN); + test_find_last_bit(bitmap, BITMAP_LEN); + test_find_first_bit(bitmap, BITMAP_LEN); + + return 0; +} +module_init(find_bit_test); + +static void __exit test_find_bit_cleanup(void) +{ +} +module_exit(test_find_bit_cleanup); + +MODULE_LICENSE("GPL"); diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index 42c15f906aac..6873b5fa221b 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -29,7 +29,7 @@ * find_next_zero_bit. The difference is the "invert" argument, which * is XORed with each fetched word before searching it for one bits. */ -static unsigned long _find_next_bit(const unsigned long *addr, +static inline unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start, unsigned long invert) { unsigned long tmp; -- 2.11.0