[PATCH net-next v2 7/8] net: pktgen: fix access outside of user given buffer in pktgen_if_write()
Honour the user given buffer size for the hex32_arg(), num_arg(), strn_len(), get_imix_entries() and get_labels() calls (otherwise they will access memory outside of the user given buffer). In hex32_arg(), num_arg(), strn_len() error out in case no characters are available (maxlen = 0), in num_arg() additional error out in case no valid character is parsed. In get_labels() additional enable parsing labels up to MAX_IMIX_ENTRIES instead of (MAX_IMIX_ENTRIES - 1). Additional remove some superfluous variable initializing and align some variable declarations to the most common pattern. Signed-off-by: Peter Seiderer --- Changes v1 -> v2: - additional fix get_imix_entries() and get_labels() --- net/core/pktgen.c | 213 ++ 1 file changed, 140 insertions(+), 73 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9fe2a2db0d34..1fc037641610 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -750,6 +750,9 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, int i = 0; *num = 0; + if (!maxlen) + return -EINVAL; + for (; i < maxlen; i++) { int value; char c; @@ -797,6 +800,9 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen, int i; *num = 0; + if (!maxlen) + return -EINVAL; + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) @@ -804,6 +810,9 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen, if ((c >= '0') && (c <= '9')) { *num *= 10; *num += c - '0'; + } else if (i == 0) { + // no valid character parsed, error out + return -EINVAL; } else break; } @@ -814,6 +823,9 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) { int i; + if (!maxlen) + return -EINVAL; + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) @@ -840,11 +852,10 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. */ static ssize_t get_imix_entries(const char __user *buffer, + unsigned int maxlen, struct pktgen_dev *pkt_dev) { - const int max_digits = 10; - int i = 0; - long len; + int i = 0, max, len; char c; pkt_dev->n_imix_entries = 0; @@ -856,10 +867,13 @@ static ssize_t get_imix_entries(const char __user *buffer, if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES) return -E2BIG; - len = num_arg(&buffer[i], max_digits, &size); + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &size); if (len < 0) return len; i += len; + if (i >= maxlen) + return -EINVAL; if (get_user(c, &buffer[i])) return -EFAULT; /* Check for comma between size_i and weight_i */ @@ -870,7 +884,8 @@ static ssize_t get_imix_entries(const char __user *buffer, if (size < 14 + 20 + 8) size = 14 + 20 + 8; - len = num_arg(&buffer[i], max_digits, &weight); + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &weight); if (len < 0) return len; if (weight <= 0) @@ -880,39 +895,45 @@ static ssize_t get_imix_entries(const char __user *buffer, pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; i += len; + pkt_dev->n_imix_entries++; + + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; - i++; - pkt_dev->n_imix_entries++; } while (c == ' '); return i; } -static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +static ssize_t get_labels(const char __user *buffer, int maxlen, struct pktgen_dev *pkt_dev) { unsigned int n = 0; char c; - ssize_t i = 0; - int len; + int i = 0, max, len; pkt_dev->nr_labels = 0; do { __u32 tmp; - len = hex32_arg(&buffer[i], 8, &tmp); - if (len <= 0) + + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + + max = min(8, maxlen - i); + len = hex32_arg(&buffer[i], max, &tmp); + if (
[PATCH net-next v2 6/8] net: pktgen: fix access outside of user given buffer in pktgen_thread_write()
Honour the user given buffer size for the strn_len() calls (otherwise strn_len() will access memory outside of the user given buffer). Signed-off-by: Peter Seiderer --- Changes v1 -> v2: - no changes --- net/core/pktgen.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index c8a5b4d17407..9fe2a2db0d34 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1898,8 +1898,8 @@ static ssize_t pktgen_thread_write(struct file *file, i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1929,7 +1929,8 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) { ret = len; goto out; -- 2.48.1
[PATCH net-next v2 5/8] net: pktgen: fix 'ratep 0' error handling (return -EINVAL)
Given an invalid 'ratep' command e.g. 'ratep 0' the return value is '1', leading to the following misleading output: - the good case $ echo "ratep 100" > /proc/net/pktgen/lo\@0 $ grep "Result:" /proc/net/pktgen/lo\@0 Result: OK: ratep=100 - the bad case (before the patch) $ echo "ratep 0" > /proc/net/pktgen/lo\@0" -bash: echo: write error: Invalid argument $ grep "Result:" /proc/net/pktgen/lo\@0 Result: No such parameter "atep" - with patch applied $ echo "ratep 0" > /proc/net/pktgen/lo\@0 -bash: echo: write error: Invalid argument $ grep "Result:" /proc/net/pktgen/lo\@0 Result: Idle Signed-off-by: Peter Seiderer --- Changes v1 -> v2: - new patch --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 75c7511bf492..c8a5b4d17407 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1130,7 +1130,7 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = NSEC_PER_SEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); -- 2.48.1
[PATCH net-next v2 4/8] net: pktgen: fix 'rate 0' error handling (return -EINVAL)
Given an invalid 'rate' command e.g. 'rate 0' the return value is '1', leading to the following misleading output: - the good case $ echo "rate 100" > /proc/net/pktgen/lo\@0 $ grep "Result:" /proc/net/pktgen/lo\@0 Result: OK: rate=100 - the bad case (before the patch) $ echo "rate 0" > /proc/net/pktgen/lo\@0" -bash: echo: write error: Invalid argument $ grep "Result:" /proc/net/pktgen/lo\@0 Result: No such parameter "ate" - with patch applied $ echo "rate 0" > /proc/net/pktgen/lo\@0 -bash: echo: write error: Invalid argument $ grep "Result:" /proc/net/pktgen/lo\@0 Result: Idle Signed-off-by: Peter Seiderer --- Changes v1 -> v2: - new patch --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 28dbbf70e142..75c7511bf492 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1115,7 +1115,7 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); -- 2.48.1
[PATCH net-next v2 2/8] net: pktgen: enable 'param=value' parsing
Enable additional to 'parm value' the 'param=value' parsing (otherwise skipping '=' in count_trail_chars() is useless). Tested with: $ echo "min_pkt_size 999" > /proc/net/pktgen/lo\@0 $ echo "min_pkt_size=999" > /proc/net/pktgen/lo\@0 $ echo "min_pkt_size =999" > /proc/net/pktgen/lo\@0 $ echo "min_pkt_size= 999" > /proc/net/pktgen/lo\@0 $ echo "min_pkt_size = 999" > /proc/net/pktgen/lo\@0 Signed-off-by: Peter Seiderer --- Changes v1 -> v2: - no changes --- net/core/pktgen.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 496aa16773e7..4f8ec6c9bed4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -823,6 +823,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\r': case '\t': case ' ': + case '=': goto done_str; default: break; -- 2.48.1
[PATCH net-next v2 3/8] net: pktgen: fix hex32_arg parsing for short reads
Fix hex32_arg parsing for short reads (here 7 hex digits instead of the expected 8), shift result only on successful input parsing. - before the patch $ echo "mpls 123" > /proc/net/pktgen/lo\@0 $ grep mpls /proc/net/pktgen/lo\@0 mpls: 1230 Result: OK: mpls=1230 - with patch applied $ echo "mpls 123" > /proc/net/pktgen/lo\@0 $ grep mpls /proc/net/pktgen/lo\@0 mpls: 0123 Result: OK: mpls=0123 Signed-off-by: Peter Seiderer --- Changes v1 -> v2: - new patch --- net/core/pktgen.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 4f8ec6c9bed4..28dbbf70e142 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -753,14 +753,15 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, for (; i < maxlen; i++) { int value; char c; - *num <<= 4; if (get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); - if (value >= 0) + if (value >= 0) { + *num <<= 4; *num |= value; - else + } else { break; + } } return i; } -- 2.48.1
[PATCH net-next v2 8/8] selftest: net: add proc_net_pktgen
Add some test for /proc/net/pktgen/... interface. - enable 'CONFIG_NET_PKTGEN=m' in tools/testing/selftests/net/config Signed-off-by: Peter Seiderer --- Changes v1 -> v1: - fix tyop not vs. nod (suggested by Jakub Kicinski) - fix misaligned line (suggested by Jakub Kicinski) - enable fomerly commented out CONFIG_XFRM dependent test (command spi), as CONFIG_XFRM is enabled via tools/testing/selftests/net/config CONFIG_XFRM_INTERFACE/CONFIG_XFRM_USER (suggestex by Jakub Kicinski) - add CONFIG_NET_PKTGEN=m to tools/testing/selftests/net/config (suggested by Jakub Kicinski) - add modprobe pktgen to FIXTURE_SETUP() (suggested by Jakub Kicinski) - fix some checkpatch warnings (Missing a blank line after declarations) - shrink line length by re-naming some variables (command -> cmd, device -> dev) - add 'rate 0' testcase - add 'ratep 0' testcase --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config| 1 + tools/testing/selftests/net/proc_net_pktgen.c | 605 ++ 3 files changed, 607 insertions(+) create mode 100644 tools/testing/selftests/net/proc_net_pktgen.c diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 73ee88d6b043..095708cd8345 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -100,6 +100,7 @@ TEST_PROGS += vlan_bridge_binding.sh TEST_PROGS += bpf_offload.py TEST_PROGS += ipv6_route_update_soft_lockup.sh TEST_PROGS += busy_poll_test.sh +TEST_GEN_PROGS += proc_net_pktgen # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller netlink-dumps diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 5b9baf708950..9fe1b3464fbc 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -107,3 +107,4 @@ CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_NET_PKTGEN=m diff --git a/tools/testing/selftests/net/proc_net_pktgen.c b/tools/testing/selftests/net/proc_net_pktgen.c new file mode 100644 index ..5e2a2926d14a --- /dev/null +++ b/tools/testing/selftests/net/proc_net_pktgen.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * proc_net_pktgen: kselftest for /proc/net/pktgen interface + * + * Copyright (c) 2025 Peter Seiderer + * + */ +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +static const char add_loopback_0[] = "add_device lo@0"; +static const char rm_loopback_0[] = "rem_device_all"; + +static const char wrong_ctrl_cmd[] = "forsureawrongcommand"; +static const char legacy_ctrl_cmd[] = "max_before_softirq"; + +static const char wrong_dev_cmd[] = "forsurewrongcommand"; +static const char dev_cmd_min_pkt_size_0[] = "min_pkt_size"; +static const char dev_cmd_min_pkt_size_1[] = "min_pkt_size "; +static const char dev_cmd_min_pkt_size_2[] = "min_pkt_size 0"; +static const char dev_cmd_min_pkt_size_3[] = "min_pkt_size 1"; +static const char dev_cmd_min_pkt_size_4[] = "min_pkt_size 100"; +static const char dev_cmd_min_pkt_size_5[] = "min_pkt_size=1001"; +static const char dev_cmd_min_pkt_size_6[] = "min_pkt_size =2002"; +static const char dev_cmd_min_pkt_size_7[] = "min_pkt_size= 3003"; +static const char dev_cmd_min_pkt_size_8[] = "min_pkt_size = 4004"; +static const char dev_cmd_max_pkt_size_0[] = "max_pkt_size 200"; +static const char dev_cmd_pkt_size_0[] = "pkt_size 300"; +static const char dev_cmd_imix_weights_0[] = "imix_weights 0,7 576,4 1500,1"; +static const char dev_cmd_imix_weights_1[] = "imix_weights 101,1 102,2 103,3 104,4 105,5 106,6 107,7 108,8 109,9 110,10 111,11 112,12 113,13 114,14 115,15 116,16 117,17 118,18 119,19 120,20"; +static const char dev_cmd_imix_weights_2[] = "imix_weights 100,1 102,2 103,3 104,4 105,5 106,6 107,7 108,8 109,9 110,10 111,11 112,12 113,13 114,14 115,15 116,16 117,17 118,18 119,19 120,20 121,21"; +static const char dev_cmd_debug_0[] = "debug 1"; +static const char dev_cmd_debug_1[] = "debug 0"; +static const char dev_cmd_frags_0[] = "frags 100"; +static const char dev_cmd_delay_0[] = "delay 100"; +static const char dev_cmd_delay_1[] = "delay 2147483647"; +static const char dev_cmd_rate_0[] = "rate 0"; +static const char dev_cmd_rate_1[] = "rate 100"; +static const char dev_cmd_ratep_0[] = "ratep 0"; +static const char dev_cmd_ratep_1[] = "ratep 200"; +static const char dev_cmd_udp_src_min_0[] = "udp_src_min 1"; +static const char dev_cmd_udp_dst_min_0[] = "udp_dst_min 2"; +static const char dev_cmd_udp_src_max_0[] = "udp_src_max 3"; +static const char dev_cmd_udp_dst_max_0[] = "udp_dst_max 4"; +static const char dev_cmd_clone_skb_0[] = "clone_skb 1"; +static const char dev_cmd_clone_skb_1[] = "clone_skb 0"; +static const char dev_cmd_count_0[] = "count 100"; +static const char dev_cmd_src_mac_count_0[] = "src_mac_count 100"; +static const char dev_cmd_dst_m
[PATCH net-next v2 1/8] net: pktgen: replace ENOTSUPP with EOPNOTSUPP
Replace ENOTSUPP with EOPNOTSUPP, fixes checkpatch hint WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP and e.g. $ echo "clone_skb 1" > /proc/net/pktgen/lo\@0 -bash: echo: write error: Unknown error 524 Signed-off-by: Peter Seiderer --- Changes v1 -> v2 - no changes --- net/core/pktgen.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 82b6a2c3c141..496aa16773e7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1198,7 +1198,7 @@ static ssize_t pktgen_if_write(struct file *file, if ((value > 0) && ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) - return -ENOTSUPP; + return -EOPNOTSUPP; if (value > 0 && (pkt_dev->n_imix_entries > 0 || !(pkt_dev->flags & F_SHARED))) return -EINVAL; @@ -1258,7 +1258,7 @@ static ssize_t pktgen_if_write(struct file *file, ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || ((pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING) - return -ENOTSUPP; + return -EOPNOTSUPP; if (value > 1 && !(pkt_dev->flags & F_SHARED)) return -EINVAL; @@ -1303,7 +1303,7 @@ static ssize_t pktgen_if_write(struct file *file, } else if (strcmp(f, "netif_receive") == 0) { /* clone_skb set earlier, not supported in this mode */ if (pkt_dev->clone_skb > 0) - return -ENOTSUPP; + return -EOPNOTSUPP; pkt_dev->xmit_mode = M_NETIF_RECEIVE; -- 2.48.1
[PATCH net-next v2 0/8] Some pktgen fixes/improvments
While taking a look at '[PATCH net] pktgen: Avoid out-of-range in get_imix_entries' ([1]) and '[PATCH net v2] pktgen: Avoid out-of-bounds access in get_imix_entries' ([2], [3]) and doing some tests and code review I detected that the /proc/net/pktgen/... parsing logic does not honour the user given buffer bounds (resulting in out-of-bounds access). This can be observed e.g. by the following simple test (sometimes the old/'longer' previous value is re-read from the buffer): $ echo add_device lo@0 > /proc/net/pktgen/kpktgend_0 $ echo "min_pkt_size 12345" > /proc/net/pktgen/lo\@0 && grep min_pkt_size /proc/net/pktgen/lo\@0 Params: count 1000 min_pkt_size: 12345 max_pkt_size: 0 Result: OK: min_pkt_size=12345 $ echo -n "min_pkt_size 123" > /proc/net/pktgen/lo\@0 && grep min_pkt_size /proc/net/pktgen/lo\@0 Params: count 1000 min_pkt_size: 12345 max_pkt_size: 0 Result: OK: min_pkt_size=12345 $ echo "min_pkt_size 123" > /proc/net/pktgen/lo\@0 && grep min_pkt_size /proc/net/pktgen/lo\@0 Params: count 1000 min_pkt_size: 123 max_pkt_size: 0 Result: OK: min_pkt_size=123 So fix the out-of-bounds access (and two minor findings) and add a simple proc_net_pktgen selftest... Regards, Peter Changes v1 -> v2: - new patch: 'net: pktgen: fix hex32_arg parsing for short reads' - new patch: 'net: pktgen: fix 'rate 0' error handling (return -EINVAL)' - new patch: 'net: pktgen: fix 'ratep 0' error handling (return -EINVAL)' - net/core/pktgen.c: additional fix get_imix_entries() and get_labels() - tools/testing/selftests/net/proc_net_pktgen.c: - fix tyop not vs. nod (suggested by Jakub Kicinski) - fix misaligned line (suggested by Jakub Kicinski) - enable fomerly commented out CONFIG_XFRM dependent test (command spi), as CONFIG_XFRM is enabled via tools/testing/selftests/net/config CONFIG_XFRM_INTERFACE/CONFIG_XFRM_USER (suggestex by Jakub Kicinski) - add CONFIG_NET_PKTGEN=m to tools/testing/selftests/net/config (suggested by Jakub Kicinski) - add modprobe pktgen to FIXTURE_SETUP() (suggested by Jakub Kicinski) - fix some checkpatch warnings (Missing a blank line after declarations) - shrink line length by re-naming some variables (command -> cmd, device -> dev) - add 'rate 0' testcase - add 'ratep 0' testcase [1] https://lore.kernel.org/netdev/20241006221221.3744995-1-artem.chernys...@red-soft.ru/ [2] https://lore.kernel.org/netdev/20250109083039.14004-1-pchel...@ispras.ru/ [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76201b5979768500bca362871db66d77cb4c225e Peter Seiderer (8): net: pktgen: replace ENOTSUPP with EOPNOTSUPP net: pktgen: enable 'param=value' parsing net: pktgen: fix hex32_arg parsing for short reads net: pktgen: fix 'rate 0' error handling (return -EINVAL) net: pktgen: fix 'ratep 0' error handling (return -EINVAL) net: pktgen: fix access outside of user given buffer in pktgen_thread_write() net: pktgen: fix access outside of user given buffer in pktgen_if_write() selftest: net: add proc_net_pktgen net/core/pktgen.c | 238 --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config| 1 + tools/testing/selftests/net/proc_net_pktgen.c | 605 ++ 4 files changed, 761 insertions(+), 84 deletions(-) create mode 100644 tools/testing/selftests/net/proc_net_pktgen.c -- 2.48.1
Re: [PATCH v2 0/5] Move kvfree_rcu() into SLAB (v2)
On Tue, Jan 21, 2025 at 3:32 PM Paul E. McKenney wrote: > > On Tue, Jan 21, 2025 at 03:14:16PM +0100, Uladzislau Rezki wrote: > > On Tue, Jan 21, 2025 at 02:49:13PM +0100, Vlastimil Babka wrote: > > > On 1/21/25 2:33 PM, Uladzislau Rezki wrote: > > > > On Mon, Jan 20, 2025 at 11:06:13PM +0100, Vlastimil Babka wrote: > > > >> On 12/16/24 17:46, Paul E. McKenney wrote: > > > >>> On Mon, Dec 16, 2024 at 04:55:06PM +0100, Uladzislau Rezki wrote: > > > On Mon, Dec 16, 2024 at 04:44:41PM +0100, Vlastimil Babka wrote: > > > > On 12/16/24 16:41, Uladzislau Rezki wrote: > > > >> On Mon, Dec 16, 2024 at 03:20:44PM +0100, Vlastimil Babka wrote: > > > >>> On 12/16/24 12:03, Uladzislau Rezki wrote: > > > On Sun, Dec 15, 2024 at 06:30:02PM +0100, Vlastimil Babka wrote: > > > > > > > Also how about a followup patch moving the rcu-tiny > > > > implementation of > > > > kvfree_call_rcu()? > > > > > > > As, Paul already noted, it would make sense. Or just remove a > > > tiny > > > implementation. > > > >>> > > > >>> AFAICS tiny rcu is for !SMP systems. Do they benefit from the > > > >>> "full" > > > >>> implementation with all the batching etc or would that be > > > >>> unnecessary overhead? > > > >>> > > > >> Yes, it is for a really small systems with low amount of memory. I > > > >> see > > > >> only one overhead it is about driving objects in pages. For a small > > > >> system it can be critical because we allocate. > > > >> > > > >> From the other hand, for a tiny variant we can modify the normal > > > >> variant > > > >> by bypassing batching logic, thus do not consume memory(for Tiny > > > >> case) > > > >> i.e. merge it to a normal kvfree_rcu() path. > > > > > > > > Maybe we could change it to use CONFIG_SLUB_TINY as that has > > > > similar use > > > > case (less memory usage on low memory system, tradeoff for worse > > > > performance). > > > > > > > Yep, i also was thinking about that without saying it :) > > > >>> > > > >>> Works for me as well! > > > >> > > > >> Hi, so I tried looking at this. First I just moved the code to slab as > > > >> seen > > > >> in the top-most commit here [1]. Hope the non-inlined > > > >> __kvfree_call_rcu() is > > > >> not a show-stopper here. > > > >> > > > >> Then I wanted to switch the #ifdefs from CONFIG_TINY_RCU to > > > >> CONFIG_SLUB_TINY > > > >> to control whether we use the full blown batching implementation or the > > > >> simple call_rcu() implmentation, and realized it's not straightforward > > > >> and > > > >> reveals there are still some subtle dependencies of kvfree_rcu() on RCU > > > >> internals :) > > > >> > > > >> Problem 1: !CONFIG_SLUB_TINY with CONFIG_TINY_RCU > > > >> > > > >> AFAICS the batching implementation includes > > > >> kfree_rcu_scheduler_running() > > > >> which is called from rcu_set_runtime_mode() but only on TREE_RCU. > > > >> Perhaps > > > >> there are other facilities the batching implementation needs that only > > > >> exists in the TREE_RCU implementation > > > >> > > > >> Possible solution: batching implementation depends on both > > > >> !CONFIG_SLUB_TINY > > > >> and !CONFIG_TINY_RCU. I think it makes sense as both !SMP systems and > > > >> small > > > >> memory systems are fine with the simple implementation. > > > >> > > > >> Problem 2: CONFIG_TREE_RCU with !CONFIG_SLUB_TINY > > > >> > > > >> AFAICS I can't just make the simple implementation do call_rcu() on > > > >> CONFIG_TREE_RCU, because call_rcu() no longer knows how to handle the > > > >> fake > > > >> callback (__is_kvfree_rcu_offset()) - I see how rcu_reclaim_tiny() > > > >> does that > > > >> but no such equivalent exists in TREE_RCU. Am I right? > > > >> > > > >> Possible solution: teach TREE_RCU callback invocation to handle > > > >> __is_kvfree_rcu_offset() again, perhaps hide that branch behind #ifndef > > > >> CONFIG_SLUB_TINY to avoid overhead if the batching implementation is > > > >> used. > > > >> Downside: we visibly demonstrate how kvfree_rcu() is not purely a slab > > > >> thing > > > >> but RCU has to special case it still. > > > >> > > > >> Possible solution 2: instead of the special offset handling, SLUB > > > >> provides a > > > >> callback function, which will determine pointer to the object from the > > > >> pointer to a middle of it without knowing the rcu_head offset. > > > >> Downside: this will have some overhead, but SLUB_TINY is not meant to > > > >> be > > > >> performant anyway so we might not care. > > > >> Upside: we can remove __is_kvfree_rcu_offset() from TINY_RCU as well > > > >> > > > >> Thoughts? > > > >> > > > > For the call_rcu() and to be able to reclaim over it we need to patch > > > > the > > > > tree.c(please note TINY already works): > > > > > > > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > > > index b1f883f
Re: [GIT PULL] RCU changes for v6.14
On Tue, Jan 21, 2025 at 02:51:15PM -0800, Linus Torvalds wrote: > On Fri, 17 Jan 2025 at 09:24, Uladzislau Rezki (Sony) > wrote: > > > > Please pull the latest RCU git tree from: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/rcu/linux.git > > tags/rcu.release.v6.14 > > Hmm. I have pulled this, and the resulting shortlog matches what you > say I should have gotten. > > But your diffstat is odd and looks very wrong. > > Things like this: > > > tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c | 395 > > + > > were already in my tree from before, and aren't new (and your shortlog > doesn't mention them). > > Also, look at that line. It's long. It's not supposed to be that long. > You've presumably generated this (incorrect) diffstat in a very wide > terminal, and then cut-and-pasted that very wide output into the > email. > > Please don't do that. Yes, git will default to wide output if the > output is a wide terminal, but if you either pipe it into something > else (may I suggest "xsel", so that it goes directly into your X > clipboard). > > Or use "--stat=80" to limit the width manually to 80 columns. > > But the strange formatting is the less worrisome issue. The diffstat > just being plain wrong is the more alarming thing. > Thank you for taking this in! My script that generates a diff-stat is not perfect, i will fix it. Sorry this is my bad. -- Uladzislau Rezki
Re: [PATCH v4 4/4] rust: add parameter support to the `module!` macro
On 1/9/25 11:54, Andreas Hindborg wrote: > This patch includes changes required for Rust kernel modules to utilize > module parameters. This code implements read only support for integer > types without `sysfs` support. > > Signed-off-by: Andreas Hindborg > --- > rust/kernel/lib.rs | 1 + > rust/kernel/module_param.rs | 225 > +++ > rust/macros/helpers.rs | 14 +++ > rust/macros/lib.rs | 31 ++ > rust/macros/module.rs| 188 > samples/rust/rust_minimal.rs | 10 ++ > 6 files changed, 451 insertions(+), 18 deletions(-) > > diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs > index > 4253c9a7fe7df935dc714bc3036d299bd80054a0..707ec81411c7aecccdd14e02cbf011642bc02e07 > 100644 > --- a/rust/kernel/lib.rs > +++ b/rust/kernel/lib.rs > @@ -46,6 +46,7 @@ > pub mod kunit; > pub mod list; > pub mod miscdevice; > +pub mod module_param; > #[cfg(CONFIG_NET)] > pub mod net; > pub mod once_lite; > diff --git a/rust/kernel/module_param.rs b/rust/kernel/module_param.rs > new file mode 100644 > index > ..6cb64096090a37c4f9c4ce73e23b7ac049e32d20 > --- /dev/null > +++ b/rust/kernel/module_param.rs > @@ -0,0 +1,225 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +//! Types for module parameters. > +//! > +//! C header: > [`include/linux/moduleparam.h`](srctree/include/linux/moduleparam.h) > + > +use crate::prelude::*; > +use crate::str::BStr; > + > +/// Newtype to make `bindings::kernel_param` [`Sync`]. > +#[repr(transparent)] > +#[doc(hidden)] > +pub struct RacyKernelParam(pub ::kernel::bindings::kernel_param); > + > +// SAFETY: C kernel handles serializing access to this type. We never access > +// from Rust module. > +unsafe impl Sync for RacyKernelParam {} > + > +/// Types that can be used for module parameters. > +/// > +/// Note that displaying the type in `sysfs` will fail if > +/// [`Display`](core::fmt::Display) implementation would write more than > +/// [`PAGE_SIZE`] - 1 bytes. > +/// > +/// [`PAGE_SIZE`]: `bindings::PAGE_SIZE` > +pub trait ModuleParam: Sized { > +/// The [`ModuleParam`] will be used by the kernel module through this > type. > +/// > +/// This may differ from `Self` if, for example, `Self` needs to track > +/// ownership without exposing it or allocate extra space for other > possible > +/// parameter values. > +// This is required to support string parameters in the future. > +type Value: ?Sized; > + > +/// Parse a parameter argument into the parameter value. > +/// > +/// `Err(_)` should be returned when parsing of the argument fails. > +/// > +/// Parameters passed at boot time will be set before [`kmalloc`] is > +/// available (even if the module is loaded at a later time). However, in > +/// this case, the argument buffer will be valid for the entire lifetime > of > +/// the kernel. So implementations of this method which need to allocate > +/// should first check that the allocator is available (with > +/// [`crate::bindings::slab_is_available`]) and when it is not available > +/// provide an alternative implementation which doesn't allocate. In > cases > +/// where the allocator is not available it is safe to save references to > +/// `arg` in `Self`, but in other cases a copy should be made. > +/// > +/// [`kmalloc`]: srctree/include/linux/slab.h > +fn try_from_param_arg(arg: &'static [u8]) -> Result; > +} > + > +/// Set the module parameter from a string. > +/// > +/// Used to set the parameter value at kernel initialization, when loading > +/// the module or when set through `sysfs`. > +/// > +/// `param.arg` is a pointer to `*mut T` as set up by the [`module!`] > +/// macro. > +/// > +/// See `struct kernel_param_ops.set`. > +/// > +/// # Safety > +/// > +/// If `val` is non-null then it must point to a valid null-terminated > +/// string. The `arg` field of `param` must be an instance of `T`. > +/// > +/// # Invariants > +/// > +/// Currently, we only support read-only parameters that are not readable > +/// from `sysfs`. Thus, this function is only called at kernel > +/// initialization time, or at module load time, and we have exclusive > +/// access to the parameter for the duration of the function. > +/// > +/// [`module!`]: macros::module > +unsafe extern "C" fn set_param( > +val: *const core::ffi::c_char, > +param: *const crate::bindings::kernel_param, > +) -> core::ffi::c_int > +where > +T: ModuleParam, > +{ > +// NOTE: If we start supporting arguments without values, val _is_ > allowed > +// to be null here. > +if val.is_null() { > +crate::pr_warn_once!("Null pointer passed to > `module_param::set_param`"); > +return crate::error::code::EINVAL.to_errno(); > +} > + > +// SAFETY: By function safety requirement, val is non-null and > +// null-terminated. By C API c
[PATCH v2 1/3] selftests/mm: make file-backed THP split work by writing PMD size data
Commit acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") changes huge=always to allocate THP/mTHP based on write size and split_huge_page_test does not write PMD size data, so file-back THP is not created during the test. Fix it by writing PMD size data. Signed-off-by: Zi Yan --- V1 -> V2: write PMD size data instead of setting /sys/kernel/mm/transparent_hugepage/shmem_enabled to "force". .../selftests/mm/split_huge_page_test.c | 52 --- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 3f353f3d070f..ba498aaaf857 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -265,14 +265,28 @@ void split_file_backed_thp(void) { int status; int fd; - ssize_t num_written; char tmpfs_template[] = "/tmp/thp_split_XX"; const char *tmpfs_loc = mkdtemp(tmpfs_template); char testfile[INPUT_MAX]; + ssize_t num_written, num_read; + char *file_buf1, *file_buf2; uint64_t pgoff_start = 0, pgoff_end = 1024; + int i; ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n"); + file_buf1 = (char *)malloc(pmd_pagesize); + file_buf2 = (char *)malloc(pmd_pagesize); + + if (!file_buf1 || !file_buf2) { + ksft_print_msg("cannot allocate file buffers\n"); + goto out; + } + + for (i = 0; i < pmd_pagesize; i++) + file_buf1[i] = (char)i; + memset(file_buf2, 0, pmd_pagesize); + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); if (status) @@ -281,26 +295,45 @@ void split_file_backed_thp(void) status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); if (status >= INPUT_MAX) { ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n"); + goto cleanup; } - fd = open(testfile, O_CREAT|O_WRONLY, 0664); + fd = open(testfile, O_CREAT|O_RDWR, 0664); if (fd == -1) { ksft_perror("Cannot open testing file"); goto cleanup; } - /* write something to the file, so a file-backed THP can be allocated */ - num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); - close(fd); + /* write pmd size data to the file, so a file-backed THP can be allocated */ + num_written = write(fd, file_buf1, pmd_pagesize); - if (num_written < 1) { - ksft_perror("Fail to write data to testing file"); - goto cleanup; + if (num_written == -1 || num_written != pmd_pagesize) { + ksft_perror("Failed to write data to testing file"); + goto close_file; } /* split the file-backed THP */ write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, 0); + /* check file content after split */ + status = lseek(fd, 0, SEEK_SET); + if (status == -1) { + ksft_perror("Cannot lseek file"); + goto close_file; + } + + num_read = read(fd, file_buf2, num_written); + if (num_read == -1 || num_read != num_written) { + ksft_perror("Cannot read file content back"); + goto close_file; + } + + if (strncmp(file_buf1, file_buf2, pmd_pagesize) != 0) { + ksft_print_msg("File content changed\n"); + goto close_file; + } + + close(fd); status = unlink(testfile); if (status) { ksft_perror("Cannot remove testing file"); @@ -321,9 +354,12 @@ void split_file_backed_thp(void) ksft_test_result_pass("File-backed THP split test done\n"); return; +close_file: + close(fd); cleanup: umount(tmpfs_loc); rmdir(tmpfs_loc); +out: ksft_exit_fail_msg("Error occurred\n"); } -- 2.45.2
[PATCH v2 2/3] mm/huge_memory: allow split shmem large folio to any lower order
Commit 4d684b5f92ba ("mm: shmem: add large folio support for tmpfs") has added large folio support to shmem. Remove the restriction in split_huge_page*(). Signed-off-by: Zi Yan Reviewed-by: Baolin Wang --- mm/huge_memory.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d3ebdc002d5..deb4e72daeb9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3299,7 +3299,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Some pages can be beyond EOF: drop them from page cache */ if (tail->index >= end) { if (shmem_mapping(folio->mapping)) - nr_dropped++; + nr_dropped += new_nr; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); @@ -3465,12 +3465,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, return -EINVAL; } } else if (new_order) { - /* Split shmem folio to non-zero order not supported */ - if (shmem_mapping(folio->mapping)) { - VM_WARN_ONCE(1, - "Cannot split shmem folio to non-0 order"); - return -EINVAL; - } /* * No split if the file system does not support large folio. * Note that we might still have THPs in such mappings due to -- 2.45.2
[PATCH v2 3/3] selftests/mm: test splitting file-backed THP to any lower order.
Now split_huge_page*() supports shmem THP split to any lower order. Test it. The test now reads file content out after split to check if the split corrupts the file data. Signed-off-by: Zi Yan Reviewed-by: Baolin Wang Tested-by: Baolin Wang --- tools/testing/selftests/mm/split_huge_page_test.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index ba498aaaf857..e0304046b1a0 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -261,7 +261,7 @@ void split_pte_mapped_thp(void) close(kpageflags_fd); } -void split_file_backed_thp(void) +void split_file_backed_thp(int order) { int status; int fd; @@ -313,7 +313,7 @@ void split_file_backed_thp(void) } /* split the file-backed THP */ - write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, 0); + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, order); /* check file content after split */ status = lseek(fd, 0, SEEK_SET); @@ -351,7 +351,7 @@ void split_file_backed_thp(void) ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno)); ksft_print_msg("Please check dmesg for more information\n"); - ksft_test_result_pass("File-backed THP split test done\n"); + ksft_test_result_pass("File-backed THP split to order %d test done\n", order); return; close_file: @@ -517,7 +517,7 @@ int main(int argc, char **argv) if (argc > 1) optional_xfs_path = argv[1]; - ksft_set_plan(1+8+2+9); + ksft_set_plan(1+8+1+9+9); pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; @@ -534,7 +534,8 @@ int main(int argc, char **argv) split_pmd_thp_to_order(i); split_pte_mapped_thp(); - split_file_backed_thp(); + for (i = 0; i < 9; i++) + split_file_backed_thp(i); created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template, &fs_loc); -- 2.45.2
[PATCHv2 net] Bonding: Fix support for gso_partial_features
The fixed commit adds NETIF_F_GSO_ESP bit for bonding gso_partial_features. However, if we don't set the dev NETIF_F_GSO_PARTIAL bit, the later netdev_change_features() -> netdev_fix_features() will remove the NETIF_F_GSO_ESP bit from the dev features. This causes ethtool to show that the bond does not support tx-esp-segmentation. For example # ethtool -k bond0 | grep esp tx-esp-segmentation: off [requested on] esp-hw-offload: on esp-tx-csum-hw-offload: on Add the NETIF_F_GSO_PARTIAL bit to bond dev features when set gso_partial_features to fix this issue. Fixes: 4861333b4217 ("bonding: add ESP offload features when slaves support") Reported-by: Liang Li Signed-off-by: Hangbin Liu --- v2: remove NETIF_F_GSO_PARTIAL bit if not set gso_partial_features. The issue is reported internally, so there is no Closes tag. BTW, I saw some drivers set NETIF_F_GSO_PARTIAL on dev->features. Some other drivers set NETIF_F_GSO_PARTIAL on dev->hw_enc_features. I haven't see a doc about where we should set. So I just set it on dev->features. --- drivers/net/bonding/bond_main.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 7b78c2bada81..09d5a8433d86 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1598,10 +1598,13 @@ static void bond_compute_features(struct bonding *bond) } bond_dev->hard_header_len = max_hard_header_len; - if (gso_partial_features & NETIF_F_GSO_ESP) + if (gso_partial_features & NETIF_F_GSO_ESP) { bond_dev->gso_partial_features |= NETIF_F_GSO_ESP; - else + bond_dev->features |= NETIF_F_GSO_PARTIAL; + } else { bond_dev->gso_partial_features &= ~NETIF_F_GSO_ESP; + bond_dev->features &= ~NETIF_F_GSO_PARTIAL; + } done: bond_dev->vlan_features = vlan_features; -- 2.39.5 (Apple Git-154)
[RFC 1/5] vduse: add virtio_fs to allowed dev id
A VDUSE device that implements virtiofs device works fine just by adding the device id to the whitelist. Signed-off-by: Eugenio Pérez --- drivers/vdpa/vdpa_user/vduse_dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 7ae99691efdf..6a9a37351310 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -144,6 +144,7 @@ static struct workqueue_struct *vduse_irq_bound_wq; static u32 allowed_device_id[] = { VIRTIO_ID_BLOCK, VIRTIO_ID_NET, + VIRTIO_ID_FS, }; static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa) -- 2.48.1
[RFC 0/5] virtiofs: map buffer out of virtqueue lock
This is useful for some setups like swiotlb or VDUSE where the DMA operations are expensive and/or need to be performed with a write lock. After applying this patch, fio read test goes from 1201MiB/s to 1211MiB/s. --- Sending this series to obtain feedback if this is the right way to go. Q: Do we need virtqueue_map_sgs? XDP directly call dma_map. Need to profile. TODO: Profile multiqueue. TODO: Handling errors. TODO: Do the same for hiprio queue. TODO: Can we batch maps? virtiofs always sends many buffers. Eugenio Pérez (4): vduse: add virtio_fs to allowed dev id virtiofs: Move stack sg to fuse_req virtio_ring: add virtqueue premapped out/in buffers support virtiofs: perform DMA operations out of the spinlock Jason Wang (1): virtio_ring: introduce virtqueue_map/unmap_sgs() drivers/vdpa/vdpa_user/vduse_dev.c | 1 + drivers/virtio/virtio_ring.c | 165 ++--- fs/fuse/fuse_i.h | 7 ++ fs/fuse/virtio_fs.c| 68 +++- include/linux/virtio.h | 17 +++ 5 files changed, 220 insertions(+), 38 deletions(-) -- 2.48.1
[RFC 3/5] virtiofs: Move stack sg to fuse_req
We need to move the map and unmap out of virtiofs queue lock. We need to store the unmap sgs to call virtqueue_unmap_sgs, so use the request itself. TODO: We need to store the forget sgs too. In my tests it is not called so we're safe. Find a way to force-call it. Signed-off-by: Eugenio Pérez --- fs/fuse/fuse_i.h| 7 +++ fs/fuse/virtio_fs.c | 45 + 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74744c6f2860..e57664daa761 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -434,6 +435,12 @@ struct fuse_req { #if IS_ENABLED(CONFIG_VIRTIO_FS) /** virtio-fs's physically contiguous buffer for in and out args */ void *argbuf; + + /** virtio-fs's pre-mapped stuff */ + struct scatterlist sg_inline_data[6]; /* optimization for short requests */ + struct scatterlist *sg; + unsigned int out_sgs; + unsigned int in_sgs; #endif /** fuse_mount this request belongs to */ diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 82afe78ec542..1344c5782a7c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1377,14 +1377,10 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, { /* requests need at least 4 elements */ struct scatterlist *stack_sgs[6]; - struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; struct scatterlist **sgs = stack_sgs; - struct scatterlist *sg = stack_sg; struct virtqueue *vq; struct fuse_args *args = req->args; unsigned int argbuf_used = 0; - unsigned int out_sgs = 0; - unsigned int in_sgs = 0; unsigned int total_sgs; unsigned int i; int ret; @@ -1392,11 +1388,13 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_pqueue *fpq; /* Does the sglist fit on the stack? */ + /* TODO replace magic 6 by a macro */ + req->sg = req->sg_inline_data; total_sgs = sg_count_fuse_req(req); - if (total_sgs > ARRAY_SIZE(stack_sgs)) { + if (total_sgs > 6) { sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); - sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); - if (!sgs || !sg) { + req->sg = kmalloc_array(total_sgs, sizeof(req->sg[0]), gfp); + if (!sgs || !req->sg) { ret = -ENOMEM; goto out; } @@ -1408,26 +1406,25 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, goto out; /* Request elements */ - sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); - out_sgs += sg_init_fuse_args(&sg[out_sgs], req, -(struct fuse_arg *)args->in_args, -args->in_numargs, args->in_pages, -req->argbuf, &argbuf_used); + sg_init_one(&req->sg[req->out_sgs++], &req->in.h, sizeof(req->in.h)); + req->out_sgs += sg_init_fuse_args(&req->sg[req->out_sgs], req, + (struct fuse_arg *)args->in_args, + args->in_numargs, args->in_pages, + req->argbuf, &argbuf_used); /* Reply elements */ if (test_bit(FR_ISREPLY, &req->flags)) { - sg_init_one(&sg[out_sgs + in_sgs++], + sg_init_one(&req->sg[req->out_sgs + req->in_sgs++], &req->out.h, sizeof(req->out.h)); - in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, - args->out_args, args->out_numargs, - args->out_pages, - req->argbuf + argbuf_used, NULL); + req->in_sgs += sg_init_fuse_args(&req->sg[req->out_sgs + req->in_sgs], req, +args->out_args, args->out_numargs, +args->out_pages, +req->argbuf + argbuf_used, NULL); } - WARN_ON(out_sgs + in_sgs != total_sgs); - for (i = 0; i < total_sgs; i++) - sgs[i] = &sg[i]; + sgs[i] = &req->sg[i]; + WARN_ON(req->out_sgs + req->in_sgs != total_sgs); spin_lock(&fsvq->lock); @@ -1438,7 +1435,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, } vq = fsvq->vq; - ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); + ret = virtqueue_add_sgs(vq, sgs, req->out_sgs, req->in_sgs, req, GFP_ATOMIC); if (ret < 0) { spin_unlock(&fsvq->lock); goto out; @@ -146
[RFC 4/5] virtio_ring: add virtqueue premapped out/in buffers support
Convenience function to add chains that mixes out and in buffers. Signed-off-by: Eugenio Pérez --- drivers/virtio/virtio_ring.c | 35 +++ include/linux/virtio.h | 7 +++ 2 files changed, 42 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 05729bc5cbb1..e49912fa77c5 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2385,6 +2385,41 @@ static inline int virtqueue_add(struct virtqueue *_vq, out_sgs, in_sgs, data, ctx, premapped, gfp); } +/** + * virtqueue_add_sgs_premapped - expose buffers to other end + * @_vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_sgs: the number of scatterlists readable by other side + * @in_sgs: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_sgs_premapped(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_sg = 0; + + /* Count them first. */ + for (i = 0; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_sg++; + } + return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, +data, NULL, true, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs_premapped); + /** * virtqueue_add_sgs - expose buffers to other end * @_vq: the struct virtqueue we're talking about. diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 28db998d691e..de52a0f6f734 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -84,6 +84,13 @@ int virtqueue_add_sgs(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_sgs_premapped(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp); + struct device *virtqueue_dma_dev(struct virtqueue *vq); bool virtqueue_kick(struct virtqueue *vq); -- 2.48.1
[RFC 5/5] virtiofs: perform DMA operations out of the spinlock
This is useful for some setups like swiotlb or VDUSE where the DMA operations are expensive and/or need to be performed with a write lock. After applying this patch, fio read test goes from 1201MiB/s to 1211MiB/s. Signed-off-by: Eugenio Pérez --- drivers/virtio/virtio_ring.c | 2 ++ fs/fuse/virtio_fs.c | 25 +++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index e49912fa77c5..eb22bfcb9100 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -580,6 +580,7 @@ int virtqueue_map_sgs(struct virtqueue *_vq, goto unmap_release; sg_dma_address(sg) = addr; + sg_dma_len(sg) = sg->length; mapped_sg++; } } @@ -592,6 +593,7 @@ int virtqueue_map_sgs(struct virtqueue *_vq, goto unmap_release; sg_dma_address(sg) = addr; + sg_dma_len(sg) = sg->length; mapped_sg++; } } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 1344c5782a7c..2b558b05d0f8 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -836,8 +836,21 @@ static void virtio_fs_requests_done_work(struct work_struct *work) /* End requests */ list_for_each_entry_safe(req, next, &reqs, list) { + struct scatterlist *stack_sgs[6]; + struct scatterlist **sgs = stack_sgs; + unsigned int total_sgs = req->out_sgs + req->in_sgs; + list_del_init(&req->list); + /* TODO replace magic 6 by a macro */ + if (total_sgs > 6) + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); + + for (unsigned int i = 0; i < total_sgs; ++i) + sgs[i] = &req->sg[i]; + + virtqueue_unmap_sgs(vq, sgs, req->out_sgs, req->in_sgs); + /* blocking async request completes in a worker context */ if (req->args->may_block) { struct virtio_fs_req_work *w; @@ -850,6 +863,9 @@ static void virtio_fs_requests_done_work(struct work_struct *work) } else { virtio_fs_request_complete(req, fsvq); } + + if (sgs != stack_sgs) + kfree(sgs); } /* Try to push previously queued requests, as the queue might no longer be full */ @@ -1426,6 +1442,11 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, sgs[i] = &req->sg[i]; WARN_ON(req->out_sgs + req->in_sgs != total_sgs); + // TODO can we change this ptr out of the lock? + vq = fsvq->vq; + // TODO handle this and following errors + ret = virtqueue_map_sgs(vq, sgs, req->out_sgs, req->in_sgs); + BUG_ON(ret < 0); spin_lock(&fsvq->lock); if (!fsvq->connected) { @@ -1434,8 +1455,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, goto out; } - vq = fsvq->vq; - ret = virtqueue_add_sgs(vq, sgs, req->out_sgs, req->in_sgs, req, GFP_ATOMIC); + ret = virtqueue_add_sgs_premapped(vq, sgs, req->out_sgs, + req->in_sgs, req, GFP_ATOMIC); if (ret < 0) { spin_unlock(&fsvq->lock); goto out; -- 2.48.1
Re: [PATCH 1/3] selftests/mm: make file-backed THP split work by setting force option
On Wed Jan 22, 2025 at 10:27 AM EST, David Hildenbrand wrote: > On 22.01.25 16:16, Zi Yan wrote: >> On Wed Jan 22, 2025 at 9:26 AM EST, David Hildenbrand wrote: >>> On 22.01.25 13:40, Zi Yan wrote: Commit acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") changes huge=always to allocate THP/mTHP based on write size and split_huge_page_test does not write PMD size data, so file-back THP is not created during the test. >>> >>> Just curious, why can't we write PMD size data instead, to avoid messing >>> with the "force" option? >> >> It also works. I used "force", because I notice that it is intended for >> testing. Using it might be more future proof, in case huge=always changes >> its semantics again in the future. > > I recall discussing with Hugh in an upstream call that "force" is a > relict from older times, so naturally I would have just adjusted the > test case to trigger the PMD scenario. No strong opinion, though, was > just wondering. Got it. Let me change it and resend. Thank you for the feedback. -- Best Regards, Yan, Zi
Re: [PATCH] vduse: add virtio_fs to allowed dev id
On Tue, Jan 21, 2025 at 11:33:46AM +0100, Eugenio Pérez wrote: > A VDUSE device that implements virtiofs device works fine just by > adding the device id to the whitelist. The virtiofs DAX Window feature might require extra work. It is not widely enabled though, so must virtiofs devices will work fine with just virtqueue and configuration space support. Reviewed-by: Stefan Hajnoczi signature.asc Description: PGP signature
Re: [PATCH v5 01/10] selftests/mm: make file-backed THP split work by setting force option
On Wed Jan 22, 2025 at 1:32 AM EST, Baolin Wang wrote: > > > On 2025/1/17 05:10, Zi Yan wrote: >> Commit acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") >> changes huge=always to allocate THP/mTHP based on write size and >> split_huge_page_test does not write PMD size data, so file-back THP is not >> created during the test. >> >> Set /sys/kernel/mm/transparent_hugepage/shmem_enabled to "force" to force >> THP allocation. >> >> Signed-off-by: Zi Yan >> --- >> .../selftests/mm/split_huge_page_test.c | 47 +-- >> 1 file changed, 43 insertions(+), 4 deletions(-) >> >> diff --git a/tools/testing/selftests/mm/split_huge_page_test.c >> b/tools/testing/selftests/mm/split_huge_page_test.c >> index 3f353f3d070f..eccaa347140b 100644 >> --- a/tools/testing/selftests/mm/split_huge_page_test.c >> +++ b/tools/testing/selftests/mm/split_huge_page_test.c >> @@ -264,15 +264,46 @@ void split_pte_mapped_thp(void) >> void split_file_backed_thp(void) >> { >> int status; >> -int fd; >> -ssize_t num_written; >> +int fd, shmem_sysctl_fd; >> +ssize_t num_written, num_read; >> char tmpfs_template[] = "/tmp/thp_split_XX"; >> const char *tmpfs_loc = mkdtemp(tmpfs_template); >> -char testfile[INPUT_MAX]; >> +char testfile[INPUT_MAX], sysctl_buf[INPUT_MAX] = {0}; >> uint64_t pgoff_start = 0, pgoff_end = 1024; >> +const char *shmem_sysctl = >> "/sys/kernel/mm/transparent_hugepage/shmem_enabled"; >> +char *opt1, *opt2; >> >> ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() >> for more info.\n"); >> >> +shmem_sysctl_fd = open(shmem_sysctl, O_RDWR); >> +if (shmem_sysctl_fd == -1) { >> +ksft_perror("cannot open shmem sysctl"); >> +goto out; >> +} >> + >> +num_read = read(shmem_sysctl_fd, sysctl_buf, INPUT_MAX); >> +if (num_read < 1) { >> +ksft_perror("Failed to read shmem sysctl"); > > You should close() shmem_sysctl_fd before returning. Ack. > >> +goto out; >> +} >> + >> +opt1 = strchr(sysctl_buf, '['); >> +opt2 = strchr(sysctl_buf, ']'); >> +if (!opt1 || !opt2) { > > Ditto. Ack. > >> +ksft_perror("cannot read shmem sysctl config"); >> +goto out; >> +} >> + >> +/* get existing shmem sysctl config into sysctl_buf */ >> +strncpy(sysctl_buf, opt1 + 1, opt2 - opt1 - 1); >> +memset(sysctl_buf + (opt2 - opt1 - 1), 0, INPUT_MAX); >> + >> +num_written = write(shmem_sysctl_fd, "force", sizeof("force")); >> +if (num_written < 1) { >> +ksft_perror("Fail to write force to shmem sysctl"); >> +goto out; >> +} >> + >> status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); >> >> if (status) >> @@ -317,13 +348,21 @@ void split_file_backed_thp(void) >> if (status) >> ksft_exit_fail_msg("cannot remove tmp dir: %s\n", >> strerror(errno)); >> >> +num_written = write(shmem_sysctl_fd, sysctl_buf, strlen(sysctl_buf) + >> 1); >> +if (num_written < 1) >> +ksft_perror("Fail to restore shmem sysctl"); >> + >> ksft_print_msg("Please check dmesg for more information\n"); >> -ksft_test_result_pass("File-backed THP split test done\n"); >> +ksft_test_result_pass("File-backed THP split to order %d test done\n", >> order); > > Seems the patch set split has issues. No 'order' variable in this patch. > > Anyway, I've fixed these issues in my local tree, and it works well. If > you fix them in the next version, please feel free to add: > Reviewed-by: Baolin Wang > Tested-by: Baolin Wang Sure. Thank you for the review and testing. I will fix the above issues and resend another version. First three patches can be merged first. > > >> return; >> >> cleanup: >> +num_written = write(shmem_sysctl_fd, sysctl_buf, strlen(sysctl_buf) + >> 1); >> +if (num_written < 1) >> +ksft_perror("Fail to restore shmem sysctl"); >> umount(tmpfs_loc); >> rmdir(tmpfs_loc); >> +out: >> ksft_exit_fail_msg("Error occurred\n"); >> } >> -- Best Regards, Yan, Zi
Re: [PATCH 1/3] selftests/mm: make file-backed THP split work by setting force option
On 22.01.25 16:16, Zi Yan wrote: On Wed Jan 22, 2025 at 9:26 AM EST, David Hildenbrand wrote: On 22.01.25 13:40, Zi Yan wrote: Commit acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") changes huge=always to allocate THP/mTHP based on write size and split_huge_page_test does not write PMD size data, so file-back THP is not created during the test. Just curious, why can't we write PMD size data instead, to avoid messing with the "force" option? It also works. I used "force", because I notice that it is intended for testing. Using it might be more future proof, in case huge=always changes its semantics again in the future. I recall discussing with Hugh in an upstream call that "force" is a relict from older times, so naturally I would have just adjusted the test case to trigger the PMD scenario. No strong opinion, though, was just wondering. -- Cheers, David / dhildenb
[PATCH 3/3] selftests/mm: test splitting file-backed THP to any lower order.
Now split_huge_page*() supports shmem THP split to any lower order. Test it. The test now reads file content out after split to check if the split corrupts the file data. Signed-off-by: Zi Yan Reviewed-by: Baolin Wang Tested-by: Baolin Wang --- .../selftests/mm/split_huge_page_test.c | 32 ++- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 8e15fc9dce3a..13ad5ac7d178 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -261,14 +261,16 @@ void split_pte_mapped_thp(void) close(kpageflags_fd); } -void split_file_backed_thp(void) +void split_file_backed_thp(int order) { int status; int fd, shmem_sysctl_fd; ssize_t num_written, num_read; char tmpfs_template[] = "/tmp/thp_split_XX"; const char *tmpfs_loc = mkdtemp(tmpfs_template); - char testfile[INPUT_MAX], sysctl_buf[INPUT_MAX] = {0}; + char testfile[INPUT_MAX]; + char sysctl_buf[INPUT_MAX] = {0}; + char file_buf[INPUT_MAX] = {0}; uint64_t pgoff_start = 0, pgoff_end = 1024; const char *shmem_sysctl = "/sys/kernel/mm/transparent_hugepage/shmem_enabled"; char *opt1, *opt2; @@ -314,7 +316,7 @@ void split_file_backed_thp(void) ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n"); } - fd = open(testfile, O_CREAT|O_WRONLY, 0664); + fd = open(testfile, O_CREAT|O_RDWR, 0664); if (fd == -1) { ksft_perror("Cannot open testing file"); goto cleanup; @@ -322,7 +324,6 @@ void split_file_backed_thp(void) /* write something to the file, so a file-backed THP can be allocated */ num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); - close(fd); if (num_written < 1) { ksft_perror("Fail to write data to testing file"); @@ -330,8 +331,22 @@ void split_file_backed_thp(void) } /* split the file-backed THP */ - write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, 0); + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, order); + + /* check file content after split */ + num_read = lseek(fd, 0, SEEK_SET); + if (num_read == -1) { + ksft_perror("Cannot lseek file"); + goto cleanup; + } + num_read = read(fd, file_buf, num_written); + if (num_read < 1 || strncmp(file_buf, tmpfs_loc, num_read)) { + ksft_print_msg("File content changed, origin: %s, now: %s\n", tmpfs_loc, file_buf); + goto cleanup; + } + + close(fd); status = unlink(testfile); if (status) { ksft_perror("Cannot remove testing file"); @@ -354,7 +369,7 @@ void split_file_backed_thp(void) close(shmem_sysctl_fd); ksft_print_msg("Please check dmesg for more information\n"); - ksft_test_result_pass("File-backed THP split test done\n"); + ksft_test_result_pass("File-backed THP split to order %d test done\n", order); return; cleanup: @@ -523,7 +538,7 @@ int main(int argc, char **argv) if (argc > 1) optional_xfs_path = argv[1]; - ksft_set_plan(1+8+2+9); + ksft_set_plan(1+8+1+9+9); pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; @@ -540,7 +555,8 @@ int main(int argc, char **argv) split_pmd_thp_to_order(i); split_pte_mapped_thp(); - split_file_backed_thp(); + for (i = 0; i < 9; i++) + split_file_backed_thp(i); created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template, &fs_loc); -- 2.45.2
[PATCH 2/3] mm/huge_memory: allow split shmem large folio to any lower order
Commit 4d684b5f92ba ("mm: shmem: add large folio support for tmpfs") has added large folio support to shmem. Remove the restriction in split_huge_page*(). Signed-off-by: Zi Yan Reviewed-by: Baolin Wang --- mm/huge_memory.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d3ebdc002d5..deb4e72daeb9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3299,7 +3299,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Some pages can be beyond EOF: drop them from page cache */ if (tail->index >= end) { if (shmem_mapping(folio->mapping)) - nr_dropped++; + nr_dropped += new_nr; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); @@ -3465,12 +3465,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, return -EINVAL; } } else if (new_order) { - /* Split shmem folio to non-zero order not supported */ - if (shmem_mapping(folio->mapping)) { - VM_WARN_ONCE(1, - "Cannot split shmem folio to non-0 order"); - return -EINVAL; - } /* * No split if the file system does not support large folio. * Note that we might still have THPs in such mappings due to -- 2.45.2
[PATCH 1/3] selftests/mm: make file-backed THP split work by setting force option
Commit acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") changes huge=always to allocate THP/mTHP based on write size and split_huge_page_test does not write PMD size data, so file-back THP is not created during the test. Set /sys/kernel/mm/transparent_hugepage/shmem_enabled to "force" to force THP allocation. Signed-off-by: Zi Yan Reviewed-by: Baolin Wang Tested-by: Baolin Wang --- .../selftests/mm/split_huge_page_test.c | 48 +-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 3f353f3d070f..8e15fc9dce3a 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -264,15 +264,46 @@ void split_pte_mapped_thp(void) void split_file_backed_thp(void) { int status; - int fd; - ssize_t num_written; + int fd, shmem_sysctl_fd; + ssize_t num_written, num_read; char tmpfs_template[] = "/tmp/thp_split_XX"; const char *tmpfs_loc = mkdtemp(tmpfs_template); - char testfile[INPUT_MAX]; + char testfile[INPUT_MAX], sysctl_buf[INPUT_MAX] = {0}; uint64_t pgoff_start = 0, pgoff_end = 1024; + const char *shmem_sysctl = "/sys/kernel/mm/transparent_hugepage/shmem_enabled"; + char *opt1, *opt2; ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n"); + shmem_sysctl_fd = open(shmem_sysctl, O_RDWR); + if (shmem_sysctl_fd == -1) { + ksft_perror("cannot open shmem sysctl"); + goto out; + } + + num_read = read(shmem_sysctl_fd, sysctl_buf, INPUT_MAX); + if (num_read < 1) { + ksft_perror("Failed to read shmem sysctl"); + goto cleanup_sysctl; + } + + opt1 = strchr(sysctl_buf, '['); + opt2 = strchr(sysctl_buf, ']'); + if (!opt1 || !opt2) { + ksft_perror("cannot read shmem sysctl config"); + goto cleanup_sysctl; + } + + /* get existing shmem sysctl config into sysctl_buf */ + strncpy(sysctl_buf, opt1 + 1, opt2 - opt1 - 1); + memset(sysctl_buf + (opt2 - opt1 - 1), 0, INPUT_MAX); + + num_written = write(shmem_sysctl_fd, "force", sizeof("force")); + if (num_written < 1) { + ksft_perror("Fail to write force to shmem sysctl"); + goto cleanup_sysctl; + } + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); if (status) @@ -317,13 +348,24 @@ void split_file_backed_thp(void) if (status) ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno)); + num_written = write(shmem_sysctl_fd, sysctl_buf, strlen(sysctl_buf) + 1); + if (num_written < 1) + ksft_perror("Fail to restore shmem sysctl"); + + close(shmem_sysctl_fd); ksft_print_msg("Please check dmesg for more information\n"); ksft_test_result_pass("File-backed THP split test done\n"); return; cleanup: + num_written = write(shmem_sysctl_fd, sysctl_buf, strlen(sysctl_buf) + 1); + if (num_written < 1) + ksft_perror("Fail to restore shmem sysctl"); umount(tmpfs_loc); rmdir(tmpfs_loc); +cleanup_sysctl: + close(shmem_sysctl_fd); +out: ksft_exit_fail_msg("Error occurred\n"); } -- 2.45.2
Re: [PATCH 1/3] selftests/mm: make file-backed THP split work by setting force option
On Wed Jan 22, 2025 at 9:26 AM EST, David Hildenbrand wrote: > On 22.01.25 13:40, Zi Yan wrote: >> Commit acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") >> changes huge=always to allocate THP/mTHP based on write size and >> split_huge_page_test does not write PMD size data, so file-back THP is not >> created during the test. > > Just curious, why can't we write PMD size data instead, to avoid messing > with the "force" option? It also works. I used "force", because I notice that it is intended for testing. Using it might be more future proof, in case huge=always changes its semantics again in the future. -- Best Regards, Yan, Zi
Re: [PATCH 1/3] selftests/mm: make file-backed THP split work by setting force option
On 22.01.25 13:40, Zi Yan wrote: Commit acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") changes huge=always to allocate THP/mTHP based on write size and split_huge_page_test does not write PMD size data, so file-back THP is not created during the test. Just curious, why can't we write PMD size data instead, to avoid messing with the "force" option? -- Cheers, David / dhildenb
Re: [PATCH 2/3] mm/huge_memory: allow split shmem large folio to any lower order
On 22.01.25 13:40, Zi Yan wrote: Commit 4d684b5f92ba ("mm: shmem: add large folio support for tmpfs") has added large folio support to shmem. Remove the restriction in split_huge_page*(). Signed-off-by: Zi Yan Reviewed-by: Baolin Wang --- mm/huge_memory.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d3ebdc002d5..deb4e72daeb9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3299,7 +3299,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Some pages can be beyond EOF: drop them from page cache */ if (tail->index >= end) { if (shmem_mapping(folio->mapping)) - nr_dropped++; + nr_dropped += new_nr; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); @@ -3465,12 +3465,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, return -EINVAL; } } else if (new_order) { - /* Split shmem folio to non-zero order not supported */ - if (shmem_mapping(folio->mapping)) { - VM_WARN_ONCE(1, - "Cannot split shmem folio to non-0 order"); - return -EINVAL; - } /* * No split if the file system does not support large folio. * Note that we might still have THPs in such mappings due to Acked-by: David Hildenbrand -- Cheers, David / dhildenb
Re: [PATCH net-next v1 5/5] selftest: net: add proc_net_pktgen
Hello Jakub, On Fri, 17 Jan 2025 13:11:54 -0800, Jakub Kicinski wrote: > On Fri, 17 Jan 2025 15:16:13 +0100 Peter Seiderer wrote: > > +FIXTURE_SETUP(proc_net_pktgen) { > > + ssize_t len; > > + > > + self->ctrl_fd = open("/proc/net/pktgen/kpktgend_0", O_RDWR); > > + ASSERT_GE(self->ctrl_fd, 0) TH_LOG("CONFIG_NET_PKTGEN not enabled, > > module pktgen nod loaded?"); > > nod -> not? Fixed... > > Please take a look at the instructions here: > https://github.com/linux-netdev/nipa/wiki/How-to-run-netdev-selftests-CI-style > the test currently fails in our CI, you need to add it to > tools/testing/selftests/net/config, and perhaps try to call > modprobe in the test? Thanks for the hint, fixed (modprobe and CONFIG_NET_PKTGEN enabeled)... > > > + len = write(self->ctrl_fd, add_loopback_0, sizeof(add_loopback_0)); > > + ASSERT_EQ(len, sizeof(add_loopback_0)) TH_LOG("device lo@0 already > > registered?"); > > FWIW we prefer to stick to 80 char line width in networking, > but it's not a big deal for a test, up to you. > > > + // complete command string without/with trailing '\0' > > +EXPECT_EQ(len, i); Fixed... > > Run this patch thru checkpatch, please. This looks misaligned. O.k. > > > + } > > + } > > +} > > > +#if 0 // needs CONFIG_XFRM > > Add it to the config, too, then? > > > +TEST_F(proc_net_pktgen, device_command_spi) { > > + ssize_t len; > > + > > + len = write(self->device_fd, device_command_spi_0, > > sizeof(device_command_spi_0)); > > + EXPECT_EQ(len, sizeof(device_command_spi_0)); > > +} > > +#endif '#if' removed as as CONFIG_XFRM is already enabled via tools/testing/selftests/net/config CONFIG_XFRM_INTERFACE/CONFIG_XFRM_USER... Thanks for review! New patch iteration is on the way... Regards, Peter > > Thanks for working on a test!
Re: [PATCH] selftests: gpio: gpio-sim: Fix missing chip disablements
On Wed, Jan 22, 2025 at 5:33 AM Koichiro Den wrote: > > Since upstream commit 8bd76b3d3f3a ("gpio: sim: lock up configfs that an > instantiated device depends on"), rmdir for an active virtual devices > been prohibited. > > Update gpio-sim selftest to align with the change. > > Reported-by: kernel test robot > Closes: https://lore.kernel.org/oe-lkp/202501221006.a1ca5dfa-...@intel.com > Signed-off-by: Koichiro Den > --- > tools/testing/selftests/gpio/gpio-sim.sh | 31 +++- > 1 file changed, 25 insertions(+), 6 deletions(-) > > diff --git a/tools/testing/selftests/gpio/gpio-sim.sh > b/tools/testing/selftests/gpio/gpio-sim.sh > index 6fb66a687f17..bbc29ed9c60a 100755 > --- a/tools/testing/selftests/gpio/gpio-sim.sh > +++ b/tools/testing/selftests/gpio/gpio-sim.sh > @@ -46,12 +46,6 @@ remove_chip() { > rmdir $CONFIGFS_DIR/$CHIP || fail "Unable to remove the chip" > } > > -configfs_cleanup() { > - for CHIP in `ls $CONFIGFS_DIR/`; do > - remove_chip $CHIP > - done > -} > - > create_chip() { > local CHIP=$1 > > @@ -105,6 +99,13 @@ disable_chip() { > echo 0 > $CONFIGFS_DIR/$CHIP/live || fail "Unable to disable the chip" > } > > +configfs_cleanup() { > + for CHIP in `ls $CONFIGFS_DIR/`; do > + disable_chip $CHIP > + remove_chip $CHIP > + done > +} > + > configfs_chip_name() { > local CHIP=$1 > local BANK=$2 > @@ -181,6 +182,7 @@ create_chip chip > create_bank chip bank > enable_chip chip > test -n `cat $CONFIGFS_DIR/chip/bank/chip_name` || fail "chip_name doesn't > work" > +disable_chip chip > remove_chip chip > Hi! Thanks for addressing it. Is there any place in this file where we'd call remove_chip() without calling disable_chip() first? Maybe we can fold disable_chip() into remove_chip() and make the patch much smaller? Bart
Re: [PATCH v2 0/5] Move kvfree_rcu() into SLAB (v2)
On 1/22/25 16:04, Joel Fernandes wrote: > On Tue, Jan 21, 2025 at 3:32 PM Paul E. McKenney wrote: >> >> On Tue, Jan 21, 2025 at 03:14:16PM +0100, Uladzislau Rezki wrote: >> > On Tue, Jan 21, 2025 at 02:49:13PM +0100, Vlastimil Babka wrote: >> > > Right so that's the first Possible solution, but without the #ifdef. So >> > > there's an overhead of checking __is_kvfree_rcu_offset() even if the >> > > batching is done in slab and this function is never called with an >> > > offset. >> > > >> > Or fulfilling a missing functionality? TREE is broken in that sense >> > whereas a TINY handles it without any issues. >> > >> > It can be called for SLUB_TINY option, just call_rcu() instead of >> > batching layer. And yes, kvfree_rcu_barrier() switches to rcu_barrier(). >> >> Would this make sense? >> >> if (IS_ENABLED(CONFIG_TINY_RCU) && >> __is_kvfree_rcu_offset((unsigned long) f)) { >> >> Just to be repetitive, other alternatives include: >> >> 1. Take advantage of SLOB being no longer with us. >> >> 2. Get rid of Tiny RCU's special casing of kfree_rcu(), and then >> eliminate the above "if" statement in favor of its "else" clause. >> >> 3. Make Tiny RCU implement a trivial version of kfree_rcu() that >> passes a list through RCU. >> >> I don't have strong feelings, and am happy to defer to your guys' >> decision. > > If I may chime in with an opinion, I think the cleanest approach would > be to not special-case the func pointer and instead provide a callback > from the SLAB layer which does the kfree. Then get rid of Right. > __is_kvfree_rcu_offset() and its usage from Tiny. Granted, there is > the overhead of function calling, but I highly doubt that it is going > to be a bottleneck, considering that the __is_kvfree_rcu_offset() path > is a kfree slow-path. I feel in the long run, this will also be more > maintainable. > > Or is there a reason other than the theoretical function call overhead > why this may not work? My concern was about the overhead of calculating the pointer to the object starting address, but it's just some arithmetics, so it should be negligible. So I'm prototyping this approach now. Thanks all. > thanks, > > - Joel
[RFC 2/5] virtio_ring: introduce virtqueue_map/unmap_sgs()
From: Jason Wang Introduce new virtqueue DMA operations which allows the drivers that want to make use of the premapping API but operate at the sg level. Note that we still follow the assumtions if virtqueue_add() so dma_map_sg() is not used. This could be optimized in the future. Signed-off-by: Jason Wang Signed-off-by: Eugenio Pérez -- Eugenio's changes: Remove blank TODO: Should we call directly dma_map instead of this? XDP do the direct call. --- drivers/virtio/virtio_ring.c | 128 +++ include/linux/virtio.h | 10 +++ 2 files changed, 125 insertions(+), 13 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index fdd2d2b07b5a..05729bc5cbb1 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -359,6 +359,26 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) return vq->dma_dev; } +static int __vring_map_one_sg(const struct vring_virtqueue *vq, + struct scatterlist *sg, + enum dma_data_direction direction, + dma_addr_t *addr) +{ + /* +* We can't use dma_map_sg, because we don't use scatterlists in +* the way it expects (we don't guarantee that the scatterlist +* will exist for the lifetime of the mapping). +*/ + *addr = dma_map_page(vring_dma_dev(vq), + sg_page(sg), sg->offset, sg->length, + direction); + + if (dma_mapping_error(vring_dma_dev(vq), *addr)) + return -ENOMEM; + + return 0; +} + /* Map one sg entry. */ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction, dma_addr_t *addr, @@ -383,19 +403,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist return 0; } - /* -* We can't use dma_map_sg, because we don't use scatterlists in -* the way it expects (we don't guarantee that the scatterlist -* will exist for the lifetime of the mapping). -*/ - *addr = dma_map_page(vring_dma_dev(vq), - sg_page(sg), sg->offset, sg->length, - direction); - - if (dma_mapping_error(vring_dma_dev(vq), *addr)) - return -ENOMEM; - - return 0; + return __vring_map_one_sg(vq, sg, direction, addr); } static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, @@ -526,6 +534,100 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, return next; } +void virtqueue_unmap_sgs(struct virtqueue *_vq, +struct scatterlist *sgs[], +unsigned int out_sgs, +unsigned int in_sgs) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + struct scatterlist *sg; + int n; + + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_unmap_page(vring_dma_dev(vq), + sg_dma_address(sg), + sg->length, + DMA_TO_DEVICE); + } + } + + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_unmap_page(vring_dma_dev(vq), + sg_dma_address(sg), + sg->length, + DMA_FROM_DEVICE); + } + } +} +EXPORT_SYMBOL_GPL(virtqueue_unmap_sgs); + +int virtqueue_map_sgs(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + int i, n, mapped_sg = 0; + struct scatterlist *sg; + + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + + if (__vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) + goto unmap_release; + + sg_dma_address(sg) = addr; + mapped_sg++; + } + } + + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + + if (__vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr)) + goto unmap_release; + + sg_dma_address(sg) = addr; + mapped_sg++; + } + } + + return 0; + +unmap_release: + i = 0; + + for (n = 0; n < out_sgs; n++) { + for (sg
Re: [PATCH v2 0/5] Move kvfree_rcu() into SLAB (v2)
On Wed, Jan 22, 2025 at 11:43 AM Vlastimil Babka wrote: [...] > > __is_kvfree_rcu_offset() and its usage from Tiny. Granted, there is > > the overhead of function calling, but I highly doubt that it is going > > to be a bottleneck, considering that the __is_kvfree_rcu_offset() path > > is a kfree slow-path. I feel in the long run, this will also be more > > maintainable. > > > > Or is there a reason other than the theoretical function call overhead > > why this may not work? > > My concern was about the overhead of calculating the pointer to the object > starting address, but it's just some arithmetics, so it should be > negligible. So I'm prototyping this approach now. Thanks all. Ah, that's a valid point. Looking forward to reviewing the patch and hope it works out! thanks, - Joel
Re: [PATCH net-next v2 0/8] Some pktgen fixes/improvments
On Wed, Jan 22, 2025 at 03:41:02PM +0100, Peter Seiderer wrote: > While taking a look at '[PATCH net] pktgen: Avoid out-of-range in > get_imix_entries' ([1]) and '[PATCH net v2] pktgen: Avoid out-of-bounds access > in get_imix_entries' ([2], [3]) and doing some tests and code review I > detected that the /proc/net/pktgen/... parsing logic does not honour the > user given buffer bounds (resulting in out-of-bounds access). > > This can be observed e.g. by the following simple test (sometimes the > old/'longer' previous value is re-read from the buffer): > > $ echo add_device lo@0 > /proc/net/pktgen/kpktgend_0 > > $ echo "min_pkt_size 12345" > /proc/net/pktgen/lo\@0 && grep > min_pkt_size /proc/net/pktgen/lo\@0 > Params: count 1000 min_pkt_size: 12345 max_pkt_size: 0 > Result: OK: min_pkt_size=12345 > > $ echo -n "min_pkt_size 123" > /proc/net/pktgen/lo\@0 && grep > min_pkt_size /proc/net/pktgen/lo\@0 > Params: count 1000 min_pkt_size: 12345 max_pkt_size: 0 > Result: OK: min_pkt_size=12345 > > $ echo "min_pkt_size 123" > /proc/net/pktgen/lo\@0 && grep > min_pkt_size /proc/net/pktgen/lo\@0 > Params: count 1000 min_pkt_size: 123 max_pkt_size: 0 > Result: OK: min_pkt_size=123 > > So fix the out-of-bounds access (and two minor findings) and add a simple > proc_net_pktgen selftest... Hi Peter, Unfortunately net-next is closed at this time. ## Form letter - net-next-closed The merge window for v6.14 has begun. Therefore net-next is closed for new drivers, features, code refactoring and optimizations. We are currently accepting bug fixes only. Please repost when net-next reopens after Feb 3rd. RFC patches sent for review only are obviously welcome at any time. See: https://www.kernel.org/doc/html/next/process/maintainer-netdev.html#development-cycle -- pw-bot: deferred
Re: [PATCH net-next v2 0/8] Some pktgen fixes/improvments
On Wed, Jan 22, 2025 at 06:16:35PM +, Simon Horman wrote: > On Wed, Jan 22, 2025 at 03:41:02PM +0100, Peter Seiderer wrote: > > While taking a look at '[PATCH net] pktgen: Avoid out-of-range in > > get_imix_entries' ([1]) and '[PATCH net v2] pktgen: Avoid out-of-bounds > > access > > in get_imix_entries' ([2], [3]) and doing some tests and code review I > > detected that the /proc/net/pktgen/... parsing logic does not honour the > > user given buffer bounds (resulting in out-of-bounds access). > > > > This can be observed e.g. by the following simple test (sometimes the > > old/'longer' previous value is re-read from the buffer): > > > > $ echo add_device lo@0 > /proc/net/pktgen/kpktgend_0 > > > > $ echo "min_pkt_size 12345" > /proc/net/pktgen/lo\@0 && grep > > min_pkt_size /proc/net/pktgen/lo\@0 > > Params: count 1000 min_pkt_size: 12345 max_pkt_size: 0 > > Result: OK: min_pkt_size=12345 > > > > $ echo -n "min_pkt_size 123" > /proc/net/pktgen/lo\@0 && grep > > min_pkt_size /proc/net/pktgen/lo\@0 > > Params: count 1000 min_pkt_size: 12345 max_pkt_size: 0 > > Result: OK: min_pkt_size=12345 > > > > $ echo "min_pkt_size 123" > /proc/net/pktgen/lo\@0 && grep > > min_pkt_size /proc/net/pktgen/lo\@0 > > Params: count 1000 min_pkt_size: 123 max_pkt_size: 0 > > Result: OK: min_pkt_size=123 > > > > So fix the out-of-bounds access (and two minor findings) and add a simple > > proc_net_pktgen selftest... > > Hi Peter, > > Unfortunately net-next is closed at this time. Sorry, I was a little hasty. It looks like a number of these patches are fixes. If so, please consider separating them out into a separate series, targeted at net (rather than net-next), each with an appropriate Fixes tag.
Re: [PATCH 5/5] selftests/nolibc: always keep test kernel configuration up to date
Hi Willy! On 2025-01-22 19:52:06+0100, Willy Tarreau wrote: > On Wed, Jan 22, 2025 at 07:41:48PM +0100, Thomas Weißschuh wrote: > > @@ -173,7 +170,7 @@ test_arch() { > > exit 1 > > esac > > printf '%-15s' "$arch:" > > - swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" > > V=1 > > + swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" defconfig > > "$test_target" V=1 > > Just a question, are you certain that dependencies between $test_target > and defconfig are always properly handled ? I'm asking because "make -j" > is something valid, and we wouldn't want defconfig to run in parallel > with test_target. "make -j" is not only valid but used by run-tests.sh always. The sequencing is explicitly enforced in patch 4. > For real sequencing (and making sure targets run in the > correct order), I normally prefer to run them one at a time. Here you could > simply prepend the defconfig line before the original one and get these > guarantees (and also make them explicit). That's also less edit when > copy-pasting from the terminal to the shell when trying to debug. Sounds fine to me, too. That would remove the need for patch 4, but I'd like to keep it anyways. Thomas
[PATCH] bpf: Fix mix-up of 4096 and page size.
For platforms on powerpc architecture with a default page size greater than 4096, there was an inconsistency in fragment size calculation. This caused the BPF selftest xdp_adjust_tail/xdp_adjust_frags_tail_grow to fail on powerpc. The issue occurred because the fragment buffer size in bpf_prog_test_run_xdp() was set to 4096, while the actual data size in the fragment within the shared skb was checked against PAGE_SIZE (65536 on powerpc) in min_t, causing it to exceed 4096 and be set accordingly. This discrepancy led to an overflow when bpf_xdp_frags_increase_tail() checked for tailroom, as skb_frag_size(frag) could be greater than rxq->frag_size (when PAGE_SIZE > 4096). This commit updates the page size references to 4096 to ensure consistency and prevent overflow issues in fragment size calculations. Fixes: 1c1949982524 ("bpf: introduce frags support to bpf_prog_test_run_xdp()") Signed-off-by: Saket Kumar Bhaskar --- net/bpf/test_run.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 501ec4249..eb5476184 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -124,7 +124,7 @@ struct xdp_test_data { * must be updated accordingly this gets changed, otherwise BPF selftests * will fail. */ -#define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head)) +#define TEST_XDP_FRAME_SIZE (4096 - sizeof(struct xdp_page_head)) #define TEST_XDP_MAX_BATCH 256 static void xdp_test_run_init_page(netmem_ref netmem, void *arg) @@ -660,7 +660,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; - if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) + if (size < ETH_HLEN || size > 4096 - headroom - tailroom) return ERR_PTR(-EINVAL); if (user_size > size) @@ -1297,7 +1297,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, frag = &sinfo->frags[sinfo->nr_frags++]; data_len = min_t(u32, kattr->test.data_size_in - size, -PAGE_SIZE); +4096); skb_frag_fill_page_desc(frag, page, 0, data_len); if (copy_from_user(page_address(page), data_in + size, -- 2.43.5
Re: [PATCH v5 6/6] vhost_scsi: Add check for inherit_owner status
On 12/30/24 6:43 AM, Cindy Lu wrote: > The vhost_scsi VHOST_NEW_WORKER requires the inherit_owner > setting to be true. So we need to implement a check for this. > > Signed-off-by: Cindy Lu > --- > drivers/vhost/scsi.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c > index 718fa4e0b31e..0d63b6b5c852 100644 > --- a/drivers/vhost/scsi.c > +++ b/drivers/vhost/scsi.c > @@ -2086,6 +2086,14 @@ vhost_scsi_ioctl(struct file *f, > return -EFAULT; > return vhost_scsi_set_features(vs, features); > case VHOST_NEW_WORKER: > + /* > + * vhost_tasks will account for worker threads under the > parent's > + * NPROC value but kthreads do not. To avoid userspace > overflowing > + * the system with worker threads inherit_owner must be true. > + */ > + if (!vs->dev.inherit_owner)> + return -EFAULT; Why did you keep this here? I had mentioned it belonged in the common code: https://lore.kernel.org/virtualization/3864ae3b-d6cd-4227-b4bb-56e014c71...@oracle.com/T/#u because the limitation applies to all drivers. I didn't see a reply about it but saw you did fix up the comment. Not sure if I missed it or there was a technical issue you hit. > + fallthrough; > case VHOST_FREE_WORKER: > case VHOST_ATTACH_VRING_WORKER: > case VHOST_GET_VRING_WORKER:
Re: [PATCH v4 4/4] rust: add parameter support to the `module!` macro
"Petr Pavlu" writes: > On 1/9/25 11:54, Andreas Hindborg wrote: >> This patch includes changes required for Rust kernel modules to utilize >> module parameters. This code implements read only support for integer >> types without `sysfs` support. >> [cut] >> + >> +// SAFETY: `old_value` is valid for writes, as we have exclusive >> +// access. `old_value` is pointing to an initialized static, an > > Typo: an -> and Thanks. [cut] >> + >> +fn emit_params(&mut self, info: &ModuleInfo) { >> +let Some(params) = &info.params else { >> +return; >> +}; >> + >> +for param in params { >> +let ops = param_ops_path(¶m.ptype); >> + >> +// Note: The spelling of these fields is dictated by the user >> space >> +// tool `modinfo`. >> +self.emit_param("parmtype", ¶m.name, ¶m.ptype); >> +self.emit_param("parm", ¶m.name, ¶m.description); >> + >> +write!( >> +self.param_buffer, >> +" >> +pub(crate) static {param_name}: >> + >> ::kernel::module_param::ModuleParamAccess<{param_type}> = >> + >> ::kernel::module_param::ModuleParamAccess::new({param_default}); >> + >> +#[link_section = \"__param\"] >> +#[used] >> +static __{module_name}_{param_name}_struct: >> +::kernel::module_param::RacyKernelParam = >> + >> ::kernel::module_param::RacyKernelParam(::kernel::bindings::kernel_param {{ >> +name: if cfg!(MODULE) {{ >> +c\"{module_name}.{param_name}\" >> +}} else {{ >> +c\"{param_name}\" >> +}}.as_ptr(), > > This should be the other way around? Running > 'insmod rust_minimal.ko test_parameter=2' reports for me that the > parameter is unknown while specifying 'rust_minimal.test_parameter=2' > works. The prefix should be used only if the module is built-in so the > parameter can be distinguished on the kernel command line. Thanks for testing! This actually gave me quite a headache when I was using the series elsewhere. I forgot to resend the series with the fix, sorry. Best regards, Andreas Hindborg
Re: [RFC net-next v3 2/4] virtio_net: Prepare for NAPI to queue mapping
On Wed, Jan 22, 2025 at 02:12:46PM +0800, Jason Wang wrote: > On Wed, Jan 22, 2025 at 3:11 AM Joe Damato wrote: > > > > Slight refactor to prepare the code for NAPI to queue mapping. No > > functional changes. > > > > Signed-off-by: Joe Damato > > Reviewed-by: Gerhard Engleder > > Tested-by: Lei Yang > > --- > > v2: > >- Previously patch 1 in the v1. > >- Added Reviewed-by and Tested-by tags to commit message. No > > functional changes. > > > > drivers/net/virtio_net.c | 10 -- > > 1 file changed, 8 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index 7646ddd9bef7..cff18c66b54a 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -2789,7 +2789,8 @@ static void skb_recv_done(struct virtqueue *rvq) > > virtqueue_napi_schedule(&rq->napi, rvq); > > } > > > > -static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct > > *napi) > > +static void virtnet_napi_do_enable(struct virtqueue *vq, > > + struct napi_struct *napi) > > { > > napi_enable(napi); > > Nit: it might be better to not have this helper to avoid a misuse of > this function directly. Sorry, I'm probably missing something here. Both virtnet_napi_enable and virtnet_napi_tx_enable need the logic in virtnet_napi_do_enable. Are you suggesting that I remove virtnet_napi_do_enable and repeat the block of code in there twice (in virtnet_napi_enable and virtnet_napi_tx_enable)? Just seemed like a lot of code to repeat twice and a helper would be cleaner? Let me know; since net-next is closed there is a plenty of time to get this to where you'd like it to be before new code is accepted. > Other than this. > > Acked-by: Jason Wang Thanks.
Re: [PATCH 5/5] selftests/nolibc: always keep test kernel configuration up to date
Hi Thomas! On Wed, Jan 22, 2025 at 07:41:48PM +0100, Thomas Weißschuh wrote: > @@ -173,7 +170,7 @@ test_arch() { > exit 1 > esac > printf '%-15s' "$arch:" > - swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" > V=1 > + swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" defconfig > "$test_target" V=1 Just a question, are you certain that dependencies between $test_target and defconfig are always properly handled ? I'm asking because "make -j" is something valid, and we wouldn't want defconfig to run in parallel with test_target. For real sequencing (and making sure targets run in the correct order), I normally prefer to run them one at a time. Here you could simply prepend the defconfig line before the original one and get these guarantees (and also make them explicit). That's also less edit when copy-pasting from the terminal to the shell when trying to debug. Just my few cents ;-) Willy
[PATCH 2/2] libbpf: BPF programs dynamic loading and attaching
BPF programs designated as dynamically loaded can be loaded and attached independently after the initial bpf_object loading and attaching. These programs can also be reloaded and reattached multiple times, enabling more flexible management of a resident BPF program set. A key motivation for this feature is to reduce load times for utilities that include hundreds of BPF programs. When the selection of a resident BPF program set cannot be determined at the time of bpf_object loading and attaching, all BPF programs would otherwise need to be marked as autoload, leading to unnecessary overhead. This patch addresses that inefficiency. Signed-off-by: Slava Imameev --- tools/lib/bpf/libbpf.c| 144 +++-- tools/lib/bpf/libbpf.h| 5 +- tools/lib/bpf/libbpf.map | 2 + .../selftests/bpf/prog_tests/dynamicload.c| 145 ++ .../selftests/bpf/prog_tests/load_type.c | 61 .../selftests/bpf/progs/test_dynamicload.c| 31 .../selftests/bpf/progs/test_load_type.c | 8 + 7 files changed, 385 insertions(+), 11 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/dynamicload.c create mode 100644 tools/testing/selftests/bpf/progs/test_dynamicload.c diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9af5c0b08b8b..731a4a09f865 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -689,6 +689,7 @@ struct bpf_object { bool loaded; bool has_subcalls; bool has_rodata; + bool has_dynload_progs; struct bpf_gen *gen_loader; @@ -7551,13 +7552,15 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog * custom log_buf is specified; if the program load fails, then we'll * bump log_level to 1 and use either custom log_buf or we'll allocate * our own and retry the load to get details on what failed +* A shared buffer cannot be used for dynamically loaded programs as they +* can be loaded concurrently. */ if (log_level) { if (prog->log_buf) { log_buf = prog->log_buf; log_buf_size = prog->log_size; own_log_buf = false; - } else if (obj->log_buf) { + } else if (obj->log_buf && prog->load_type != BPF_PROG_LOAD_TYPE_DYNAMIC) { log_buf = obj->log_buf; log_buf_size = obj->log_size; own_log_buf = false; @@ -7911,6 +7914,7 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) pr_debug("prog '%s': skipped auto-loading\n", prog->name); continue; } + prog->log_level |= log_level; if (obj->gen_loader) @@ -8588,8 +8592,11 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); } - /* clean up fd_array */ - zfree(&obj->fd_array); + /* The fd array is needed for dynamically loaded programs, +* so defer freeing it in that case to the end of the object lifetime. +*/ + if (!obj->has_dynload_progs || !obj->fd_array_cnt) + zfree(&obj->fd_array); /* clean up module BTFs */ for (i = 0; i < obj->btf_module_cnt; i++) { @@ -8597,11 +8604,17 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch btf__free(obj->btf_modules[i].btf); free(obj->btf_modules[i].name); } - free(obj->btf_modules); + obj->btf_module_cnt = 0; + zfree(&obj->btf_modules); - /* clean up vmlinux BTF */ - btf__free(obj->btf_vmlinux); - obj->btf_vmlinux = NULL; + /* The btf_vmlinux data is needed for dynamically loaded programs, +* so defer freeing it in that case to the end of the object lifetime. +*/ + if (!obj->has_dynload_progs) { + /* clean up vmlinux BTF */ + btf__free(obj->btf_vmlinux); + obj->btf_vmlinux = NULL; + } obj->loaded = true; /* doesn't matter if successfully or not */ @@ -9103,6 +9116,8 @@ void bpf_object__close(struct bpf_object *obj) zfree(&obj->arena_data); + zfree(&obj->fd_array); + free(obj); } @@ -9230,8 +9245,16 @@ bool bpf_program__autoload(const struct bpf_program *prog) int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) { - return bpf_program__set_load_type(prog, - autoload ? BPF_PROG_LOAD_TYPE_AUTO : BPF_PROG_LOAD_TYPE_DISABLED); + enum bpf_prog_load_type type = prog->load_type; + + if (autoload) + type = BPF_PROG_LOAD_TYPE_AUTO; + else if (prog->load_type == B
Re: [PATCH 5/5] selftests/nolibc: always keep test kernel configuration up to date
On Wed, Jan 22, 2025 at 08:00:28PM +0100, Thomas Weißschuh wrote: > Hi Willy! > > On 2025-01-22 19:52:06+0100, Willy Tarreau wrote: > > On Wed, Jan 22, 2025 at 07:41:48PM +0100, Thomas Weißschuh wrote: > > > @@ -173,7 +170,7 @@ test_arch() { > > > exit 1 > > > esac > > > printf '%-15s' "$arch:" > > > - swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" > > > V=1 > > > + swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" defconfig > > > "$test_target" V=1 > > > > Just a question, are you certain that dependencies between $test_target > > and defconfig are always properly handled ? I'm asking because "make -j" > > is something valid, and we wouldn't want defconfig to run in parallel > > with test_target. > > "make -j" is not only valid but used by run-tests.sh always. > The sequencing is explicitly enforced in patch 4. I learned something today, I didn't know about order-only rules. > > For real sequencing (and making sure targets run in the > > correct order), I normally prefer to run them one at a time. Here you could > > simply prepend the defconfig line before the original one and get these > > guarantees (and also make them explicit). That's also less edit when > > copy-pasting from the terminal to the shell when trying to debug. > > Sounds fine to me, too. > That would remove the need for patch 4, but I'd like to keep it anyways. Agreed! Willy
Re: [PATCH v2 0/5] Move kvfree_rcu() into SLAB (v2)
On Wed, Jan 22, 2025 at 05:43:06PM +0100, Vlastimil Babka wrote: > On 1/22/25 16:04, Joel Fernandes wrote: > > On Tue, Jan 21, 2025 at 3:32 PM Paul E. McKenney wrote: > >> > >> On Tue, Jan 21, 2025 at 03:14:16PM +0100, Uladzislau Rezki wrote: > >> > On Tue, Jan 21, 2025 at 02:49:13PM +0100, Vlastimil Babka wrote: > >> > > Right so that's the first Possible solution, but without the #ifdef. So > >> > > there's an overhead of checking __is_kvfree_rcu_offset() even if the > >> > > batching is done in slab and this function is never called with an > >> > > offset. > >> > > > >> > Or fulfilling a missing functionality? TREE is broken in that sense > >> > whereas a TINY handles it without any issues. > >> > > >> > It can be called for SLUB_TINY option, just call_rcu() instead of > >> > batching layer. And yes, kvfree_rcu_barrier() switches to rcu_barrier(). > >> > >> Would this make sense? > >> > >> if (IS_ENABLED(CONFIG_TINY_RCU) && > >> __is_kvfree_rcu_offset((unsigned long) f)) { > >> > >> Just to be repetitive, other alternatives include: > >> > >> 1. Take advantage of SLOB being no longer with us. > >> > >> 2. Get rid of Tiny RCU's special casing of kfree_rcu(), and then > >> eliminate the above "if" statement in favor of its "else" clause. > >> > >> 3. Make Tiny RCU implement a trivial version of kfree_rcu() that > >> passes a list through RCU. > >> > >> I don't have strong feelings, and am happy to defer to your guys' > >> decision. > > > > If I may chime in with an opinion, I think the cleanest approach would > > be to not special-case the func pointer and instead provide a callback > > from the SLAB layer which does the kfree. Then get rid of > > Right. > > > __is_kvfree_rcu_offset() and its usage from Tiny. Granted, there is > > the overhead of function calling, but I highly doubt that it is going > > to be a bottleneck, considering that the __is_kvfree_rcu_offset() path > > is a kfree slow-path. I feel in the long run, this will also be more > > maintainable. > > > > Or is there a reason other than the theoretical function call overhead > > why this may not work? > > My concern was about the overhead of calculating the pointer to the object > starting address, but it's just some arithmetics, so it should be > negligible. So I'm prototyping this approach now. Thanks all. > You are welcome :) -- Uladzislau Rezki
Re: [GIT PULL] kunit next update for Linux 6.14-rc1
The pull request you sent on Tue, 21 Jan 2025 14:16:44 -0700: > git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest > tags/linux_kselftest-kunit-6.14-rc1 has been merged into torvalds/linux.git: https://git.kernel.org/torvalds/c/e8f17cb6f5abd4e52e89b5768c7016b7dab1e6fe Thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/prtracker.html
Re: [GIT PULL] kselftest next update for Linux 6.14-rc1
The pull request you sent on Tue, 21 Jan 2025 13:56:32 -0700: > git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest > tags/linux_kselftest-next-6.14-rc1 has been merged into torvalds/linux.git: https://git.kernel.org/torvalds/c/8fb1e2eed14dc347e1d04b8bf0bf52c606de6da1 Thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/prtracker.html
[PATCH] media: add virtio-media driver
Add the first version of the virtio-media driver. This driver acts roughly as a V4L2 relay between user-space and the virtio virtual device on the host, so it is relatively simple, yet unconventional. It doesn't use VB2 or other frameworks typically used in a V4L2 driver, and most of its complexity resides in correctly and efficiently building the virtio descriptor chain to pass to the host, avoiding copies whenever possible. This is done by scatterlist_builder.[ch]. virtio_media_ioctls.c proxies each supported ioctl to the host, using code generated through macros for ioctls that can be forwarded directly, which is most of them. virtio_media_driver.c provides the expected driver hooks, and support for mmapping and polling. This version supports MMAP buffers, while USERPTR buffers can also be enabled through a driver option. DMABUF support is still pending. Signed-off-by: Alexandre Courbot Signed-off-by: Alexandre Courbot --- This patch adds the virtio-media kernel driver. Virtio-media [1] encapsulates the V4L2 structures and protocol to enable the virtualization of host media devices into a guest. It's specification is in the final stages [2] of being merged and the virtualization of cameras and video accelerator devices has already been demonstrated using crosvm [3] and QEmu. v4l2-compliance also passes on all tested devices, which includes the "simple" virtual test device, proxied host UVC and vivid devices, and the FFmpeg virtual decoder devices (refer to [3] in order to test these if desired). Virtio-media is merged in AOSP [4] and ChromeOS. Upstreaming of the driver is overdue, but I hope we can start the review process and converged into something that can be merged. Limitations: - The driver is currently only available to little-endian, 64-bit kernels. This is because some of the V4L2 structures used for communication between guest and host have a layout dependent on the architecture, and the virtio-media protocol is standardized on the little-endian 64-bit versions. This can be fixed with a conversion layer similar to the one used to convert 32-bit ioctls to their 64-bit counterpart. - DMABUF support is currently missing. It should be implemented using virtio objects, with possible support for memfds using the SHARED_PAGES memory type. - No support for the media API and requests. While the use-case for these is less important on virtual devices where we want to present an abstraction as high as possible to limit VM exits, they do exist and it would be nice to add behind a virtio feature bit. - Locking in the driver is still very basic. This is something I want to improve before merging, but I didn't want to delay upstream review any further. [1] https://github.com/chromeos/virtio-media [2] https://lore.kernel.org/virtio-comment/20250120085015.956057-1-aest...@redhat.com/T/ [3] https://github.com/chromeos/virtio-media/blob/main/TRY_IT_OUT.md [4] https://android.googlesource.com/platform/external/virtio-media/ --- MAINTAINERS|6 + drivers/media/Kconfig | 13 + drivers/media/Makefile |2 + drivers/media/virtio/Makefile |9 + drivers/media/virtio/protocol.h| 289 ++ drivers/media/virtio/scatterlist_builder.c | 564 drivers/media/virtio/scatterlist_builder.h | 111 +++ drivers/media/virtio/session.h | 109 +++ drivers/media/virtio/virtio_media.h| 93 ++ drivers/media/virtio/virtio_media_driver.c | 960 drivers/media/virtio/virtio_media_ioctls.c | 1302 include/uapi/linux/virtio_ids.h|1 + 12 files changed, 3459 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0fa7c5728f1e64d031f4a47b6fce1db484ce0fc2..42319dc40b2e902d3a2d5b7cca030006bc575979 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24920,6 +24920,12 @@ S: Maintained F: drivers/iommu/virtio-iommu.c F: include/uapi/linux/virtio_iommu.h +VIRTIO MEDIA DRIVER +M: Alexandre Courbot +L: linux-me...@vger.kernel.org +S: Maintained +F: drivers/media/virtio/ + VIRTIO MEM DRIVER M: David Hildenbrand L: virtualizat...@lists.linux.dev diff --git a/drivers/media/Kconfig b/drivers/media/Kconfig index 6abc9302cd84d8563b7877d3d3da4b7e05a6b5d2..12bbb169c0b04565271092c7ac608b0fb11c0244 100644 --- a/drivers/media/Kconfig +++ b/drivers/media/Kconfig @@ -230,6 +230,19 @@ source "drivers/media/platform/Kconfig" source "drivers/media/mmc/Kconfig" endif +config MEDIA_VIRTIO + tristate "Virtio-media Driver" + depends on VIRTIO && VIDEO_DEV && 64BIT && (X86 || (ARM && CPU_LITTLE_ENDIAN)) + select VIDEOBUF2_CORE + select VIDEOBUF2_MEMOPS + help + Enables the virtio-media driver. + + This driver is used to virtualize media devices such as cameras or + decoders from a host into a guest using the V4L2 pr
Re: [PATCH v2 2/3] mm/huge_memory: allow split shmem large folio to any lower order
On 1/22/25 8:19 AM, Zi Yan wrote: Commit 4d684b5f92ba ("mm: shmem: add large folio support for tmpfs") has added large folio support to shmem. Remove the restriction in split_huge_page*(). Reviewed-by: Yang Shi Signed-off-by: Zi Yan Reviewed-by: Baolin Wang --- mm/huge_memory.c | 8 +--- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d3ebdc002d5..deb4e72daeb9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3299,7 +3299,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Some pages can be beyond EOF: drop them from page cache */ if (tail->index >= end) { if (shmem_mapping(folio->mapping)) - nr_dropped++; + nr_dropped += new_nr; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); @@ -3465,12 +3465,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, return -EINVAL; } } else if (new_order) { - /* Split shmem folio to non-zero order not supported */ - if (shmem_mapping(folio->mapping)) { - VM_WARN_ONCE(1, - "Cannot split shmem folio to non-0 order"); - return -EINVAL; - } /* * No split if the file system does not support large folio. * Note that we might still have THPs in such mappings due to
[PATCH 4/5] selftests/nolibc: execute defconfig before other targets
Some targets use the test kernel configuration. Executing defconfig in the same make invocation as those targets results in errors as the configuration may be in an inconsistent state during reconfiguration. Avoid this by introducing ordering dependencies between the defconfig and some other targets. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index d3afb71b4c6b7fc51b89f034c826692e76122864..b74fa74e5ce296f032bec76ce9b3f5a3debe2b40 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -217,7 +217,7 @@ all: run sysroot: sysroot/$(ARCH)/include -sysroot/$(ARCH)/include: +sysroot/$(ARCH)/include: | defconfig $(Q)rm -rf sysroot/$(ARCH) sysroot/sysroot $(QUIET_MKDIR)mkdir -p sysroot $(Q)$(MAKE) -C $(srctree) outputmakefile @@ -263,10 +263,10 @@ initramfs: nolibc-test defconfig: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) -kernel: +kernel: | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -kernel-standalone: initramfs +kernel-standalone: initramfs | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null # run the tests after building the kernel -- 2.48.1
[PATCH 3/5] selftests/nolibc: drop call to mrproper target
"mrproper" unnecessarily cleans a lot of files. kbuild is smart enough to handle changed configurations, so the cleanup is not necessary and only leads to excessive rebuilds. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 464165e3d9175d283ec0ed14765df29427b6de38..d3afb71b4c6b7fc51b89f034c826692e76122864 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -261,7 +261,7 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) kernel: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -- 2.48.1
[PATCH 0/5] selftests/nolibc: test kernel configuration cleanups
A few cleanups and optimizations for the management of the kernel configuration. Signed-off-by: Thomas Weißschuh --- Thomas Weißschuh (5): selftests/nolibc: drop custom EXTRACONFIG functionality selftests/nolibc: drop call to prepare target selftests/nolibc: drop call to mrproper target selftests/nolibc: execute defconfig before other targets selftests/nolibc: always keep test kernel configuration up to date tools/testing/selftests/nolibc/Makefile | 17 + tools/testing/selftests/nolibc/run-tests.sh | 5 + 2 files changed, 6 insertions(+), 16 deletions(-) --- base-commit: 60fe18237f72e3a186127658452dbb0992113cf7 change-id: 20250122-nolibc-config-d639e1612c93 Best regards, -- Thomas Weißschuh
[PATCH 1/5] selftests/nolibc: drop custom EXTRACONFIG functionality
kbuild already contains logic to merge predefines snippets into a defconfig file. This already works nicely with the current "defconfig" target. Make use of the snippet and drop the custom logic. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 7d14a7c0cb62608f328b251495264517d333db2e..ba044c8a042ce345ff90bdd35569de4b5acd117d 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -82,7 +82,7 @@ DEFCONFIG_x86= defconfig DEFCONFIG_arm64 = defconfig DEFCONFIG_arm= multi_v7_defconfig DEFCONFIG_mips32le = malta_defconfig -DEFCONFIG_mips32be = malta_defconfig +DEFCONFIG_mips32be = malta_defconfig generic/eb.config DEFCONFIG_ppc= pmac32_defconfig DEFCONFIG_ppc64 = powernv_be_defconfig DEFCONFIG_ppc64le= powernv_defconfig @@ -93,9 +93,6 @@ DEFCONFIG_s390 = defconfig DEFCONFIG_loongarch = defconfig DEFCONFIG= $(DEFCONFIG_$(XARCH)) -EXTRACONFIG_mips32be = -d CONFIG_CPU_LITTLE_ENDIAN -e CONFIG_CPU_BIG_ENDIAN -EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) - # optional tests to run (default = all) TEST = @@ -265,10 +262,6 @@ initramfs: nolibc-test defconfig: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare - $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ - $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ - $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ - fi kernel: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -- 2.48.1
[PATCH 5/5] selftests/nolibc: always keep test kernel configuration up to date
Avoid using a stale test kernel configuration by always synchronizing it to the current source tree. kbuild is smart enough to avoid spurious rebuilds. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 9c5160c5388122deeeb59ecfced7633000d69b10..664f92e1c5500f726ab33247321b96e8602ce185 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -158,9 +158,6 @@ test_arch() { MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") mkdir -p "$build_dir" - if [ "$test_mode" = "system" ] && [ ! -f "${build_dir}/.config" ]; then - swallow_output "${MAKE[@]}" defconfig - fi case "$test_mode" in 'system') test_target=run @@ -173,7 +170,7 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" - swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" V=1 + swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" defconfig "$test_target" V=1 cp run.out run.out."${arch}" "${MAKE[@]}" report | grep passed } -- 2.48.1
[PATCH 2/5] selftests/nolibc: drop call to prepare target
The "prepare" target does not need to be run manually. kbuild knows when to use it on its own and the target is not even documented. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index ba044c8a042ce345ff90bdd35569de4b5acd117d..464165e3d9175d283ec0ed14765df29427b6de38 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -261,7 +261,7 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) kernel: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -- 2.48.1
[PATCH 1/2] selftests/nolibc: rename s390 to s390x
Support for 32-bit s390 is about to be added. As "s39032" would look horrible, use the another naming scheme. 32-bit s390 is "s390" and 64-bit s390 is "s390x", similar to how it is handled in various toolchain components. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 11 ++- tools/testing/selftests/nolibc/run-tests.sh | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index b74fa74e5ce296f032bec76ce9b3f5a3debe2b40..dad0d9fd7702f8dc9a9a552cda5305f8973c5469 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -54,6 +54,7 @@ ARCH_mips32le= mips ARCH_mips32be= mips ARCH_riscv32 = riscv ARCH_riscv64 = riscv +ARCH_s390x = s390 ARCH:= $(or $(ARCH_$(XARCH)),$(XARCH)) # kernel image names by architecture @@ -70,7 +71,7 @@ IMAGE_ppc64le= arch/powerpc/boot/zImage IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32= arch/riscv/boot/Image IMAGE_riscv64= arch/riscv/boot/Image -IMAGE_s390 = arch/s390/boot/bzImage +IMAGE_s390x = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE= $(objtree)/$(IMAGE_$(XARCH)) IMAGE_NAME = $(notdir $(IMAGE)) @@ -89,7 +90,7 @@ DEFCONFIG_ppc64le= powernv_defconfig DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32= rv32_defconfig DEFCONFIG_riscv64= defconfig -DEFCONFIG_s390 = defconfig +DEFCONFIG_s390x = defconfig DEFCONFIG_loongarch = defconfig DEFCONFIG= $(DEFCONFIG_$(XARCH)) @@ -110,7 +111,7 @@ QEMU_ARCH_ppc64le= ppc64 QEMU_ARCH_riscv = riscv64 QEMU_ARCH_riscv32= riscv32 QEMU_ARCH_riscv64= riscv64 -QEMU_ARCH_s390 = s390x +QEMU_ARCH_s390x = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH= $(QEMU_ARCH_$(XARCH)) @@ -138,7 +139,7 @@ QEMU_ARGS_ppc64le= -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv32= -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64= -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS= -m 1G $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA) @@ -156,7 +157,7 @@ CFLAGS_i386 = $(call cc-option,-m32) CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) -CFLAGS_s390 = -m64 +CFLAGS_s390x = -m64 CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 664f92e1c5500f726ab33247321b96e8602ce185..fe4d48cc054abb1e922b31aa7b6a2395aaf61f5f 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -17,7 +17,7 @@ perform_download=0 test_mode=system werror=1 llvm= -archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv32 riscv64 s390 loongarch" +archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv32 riscv64 s390x loongarch" TEMP=$(getopt -o 'j:d:c:b:a:m:pelh' -n "$0" -- "$@") @@ -100,6 +100,7 @@ crosstool_arch() { riscv) echo riscv64;; loongarch) echo loongarch64;; mips*) echo mips;; + s390*) echo s390;; *) echo "$1";; esac } -- 2.48.1
[PATCH 2/2] tools/nolibc: add support for 32-bit s390
32-bit s390 is very close to the existing 64-bit implementation. Some special handling is necessary as there is neither LLVM nor QEMU support. Also the kernel itself can not build natively for 32-bit s390, so instead the test program is executed with a 64-bit kernel. Signed-off-by: Thomas Weißschuh --- tools/include/nolibc/arch-s390.h| 5 + tools/include/nolibc/arch.h | 2 +- tools/testing/selftests/nolibc/Makefile | 5 + tools/testing/selftests/nolibc/run-tests.sh | 7 ++- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index f9ab83a219b8a2d5e53b0b303d8bf0bf78280d5f..3f2fb00ca101cc052f3b794b743e9e5734fe5a77 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -143,8 +143,13 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( +#ifdef __s390x__ "lgr%r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ "aghi %r15, -160\n" /* allocate new stackframe */ +#else + "lr %r2, %r15\n" + "ahi%r15, -160\n" +#endif "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index c8f4e5d3add9eb5b8a438900c084dc0449fcfbd6..8a2c143c0fba288147e5a7bf9db38ffb08367616 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -29,7 +29,7 @@ #include "arch-powerpc.h" #elif defined(__riscv) #include "arch-riscv.h" -#elif defined(__s390x__) +#elif defined(__s390x__) || defined(__s390__) #include "arch-s390.h" #elif defined(__loongarch__) #include "arch-loongarch.h" diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index dad0d9fd7702f8dc9a9a552cda5305f8973c5469..dc5cf19dd74389336b619e15e07f1c635241b7d7 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -72,6 +72,7 @@ IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32= arch/riscv/boot/Image IMAGE_riscv64= arch/riscv/boot/Image IMAGE_s390x = arch/s390/boot/bzImage +IMAGE_s390 = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE= $(objtree)/$(IMAGE_$(XARCH)) IMAGE_NAME = $(notdir $(IMAGE)) @@ -91,6 +92,7 @@ DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32= rv32_defconfig DEFCONFIG_riscv64= defconfig DEFCONFIG_s390x = defconfig +DEFCONFIG_s390 = defconfig compat.config DEFCONFIG_loongarch = defconfig DEFCONFIG= $(DEFCONFIG_$(XARCH)) @@ -112,6 +114,7 @@ QEMU_ARCH_riscv = riscv64 QEMU_ARCH_riscv32= riscv32 QEMU_ARCH_riscv64= riscv64 QEMU_ARCH_s390x = s390x +QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH= $(QEMU_ARCH_$(XARCH)) @@ -140,6 +143,7 @@ QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_T QEMU_ARGS_riscv32= -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64= -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS= -m 1G $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA) @@ -158,6 +162,7 @@ CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) CFLAGS_s390x = -m64 +CFLAGS_s390 = -m31 CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index fe4d48cc054abb1e922b31aa7b6a2395aaf61f5f..79874f669b2a364ad1cb2399b4ebbab5ec6c9de9 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -17,7 +17,7 @@ perform_download=0 test_mode=system werror=1 llvm= -archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv32 riscv64 s390x loongarch" +archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv32 riscv64 s390x s390 loongarch" TEMP=$(getopt -o 'j:d:c:b:a:m:pelh' -n "$0" -- "$@") @@ -171,6 +171,11 @@ test_arch() {
[PATCH 0/2] tools/nolibc: support for 32-bit s390
Support for 32-bit s390 is very easy to implement and useful for testing. For example I used to test some generic compat_ptr() logic, which is only testable on 32-bit s390. The series depends on my other series "selftests/nolibc: test kernel configuration cleanups". (It's not a hard dependency, only a minor diff conflict) Signed-off-by: Thomas Weißschuh --- Thomas Weißschuh (2): selftests/nolibc: rename s390 to s390x tools/nolibc: add support for 32-bit s390 tools/include/nolibc/arch-s390.h| 5 + tools/include/nolibc/arch.h | 2 +- tools/testing/selftests/nolibc/Makefile | 10 -- tools/testing/selftests/nolibc/run-tests.sh | 8 +++- 4 files changed, 21 insertions(+), 4 deletions(-) --- base-commit: 0597614d84c8593ba906418bf3c0c0de1e02e82a change-id: 20250122-nolibc-s390-e57141682c88 Best regards, -- Thomas Weißschuh
Re: [PATCH v5 0/6] vhost: Add support of kthread API
On 12/30/24 6:43 AM, Cindy Lu wrote: > In commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads"), > the vhost now uses vhost_task and operates as a child of the > owner thread. This aligns with containerization principles. > However, this change has caused confusion for some legacy > userspace applications. Therefore, we are reintroducing > support for the kthread API. > > In this series, a new UAPI is implemented to allow > userspace applications to configure their thread mode. > Tested the patches with a hacked up QEMU that can toggle the inherit owner bit so I could test both modes. Tested-by: Mike Christie
[PATCH 1/2] libbpf: BPF program load type enum
Replacing the boolean field with an enum simplifies the addition of new load types. Currently, the bpf_program structure defines the autoload type using a boolean field. This field is now replaced with an enum, allowing new BPF program loading types to be introduced by extending the enum value range. This patch is the first in a series of two. Signed-off-by: Slava Imameev --- tools/lib/bpf/libbpf.c| 50 --- tools/lib/bpf/libbpf.h| 16 ++ tools/lib/bpf/libbpf.map | 2 + .../selftests/bpf/prog_tests/load_type.c | 40 +++ .../selftests/bpf/progs/test_load_type.c | 23 + 5 files changed, 112 insertions(+), 19 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/load_type.c create mode 100644 tools/testing/selftests/bpf/progs/test_load_type.c diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 194809da5172..9af5c0b08b8b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -473,7 +473,7 @@ struct bpf_program { struct bpf_object *obj; int fd; - bool autoload; + enum bpf_prog_load_type load_type; bool autoattach; bool sym_global; bool mark_btf_static; @@ -824,11 +824,11 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, * autoload set to false. */ if (sec_name[0] == '?') { - prog->autoload = false; + prog->load_type = BPF_PROG_LOAD_TYPE_DISABLED; /* from now on forget there was ? in section name */ sec_name++; } else { - prog->autoload = true; + prog->load_type = BPF_PROG_LOAD_TYPE_AUTO; } prog->autoattach = true; @@ -1117,7 +1117,8 @@ static int bpf_object_adjust_struct_ops_autoload(struct bpf_object *obj) } } if (use_cnt) - prog->autoload = should_load; + prog->load_type = should_load ? BPF_PROG_LOAD_TYPE_AUTO + : BPF_PROG_LOAD_TYPE_DISABLED; } return 0; @@ -1202,7 +1203,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) * then bpf_object_adjust_struct_ops_autoload() will update its * autoload accordingly. */ - st_ops->progs[i]->autoload = false; + st_ops->progs[i]->load_type = BPF_PROG_LOAD_TYPE_DISABLED; st_ops->progs[i] = NULL; } @@ -1241,7 +1242,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) * if user replaced it with another program or NULL */ if (st_ops->progs[i] && st_ops->progs[i] != prog) - st_ops->progs[i]->autoload = false; + st_ops->progs[i]->load_type = BPF_PROG_LOAD_TYPE_DISABLED; /* Update the value from the shadow type */ st_ops->progs[i] = prog; @@ -3482,7 +3483,7 @@ static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) } bpf_object__for_each_program(prog, obj) { - if (!prog->autoload) + if (prog->load_type == BPF_PROG_LOAD_TYPE_DISABLED) continue; if (prog_needs_vmlinux_btf(prog)) return true; @@ -5973,7 +5974,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) /* no need to apply CO-RE relocation if the program is * not going to be loaded */ - if (!prog->autoload) + if (prog->load_type == BPF_PROG_LOAD_TYPE_DISABLED) continue; /* adjust insn_idx from section frame of reference to the local @@ -7106,7 +7107,7 @@ static int bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_pat */ if (prog_is_subprog(obj, prog)) continue; - if (!prog->autoload) + if (prog->load_type == BPF_PROG_LOAD_TYPE_DISABLED) continue; err = bpf_object__relocate_calls(obj, prog); @@ -7142,7 +7143,7 @@ static int bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_pat prog = &obj->programs[i]; if (prog_is_subprog(obj, prog)) continue; - if (!prog->autoload) + if (prog->load_type == BPF_PROG_LOAD_TYPE_DISABLED) continue; /* Process data relos for main program
Re: [RFC 1/5] vduse: add virtio_fs to allowed dev id
On Thu, Jan 23, 2025 at 12:32 AM Eugenio Pérez wrote: > > A VDUSE device that implements virtiofs device works fine just by > adding the device id to the whitelist. > > Signed-off-by: Eugenio Pérez Acked-by: Jason Wang Thanks
Re: [RFC 3/5] virtiofs: Move stack sg to fuse_req
On Thu, Jan 23, 2025 at 12:32 AM Eugenio Pérez wrote: > > We need to move the map and unmap out of virtiofs queue lock. We need to > store the unmap sgs to call virtqueue_unmap_sgs, so use the request > itself. > > TODO: We need to store the forget sgs too. In my tests it is not called > so we're safe. Find a way to force-call it. > > Signed-off-by: Eugenio Pérez > --- > fs/fuse/fuse_i.h| 7 +++ > fs/fuse/virtio_fs.c | 45 + > 2 files changed, 28 insertions(+), 24 deletions(-) > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index 74744c6f2860..e57664daa761 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -18,6 +18,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -434,6 +435,12 @@ struct fuse_req { > #if IS_ENABLED(CONFIG_VIRTIO_FS) > /** virtio-fs's physically contiguous buffer for in and out args */ > void *argbuf; > + > + /** virtio-fs's pre-mapped stuff */ > + struct scatterlist sg_inline_data[6]; /* optimization for short > requests */ > + struct scatterlist *sg; > + unsigned int out_sgs; > + unsigned int in_sgs; > #endif As this touches FUSE, I think we need to do a careful benchmark just for this patch to make sure no regression is introduced. A dumb question (know little about FUSE), if this is only used by virtio-fs, could we find a way to embed fuse_req in a virtio-fs dedicated req structure? > > /** fuse_mount this request belongs to */ > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 82afe78ec542..1344c5782a7c 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -1377,14 +1377,10 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > *fsvq, > { > /* requests need at least 4 elements */ > struct scatterlist *stack_sgs[6]; > - struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; > struct scatterlist **sgs = stack_sgs; > - struct scatterlist *sg = stack_sg; > struct virtqueue *vq; > struct fuse_args *args = req->args; > unsigned int argbuf_used = 0; > - unsigned int out_sgs = 0; > - unsigned int in_sgs = 0; > unsigned int total_sgs; > unsigned int i; > int ret; > @@ -1392,11 +1388,13 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > *fsvq, > struct fuse_pqueue *fpq; > > /* Does the sglist fit on the stack? */ > + /* TODO replace magic 6 by a macro */ > + req->sg = req->sg_inline_data; > total_sgs = sg_count_fuse_req(req); > - if (total_sgs > ARRAY_SIZE(stack_sgs)) { > + if (total_sgs > 6) { > sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); > - sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); > - if (!sgs || !sg) { > + req->sg = kmalloc_array(total_sgs, sizeof(req->sg[0]), gfp); > + if (!sgs || !req->sg) { > ret = -ENOMEM; > goto out; > } > @@ -1408,26 +1406,25 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > *fsvq, > goto out; > > /* Request elements */ > - sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); > - out_sgs += sg_init_fuse_args(&sg[out_sgs], req, > -(struct fuse_arg *)args->in_args, > -args->in_numargs, args->in_pages, > -req->argbuf, &argbuf_used); > + sg_init_one(&req->sg[req->out_sgs++], &req->in.h, sizeof(req->in.h)); > + req->out_sgs += sg_init_fuse_args(&req->sg[req->out_sgs], req, > + (struct fuse_arg *)args->in_args, > + args->in_numargs, args->in_pages, > + req->argbuf, &argbuf_used); > > /* Reply elements */ > if (test_bit(FR_ISREPLY, &req->flags)) { > - sg_init_one(&sg[out_sgs + in_sgs++], > + sg_init_one(&req->sg[req->out_sgs + req->in_sgs++], > &req->out.h, sizeof(req->out.h)); > - in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, > - args->out_args, args->out_numargs, > - args->out_pages, > - req->argbuf + argbuf_used, NULL); > + req->in_sgs += sg_init_fuse_args(&req->sg[req->out_sgs + > req->in_sgs], req, > +args->out_args, > args->out_numargs, > +args->out_pages, > +req->argbuf + argbuf_used, > NULL); > } > > - WARN_ON(out_sgs + in_sgs != total_sgs); > - > for (i = 0; i < tota
Re: [RFC 5/5] virtiofs: perform DMA operations out of the spinlock
On Thu, Jan 23, 2025 at 12:32 AM Eugenio Pérez wrote: > > This is useful for some setups like swiotlb or VDUSE where the DMA > operations are expensive and/or need to be performed with a write lock. > > After applying this patch, fio read test goes from 1201MiB/s to 1211MiB/s. The difference is too small to differentiate it from the noise. I would suggest to test with different setups. 1) SWIOTLB 2) VDUSE Note that SWIOTLB will do bouncing even for DMA_FROM_DEVICE, so I think we may see better performance there. And we need to try with different request size, I did a similar patch for virtio-blk and I see better performance for large request like 1M etc. Thanks > > Signed-off-by: Eugenio Pérez > --- > drivers/virtio/virtio_ring.c | 2 ++ > fs/fuse/virtio_fs.c | 25 +++-- > 2 files changed, 25 insertions(+), 2 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index e49912fa77c5..eb22bfcb9100 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -580,6 +580,7 @@ int virtqueue_map_sgs(struct virtqueue *_vq, > goto unmap_release; > > sg_dma_address(sg) = addr; > + sg_dma_len(sg) = sg->length; > mapped_sg++; > } > } > @@ -592,6 +593,7 @@ int virtqueue_map_sgs(struct virtqueue *_vq, > goto unmap_release; > > sg_dma_address(sg) = addr; > + sg_dma_len(sg) = sg->length; > mapped_sg++; > } > } > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 1344c5782a7c..2b558b05d0f8 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -836,8 +836,21 @@ static void virtio_fs_requests_done_work(struct > work_struct *work) > > /* End requests */ > list_for_each_entry_safe(req, next, &reqs, list) { > + struct scatterlist *stack_sgs[6]; > + struct scatterlist **sgs = stack_sgs; > + unsigned int total_sgs = req->out_sgs + req->in_sgs; > + > list_del_init(&req->list); > > + /* TODO replace magic 6 by a macro */ > + if (total_sgs > 6) > + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), > GFP_ATOMIC); > + > + for (unsigned int i = 0; i < total_sgs; ++i) > + sgs[i] = &req->sg[i]; > + > + virtqueue_unmap_sgs(vq, sgs, req->out_sgs, req->in_sgs); > + > /* blocking async request completes in a worker context */ > if (req->args->may_block) { > struct virtio_fs_req_work *w; > @@ -850,6 +863,9 @@ static void virtio_fs_requests_done_work(struct > work_struct *work) > } else { > virtio_fs_request_complete(req, fsvq); > } > + > + if (sgs != stack_sgs) > + kfree(sgs); > } > > /* Try to push previously queued requests, as the queue might no > longer be full */ > @@ -1426,6 +1442,11 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > *fsvq, > sgs[i] = &req->sg[i]; > WARN_ON(req->out_sgs + req->in_sgs != total_sgs); > > + // TODO can we change this ptr out of the lock? > + vq = fsvq->vq; > + // TODO handle this and following errors > + ret = virtqueue_map_sgs(vq, sgs, req->out_sgs, req->in_sgs); > + BUG_ON(ret < 0); > spin_lock(&fsvq->lock); > > if (!fsvq->connected) { > @@ -1434,8 +1455,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > *fsvq, > goto out; > } > > - vq = fsvq->vq; > - ret = virtqueue_add_sgs(vq, sgs, req->out_sgs, req->in_sgs, req, > GFP_ATOMIC); > + ret = virtqueue_add_sgs_premapped(vq, sgs, req->out_sgs, > + req->in_sgs, req, GFP_ATOMIC); > if (ret < 0) { > spin_unlock(&fsvq->lock); > goto out; > -- > 2.48.1 >
Re: [PATCH] vduse: add virtio_fs to allowed dev id
On Wed, Jan 22, 2025 at 11:49 PM Stefan Hajnoczi wrote: > > On Tue, Jan 21, 2025 at 11:33:46AM +0100, Eugenio Pérez wrote: > > A VDUSE device that implements virtiofs device works fine just by > > adding the device id to the whitelist. > > The virtiofs DAX Window feature might require extra work. It is not > widely enabled though, so must virtiofs devices will work fine with just > virtqueue and configuration space support. +1. The DAX Window may help to reduce the bounce buffer copying and fit for the case where an AI application just accesses one single large file. We need to consider implementing this via VDUSE. Thanks > > Reviewed-by: Stefan Hajnoczi
Re: [RFC 2/5] virtio_ring: introduce virtqueue_map/unmap_sgs()
On Thu, Jan 23, 2025 at 12:32 AM Eugenio Pérez wrote: > > From: Jason Wang > > Introduce new virtqueue DMA operations which allows the drivers that > want to make use of the premapping API but operate at the sg level. > > Note that we still follow the assumtions if virtqueue_add() so > dma_map_sg() is not used. This could be optimized in the future. > Signed-off-by: Jason Wang > Signed-off-by: Eugenio Pérez > -- > Eugenio's changes: Remove blank > TODO: Should we call directly dma_map instead of this? XDP do the direct > call. Note that we should have an indirection layer as the device is not necessarily DMA capable. And we probably need to rename the virtqueue_dma_map_XXX() to virtqueue_map_XXX() Thanks > --- > drivers/virtio/virtio_ring.c | 128 +++ > include/linux/virtio.h | 10 +++ > 2 files changed, 125 insertions(+), 13 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index fdd2d2b07b5a..05729bc5cbb1 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -359,6 +359,26 @@ static struct device *vring_dma_dev(const struct > vring_virtqueue *vq) > return vq->dma_dev; > } > > +static int __vring_map_one_sg(const struct vring_virtqueue *vq, > + struct scatterlist *sg, > + enum dma_data_direction direction, > + dma_addr_t *addr) > +{ > + /* > +* We can't use dma_map_sg, because we don't use scatterlists in > +* the way it expects (we don't guarantee that the scatterlist > +* will exist for the lifetime of the mapping). > +*/ > + *addr = dma_map_page(vring_dma_dev(vq), > + sg_page(sg), sg->offset, sg->length, > + direction); > + > + if (dma_mapping_error(vring_dma_dev(vq), *addr)) > + return -ENOMEM; > + > + return 0; > +} > + > /* Map one sg entry. */ > static int vring_map_one_sg(const struct vring_virtqueue *vq, struct > scatterlist *sg, > enum dma_data_direction direction, dma_addr_t > *addr, > @@ -383,19 +403,7 @@ static int vring_map_one_sg(const struct vring_virtqueue > *vq, struct scatterlist > return 0; > } > > - /* > -* We can't use dma_map_sg, because we don't use scatterlists in > -* the way it expects (we don't guarantee that the scatterlist > -* will exist for the lifetime of the mapping). > -*/ > - *addr = dma_map_page(vring_dma_dev(vq), > - sg_page(sg), sg->offset, sg->length, > - direction); > - > - if (dma_mapping_error(vring_dma_dev(vq), *addr)) > - return -ENOMEM; > - > - return 0; > + return __vring_map_one_sg(vq, sg, direction, addr); > } > > static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, > @@ -526,6 +534,100 @@ static inline unsigned int > virtqueue_add_desc_split(struct virtqueue *vq, > return next; > } > > +void virtqueue_unmap_sgs(struct virtqueue *_vq, > +struct scatterlist *sgs[], > +unsigned int out_sgs, > +unsigned int in_sgs) > +{ > + struct vring_virtqueue *vq = to_vvq(_vq); > + struct scatterlist *sg; > + int n; > + > + for (n = 0; n < out_sgs; n++) { > + for (sg = sgs[n]; sg; sg = sg_next(sg)) { > + dma_unmap_page(vring_dma_dev(vq), > + sg_dma_address(sg), > + sg->length, > + DMA_TO_DEVICE); > + } > + } > + > + for (; n < (out_sgs + in_sgs); n++) { > + for (sg = sgs[n]; sg; sg = sg_next(sg)) { > + dma_unmap_page(vring_dma_dev(vq), > + sg_dma_address(sg), > + sg->length, > + DMA_FROM_DEVICE); > + } > + } > +} > +EXPORT_SYMBOL_GPL(virtqueue_unmap_sgs); > + > +int virtqueue_map_sgs(struct virtqueue *_vq, > + struct scatterlist *sgs[], > + unsigned int out_sgs, > + unsigned int in_sgs) > +{ > + struct vring_virtqueue *vq = to_vvq(_vq); > + int i, n, mapped_sg = 0; > + struct scatterlist *sg; > + > + for (n = 0; n < out_sgs; n++) { > + for (sg = sgs[n]; sg; sg = sg_next(sg)) { > + dma_addr_t addr; > + > + if (__vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) > + goto unmap_release; > + > + sg_dma_address(sg) = addr; > + mapped_sg++; > + } > + } > + > + for (; n < (out
Re: [PATCH] media: add virtio-media driver
Hi Alexandre, kernel test robot noticed the following build warnings: [auto build test WARNING on ffd294d346d185b70e28b1a28abe367bbfe53c04] url: https://github.com/intel-lab-lkp/linux/commits/Alexandre-Courbot/media-add-virtio-media-driver/20250123-085559 base: ffd294d346d185b70e28b1a28abe367bbfe53c04 patch link: https://lore.kernel.org/r/20250123-virtio-media-v1-1-81e2549b86b9%40gmail.com patch subject: [PATCH] media: add virtio-media driver reproduce: (https://download.01.org/0day-ci/archive/20250123/202501231055.jvzbf96y-...@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202501231055.jvzbf96y-...@intel.com/ versioncheck warnings: (new ones prefixed by >>) INFO PATH=/opt/cross/rustc-1.78.0-bindgen-0.65.1/cargo/bin:/opt/cross/clang-19/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin /usr/bin/timeout -k 100 3h /usr/bin/make KCFLAGS= -Wtautological-compare -Wno-error=return-type -Wreturn-type -Wcast-function-type -funsigned-char -Wundef -fstrict-flex-arrays=3 -Wformat-overflow -Wformat-truncation -Wenum-conversion W=1 --keep-going LLVM=1 -j32 ARCH=x86_64 versioncheck find ./* \( -name SCCS -o -name BitKeeper -o -name .svn -o -name CVS -o -name .pc -o -name .hg -o -name .git \) -prune -o \ -name '*.[hcS]' -type f -print | sort \ | xargs perl -w ./scripts/checkversion.pl >> ./drivers/media/virtio/virtio_media_driver.c: 22 linux/version.h not needed. ./drivers/staging/gpib/lpvo_usb_gpib/lpvo_usb_gpib.c: 13 linux/version.h not needed. ./samples/bpf/spintest.bpf.c: 8 linux/version.h not needed. ./tools/lib/bpf/bpf_helpers.h: 424: need linux/version.h ./tools/testing/selftests/bpf/progs/dev_cgroup.c: 9 linux/version.h not needed. ./tools/testing/selftests/bpf/progs/netcnt_prog.c: 3 linux/version.h not needed. ./tools/testing/selftests/bpf/progs/test_map_lock.c: 4 linux/version.h not needed. ./tools/testing/selftests/bpf/progs/test_send_signal_kern.c: 4 linux/version.h not needed. ./tools/testing/selftests/bpf/progs/test_spin_lock.c: 4 linux/version.h not needed. ./tools/testing/selftests/bpf/progs/test_tcp_estats.c: 37 linux/version.h not needed. ./tools/testing/selftests/wireguard/qemu/init.c: 27 linux/version.h not needed. -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
Re: [RFC 5/5] virtiofs: perform DMA operations out of the spinlock
On Thu, Jan 23, 2025 at 9:56 AM Jason Wang wrote: > > On Thu, Jan 23, 2025 at 12:32 AM Eugenio Pérez wrote: > > > > This is useful for some setups like swiotlb or VDUSE where the DMA > > operations are expensive and/or need to be performed with a write lock. > > > > After applying this patch, fio read test goes from 1201MiB/s to 1211MiB/s. > > The difference is too small to differentiate it from the noise. > > I would suggest to test with different setups. > > 1) SWIOTLB 2) VDUSE > > Note that SWIOTLB will do bouncing even for DMA_FROM_DEVICE, I meant dma map in this case actually. Thanks > so I > think we may see better performance there. > > And we need to try with different request size, I did a similar patch > for virtio-blk and I see better performance for large request like 1M > etc. > > Thanks > > > > > Signed-off-by: Eugenio Pérez > > --- > > drivers/virtio/virtio_ring.c | 2 ++ > > fs/fuse/virtio_fs.c | 25 +++-- > > 2 files changed, 25 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > > index e49912fa77c5..eb22bfcb9100 100644 > > --- a/drivers/virtio/virtio_ring.c > > +++ b/drivers/virtio/virtio_ring.c > > @@ -580,6 +580,7 @@ int virtqueue_map_sgs(struct virtqueue *_vq, > > goto unmap_release; > > > > sg_dma_address(sg) = addr; > > + sg_dma_len(sg) = sg->length; > > mapped_sg++; > > } > > } > > @@ -592,6 +593,7 @@ int virtqueue_map_sgs(struct virtqueue *_vq, > > goto unmap_release; > > > > sg_dma_address(sg) = addr; > > + sg_dma_len(sg) = sg->length; > > mapped_sg++; > > } > > } > > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > > index 1344c5782a7c..2b558b05d0f8 100644 > > --- a/fs/fuse/virtio_fs.c > > +++ b/fs/fuse/virtio_fs.c > > @@ -836,8 +836,21 @@ static void virtio_fs_requests_done_work(struct > > work_struct *work) > > > > /* End requests */ > > list_for_each_entry_safe(req, next, &reqs, list) { > > + struct scatterlist *stack_sgs[6]; > > + struct scatterlist **sgs = stack_sgs; > > + unsigned int total_sgs = req->out_sgs + req->in_sgs; > > + > > list_del_init(&req->list); > > > > + /* TODO replace magic 6 by a macro */ > > + if (total_sgs > 6) > > + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), > > GFP_ATOMIC); > > + > > + for (unsigned int i = 0; i < total_sgs; ++i) > > + sgs[i] = &req->sg[i]; > > + > > + virtqueue_unmap_sgs(vq, sgs, req->out_sgs, req->in_sgs); > > + > > /* blocking async request completes in a worker context */ > > if (req->args->may_block) { > > struct virtio_fs_req_work *w; > > @@ -850,6 +863,9 @@ static void virtio_fs_requests_done_work(struct > > work_struct *work) > > } else { > > virtio_fs_request_complete(req, fsvq); > > } > > + > > + if (sgs != stack_sgs) > > + kfree(sgs); > > } > > > > /* Try to push previously queued requests, as the queue might no > > longer be full */ > > @@ -1426,6 +1442,11 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > > *fsvq, > > sgs[i] = &req->sg[i]; > > WARN_ON(req->out_sgs + req->in_sgs != total_sgs); > > > > + // TODO can we change this ptr out of the lock? > > + vq = fsvq->vq; > > + // TODO handle this and following errors > > + ret = virtqueue_map_sgs(vq, sgs, req->out_sgs, req->in_sgs); > > + BUG_ON(ret < 0); > > spin_lock(&fsvq->lock); > > > > if (!fsvq->connected) { > > @@ -1434,8 +1455,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > > *fsvq, > > goto out; > > } > > > > - vq = fsvq->vq; > > - ret = virtqueue_add_sgs(vq, sgs, req->out_sgs, req->in_sgs, req, > > GFP_ATOMIC); > > + ret = virtqueue_add_sgs_premapped(vq, sgs, req->out_sgs, > > + req->in_sgs, req, GFP_ATOMIC); > > if (ret < 0) { > > spin_unlock(&fsvq->lock); > > goto out; > > -- > > 2.48.1 > >
Re: [RFC net-next v3 2/4] virtio_net: Prepare for NAPI to queue mapping
On Thu, Jan 23, 2025 at 10:40:43AM +0800, Jason Wang wrote: > On Thu, Jan 23, 2025 at 1:41 AM Joe Damato wrote: > > > > On Wed, Jan 22, 2025 at 02:12:46PM +0800, Jason Wang wrote: > > > On Wed, Jan 22, 2025 at 3:11 AM Joe Damato wrote: > > > > > > > > Slight refactor to prepare the code for NAPI to queue mapping. No > > > > functional changes. > > > > > > > > Signed-off-by: Joe Damato > > > > Reviewed-by: Gerhard Engleder > > > > Tested-by: Lei Yang > > > > --- > > > > v2: > > > >- Previously patch 1 in the v1. > > > >- Added Reviewed-by and Tested-by tags to commit message. No > > > > functional changes. > > > > > > > > drivers/net/virtio_net.c | 10 -- > > > > 1 file changed, 8 insertions(+), 2 deletions(-) > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > index 7646ddd9bef7..cff18c66b54a 100644 > > > > --- a/drivers/net/virtio_net.c > > > > +++ b/drivers/net/virtio_net.c > > > > @@ -2789,7 +2789,8 @@ static void skb_recv_done(struct virtqueue *rvq) > > > > virtqueue_napi_schedule(&rq->napi, rvq); > > > > } > > > > > > > > -static void virtnet_napi_enable(struct virtqueue *vq, struct > > > > napi_struct *napi) > > > > +static void virtnet_napi_do_enable(struct virtqueue *vq, > > > > + struct napi_struct *napi) > > > > { > > > > napi_enable(napi); > > > > > > Nit: it might be better to not have this helper to avoid a misuse of > > > this function directly. > > > > Sorry, I'm probably missing something here. > > > > Both virtnet_napi_enable and virtnet_napi_tx_enable need the logic > > in virtnet_napi_do_enable. > > > > Are you suggesting that I remove virtnet_napi_do_enable and repeat > > the block of code in there twice (in virtnet_napi_enable and > > virtnet_napi_tx_enable)? > > I think I miss something here, it looks like virtnet_napi_tx_enable() > calls virtnet_napi_do_enable() directly. > > I would like to know why we don't call netif_queue_set_napi() for TX NAPI > here? Please see both the cover letter and the commit message of the next commit which addresses this question. TX-only NAPIs do not have NAPI IDs so there is nothing to map.
Re: [PATCH 1/3] remoteproc: k3-r5: Fix checks in k3_r5_rproc_{mbox_callback/kick}
On 22/01/25 00:17, Andrew Davis wrote: On 12/24/24 3:14 AM, Beleswar Padhi wrote: Commit f3f11cfe8907 ("remoteproc: k3-r5: Acquire mailbox handle during probe routine") introduced a check in the "k3_r5_rproc_mbox_callback()" and "k3_r5_rproc_kick()" callbacks to exit if the remote core's state was "RPROC_DETACHED". However, this caused issues in IPC-only mode, as the default state of the core is set to "RPROC_DETACHED", and the transition to "RPROC_ATTACHED" happens only after the "__rproc_attach()" function has invoked "rproc_start_subdevices()". Sounds like the core issue was making assumptions about the state of a variable that is usually only used internally by the rproc core (rproc->state). Yes that is right, so we now combined internal rproc state (is_attach_ongoing) with core framework's rproc->state. Instead, what would be the harm in just dropping the state check? Messages coming from a detached core should be processed the same. Taking a look at k3_r5/dsp_rproc_mbox_callback(), today we don't really do much other than just log a print to console when we receive non-virtio mailbox messages. If we get a virtio mailbox message from a detached core, that will anyways be dropped in the further layers. So, I am okay with removing these checks entirely. These additional checks would be needed when we decide to do something more than just logging prints, a problem for another day though. I will respin the series with these checks dropped. Thanks, Beleswar Andrew The "rproc_start_subdevices()" function triggers the probe of Virtio RPMsg subdevices, which require the mailbox callbacks to be functional. To resolve this, a new variable, "is_attach_ongoing", is introduced to distinguish between core states: when a core is actually detached and when it is in the process of being attached. The callbacks are updated to return early only if the core is actually detached and not during an ongoing attach operation in IPC-only mode. Reported-by: Siddharth Vadapalli Closes: https://lore.kernel.org/all/20240916083131.2801755-1-s-vadapa...@ti.com/ Fixes: f3f11cfe8907 ("remoteproc: k3-r5: Acquire mailbox handle during probe routine") Signed-off-by: Beleswar Padhi --- Link to RFC version: https://lore.kernel.org/all/20240916083131.2801755-1-s-vadapa...@ti.com/ Improvements in v1: 1. Ensured these mbox callbacks are functional when the core is in the proccess of getting attached in IPC-Only mode. 2. Ensured these mbox callbacks are _not_ functional when the core state is actually detached. drivers/remoteproc/ti_k3_r5_remoteproc.c | 53 +--- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c b/drivers/remoteproc/ti_k3_r5_remoteproc.c index dbc513c5569c..e218a803fdb5 100644 --- a/drivers/remoteproc/ti_k3_r5_remoteproc.c +++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c @@ -131,6 +131,7 @@ struct k3_r5_cluster { * @btcm_enable: flag to control BTCM enablement * @loczrama: flag to dictate which TCM is at device address 0x0 * @released_from_reset: flag to signal when core is out of reset + * @is_attach_ongoing: flag to indicate if IPC-only "attach()" is in progress */ struct k3_r5_core { struct list_head elem; @@ -148,6 +149,7 @@ struct k3_r5_core { u32 btcm_enable; u32 loczrama; bool released_from_reset; + bool is_attach_ongoing; }; /** @@ -194,8 +196,11 @@ static void k3_r5_rproc_mbox_callback(struct mbox_client *client, void *data) const char *name = kproc->rproc->name; u32 msg = omap_mbox_message(data); - /* Do not forward message from a detached core */ - if (kproc->rproc->state == RPROC_DETACHED) + /* + * Do not forward messages from a detached core, except when the core + * is in the process of being attached in IPC-only mode. + */ + if (!kproc->core->is_attach_ongoing && kproc->rproc->state == RPROC_DETACHED) return; dev_dbg(dev, "mbox msg: 0x%x\n", msg); @@ -233,8 +238,11 @@ static void k3_r5_rproc_kick(struct rproc *rproc, int vqid) mbox_msg_t msg = (mbox_msg_t)vqid; int ret; - /* Do not forward message to a detached core */ - if (kproc->rproc->state == RPROC_DETACHED) + /* + * Do not forward messages to a detached core, except when the core is + * in the process of being attached in IPC-only mode. + */ + if (!kproc->core->is_attach_ongoing && kproc->rproc->state == RPROC_DETACHED) return; /* send the index of the triggered virtqueue in the mailbox payload */ @@ -671,22 +679,39 @@ static int k3_r5_rproc_stop(struct rproc *rproc) /* * Attach to a running R5F remote processor (IPC-only mode) * - * The R5F attach callback is a NOP. The remote processor is already booted, and - * all required resources have been acquired during probe routine, so there is - * no need to issue any TI-SCI commands to boot the R5F cores in
[PATCH] wireguard: selftests: Cleanup CONFIG_UBSAN_SANITIZE_ALL
Commit 918327e9b7ff ("ubsan: Remove CONFIG_UBSAN_SANITIZE_ALL") removed the CONFIG_UBSAN_SANITIZE_ALL configuration option. Eliminate invalid configurations to improve code readability. Signed-off-by: WangYuli --- tools/testing/selftests/wireguard/qemu/debug.config | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 139fd9aa8b12..828f14300d0a 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -22,7 +22,6 @@ CONFIG_HAVE_ARCH_KASAN=y CONFIG_KASAN=y CONFIG_KASAN_INLINE=y CONFIG_UBSAN=y -CONFIG_UBSAN_SANITIZE_ALL=y CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_SHIRQ=y -- 2.45.2
Re: [RFC net-next v3 2/4] virtio_net: Prepare for NAPI to queue mapping
On Thu, Jan 23, 2025 at 1:41 AM Joe Damato wrote: > > On Wed, Jan 22, 2025 at 02:12:46PM +0800, Jason Wang wrote: > > On Wed, Jan 22, 2025 at 3:11 AM Joe Damato wrote: > > > > > > Slight refactor to prepare the code for NAPI to queue mapping. No > > > functional changes. > > > > > > Signed-off-by: Joe Damato > > > Reviewed-by: Gerhard Engleder > > > Tested-by: Lei Yang > > > --- > > > v2: > > >- Previously patch 1 in the v1. > > >- Added Reviewed-by and Tested-by tags to commit message. No > > > functional changes. > > > > > > drivers/net/virtio_net.c | 10 -- > > > 1 file changed, 8 insertions(+), 2 deletions(-) > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > index 7646ddd9bef7..cff18c66b54a 100644 > > > --- a/drivers/net/virtio_net.c > > > +++ b/drivers/net/virtio_net.c > > > @@ -2789,7 +2789,8 @@ static void skb_recv_done(struct virtqueue *rvq) > > > virtqueue_napi_schedule(&rq->napi, rvq); > > > } > > > > > > -static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct > > > *napi) > > > +static void virtnet_napi_do_enable(struct virtqueue *vq, > > > + struct napi_struct *napi) > > > { > > > napi_enable(napi); > > > > Nit: it might be better to not have this helper to avoid a misuse of > > this function directly. > > Sorry, I'm probably missing something here. > > Both virtnet_napi_enable and virtnet_napi_tx_enable need the logic > in virtnet_napi_do_enable. > > Are you suggesting that I remove virtnet_napi_do_enable and repeat > the block of code in there twice (in virtnet_napi_enable and > virtnet_napi_tx_enable)? I think I miss something here, it looks like virtnet_napi_tx_enable() calls virtnet_napi_do_enable() directly. I would like to know why we don't call netif_queue_set_napi() for TX NAPI here? Thanks > > Just seemed like a lot of code to repeat twice and a helper would be > cleaner? > > Let me know; since net-next is closed there is a plenty of time to > get this to where you'd like it to be before new code is accepted. > > > Other than this. > > > > Acked-by: Jason Wang > > Thanks. >
Re: [PATCH] vduse: add virtio_fs to allowed dev id
On Thu, Jan 23, 2025 at 2:50 AM Jason Wang wrote: > > On Wed, Jan 22, 2025 at 11:49 PM Stefan Hajnoczi wrote: > > > > On Tue, Jan 21, 2025 at 11:33:46AM +0100, Eugenio Pérez wrote: > > > A VDUSE device that implements virtiofs device works fine just by > > > adding the device id to the whitelist. > > > > The virtiofs DAX Window feature might require extra work. It is not > > widely enabled though, so must virtiofs devices will work fine with just > > virtqueue and configuration space support. > > +1. > > The DAX Window may help to reduce the bounce buffer copying and fit > for the case where an AI application just accesses one single large > file. > > We need to consider implementing this via VDUSE. > > Thanks > > I thought DAX can only be applied in the VM case. Is it possible to apply it using virtio_vdpa? Is it possible to detect that the device is trying to use DAX so it fails the negotiation?
Re: [RFC 2/5] virtio_ring: introduce virtqueue_map/unmap_sgs()
On Thu, Jan 23, 2025 at 2:51 AM Jason Wang wrote: > > On Thu, Jan 23, 2025 at 12:32 AM Eugenio Pérez wrote: > > > > From: Jason Wang > > > > Introduce new virtqueue DMA operations which allows the drivers that > > want to make use of the premapping API but operate at the sg level. > > > > Note that we still follow the assumtions if virtqueue_add() so > > dma_map_sg() is not used. This could be optimized in the future. > > Signed-off-by: Jason Wang > > Signed-off-by: Eugenio Pérez > > -- > > Eugenio's changes: Remove blank > > TODO: Should we call directly dma_map instead of this? XDP do the direct > > call. > > Note that we should have an indirection layer as the device is not > necessarily DMA capable. > Can you expand on this? virtqueue_map_sgs calls directly to dma_map_page for each entry in the sg. This indirection layer should be in the device's callback for DMA, isn't it? > And we probably need to rename the virtqueue_dma_map_XXX() to > virtqueue_map_XXX() > Ok so maybe these functions are already enough without using dma_map_sgs. I"m ok with renaming these, but that should be done in a separate series, isn't it? Thanks!
Re: [RFC 3/5] virtiofs: Move stack sg to fuse_req
On Thu, Jan 23, 2025 at 2:54 AM Jason Wang wrote: > > On Thu, Jan 23, 2025 at 12:32 AM Eugenio Pérez wrote: > > > > We need to move the map and unmap out of virtiofs queue lock. We need to > > store the unmap sgs to call virtqueue_unmap_sgs, so use the request > > itself. > > > > TODO: We need to store the forget sgs too. In my tests it is not called > > so we're safe. Find a way to force-call it. > > > > Signed-off-by: Eugenio Pérez > > --- > > fs/fuse/fuse_i.h| 7 +++ > > fs/fuse/virtio_fs.c | 45 + > > 2 files changed, 28 insertions(+), 24 deletions(-) > > > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > > index 74744c6f2860..e57664daa761 100644 > > --- a/fs/fuse/fuse_i.h > > +++ b/fs/fuse/fuse_i.h > > @@ -18,6 +18,7 @@ > > #include > > #include > > #include > > +#include > > #include > > #include > > #include > > @@ -434,6 +435,12 @@ struct fuse_req { > > #if IS_ENABLED(CONFIG_VIRTIO_FS) > > /** virtio-fs's physically contiguous buffer for in and out args */ > > void *argbuf; > > + > > + /** virtio-fs's pre-mapped stuff */ > > + struct scatterlist sg_inline_data[6]; /* optimization for short > > requests */ > > + struct scatterlist *sg; > > + unsigned int out_sgs; > > + unsigned int in_sgs; > > #endif > > As this touches FUSE, I think we need to do a careful benchmark just > for this patch to make sure no regression is introduced. > Sure, I can do it. But the struct already has a virtiofs field (argbuf). > A dumb question (know little about FUSE), if this is only used by > virtio-fs, could we find a way to embed fuse_req in a virtio-fs > dedicated req structure? > I guess FUSE can ask virtiofs to allocate the struct for sure. > > > > /** fuse_mount this request belongs to */ > > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > > index 82afe78ec542..1344c5782a7c 100644 > > --- a/fs/fuse/virtio_fs.c > > +++ b/fs/fuse/virtio_fs.c > > @@ -1377,14 +1377,10 @@ static int virtio_fs_enqueue_req(struct > > virtio_fs_vq *fsvq, > > { > > /* requests need at least 4 elements */ > > struct scatterlist *stack_sgs[6]; > > - struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; > > struct scatterlist **sgs = stack_sgs; > > - struct scatterlist *sg = stack_sg; > > struct virtqueue *vq; > > struct fuse_args *args = req->args; > > unsigned int argbuf_used = 0; > > - unsigned int out_sgs = 0; > > - unsigned int in_sgs = 0; > > unsigned int total_sgs; > > unsigned int i; > > int ret; > > @@ -1392,11 +1388,13 @@ static int virtio_fs_enqueue_req(struct > > virtio_fs_vq *fsvq, > > struct fuse_pqueue *fpq; > > > > /* Does the sglist fit on the stack? */ > > + /* TODO replace magic 6 by a macro */ > > + req->sg = req->sg_inline_data; > > total_sgs = sg_count_fuse_req(req); > > - if (total_sgs > ARRAY_SIZE(stack_sgs)) { > > + if (total_sgs > 6) { > > sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); > > - sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); > > - if (!sgs || !sg) { > > + req->sg = kmalloc_array(total_sgs, sizeof(req->sg[0]), gfp); > > + if (!sgs || !req->sg) { > > ret = -ENOMEM; > > goto out; > > } > > @@ -1408,26 +1406,25 @@ static int virtio_fs_enqueue_req(struct > > virtio_fs_vq *fsvq, > > goto out; > > > > /* Request elements */ > > - sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); > > - out_sgs += sg_init_fuse_args(&sg[out_sgs], req, > > -(struct fuse_arg *)args->in_args, > > -args->in_numargs, args->in_pages, > > -req->argbuf, &argbuf_used); > > + sg_init_one(&req->sg[req->out_sgs++], &req->in.h, > > sizeof(req->in.h)); > > + req->out_sgs += sg_init_fuse_args(&req->sg[req->out_sgs], req, > > + (struct fuse_arg *)args->in_args, > > + args->in_numargs, args->in_pages, > > + req->argbuf, &argbuf_used); > > > > /* Reply elements */ > > if (test_bit(FR_ISREPLY, &req->flags)) { > > - sg_init_one(&sg[out_sgs + in_sgs++], > > + sg_init_one(&req->sg[req->out_sgs + req->in_sgs++], > > &req->out.h, sizeof(req->out.h)); > > - in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, > > - args->out_args, > > args->out_numargs, > > - args->out_pages, > > - req->argbuf + argbuf_used, > > NULL); > > +
[PATCH v2 1/5] selftests/nolibc: drop custom EXTRACONFIG functionality
kbuild already contains logic to merge predefines snippets into a defconfig file. This already works nicely with the current "defconfig" target. Make use of the snippet and drop the custom logic. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 7d14a7c0cb62608f328b251495264517d333db2e..ba044c8a042ce345ff90bdd35569de4b5acd117d 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -82,7 +82,7 @@ DEFCONFIG_x86= defconfig DEFCONFIG_arm64 = defconfig DEFCONFIG_arm= multi_v7_defconfig DEFCONFIG_mips32le = malta_defconfig -DEFCONFIG_mips32be = malta_defconfig +DEFCONFIG_mips32be = malta_defconfig generic/eb.config DEFCONFIG_ppc= pmac32_defconfig DEFCONFIG_ppc64 = powernv_be_defconfig DEFCONFIG_ppc64le= powernv_defconfig @@ -93,9 +93,6 @@ DEFCONFIG_s390 = defconfig DEFCONFIG_loongarch = defconfig DEFCONFIG= $(DEFCONFIG_$(XARCH)) -EXTRACONFIG_mips32be = -d CONFIG_CPU_LITTLE_ENDIAN -e CONFIG_CPU_BIG_ENDIAN -EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) - # optional tests to run (default = all) TEST = @@ -265,10 +262,6 @@ initramfs: nolibc-test defconfig: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare - $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ - $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ - $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ - fi kernel: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -- 2.48.1
[PATCH v2 0/5] selftests/nolibc: test kernel configuration cleanups
A few cleanups and optimizations for the management of the kernel configuration. Signed-off-by: Thomas Weißschuh --- Changes in v2: - Rebase on current torvalds/master - Call "make defconfig" separately - Link to v1: https://lore.kernel.org/r/20250122-nolibc-config-v1-0-a697db968...@weissschuh.net --- Thomas Weißschuh (5): selftests/nolibc: drop custom EXTRACONFIG functionality selftests/nolibc: drop call to prepare target selftests/nolibc: drop call to mrproper target selftests/nolibc: execute defconfig before other targets selftests/nolibc: always keep test kernel configuration up to date tools/testing/selftests/nolibc/Makefile | 17 + tools/testing/selftests/nolibc/run-tests.sh | 7 +++ 2 files changed, 8 insertions(+), 16 deletions(-) --- base-commit: 21266b8df5224c4f677acf9f353eecc9094731f0 change-id: 20250122-nolibc-config-d639e1612c93 Best regards, -- Thomas Weißschuh
[PATCH v2 3/5] selftests/nolibc: drop call to mrproper target
"mrproper" unnecessarily cleans a lot of files. kbuild is smart enough to handle changed configurations, so the cleanup is not necessary and only leads to excessive rebuilds. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 464165e3d9175d283ec0ed14765df29427b6de38..d3afb71b4c6b7fc51b89f034c826692e76122864 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -261,7 +261,7 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) kernel: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -- 2.48.1
[PATCH v2 5/5] selftests/nolibc: always keep test kernel configuration up to date
Avoid using a stale test kernel configuration by always synchronizing it to the current source tree. kbuild is smart enough to avoid spurious rebuilds. Shuffle the code around a bit to keep all the commands with side-effects together. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/run-tests.sh | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 9c5160c5388122deeeb59ecfced7633000d69b10..bc4e92b4f1b98278a0a72345a5cd67f1a429b6a2 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -157,10 +157,6 @@ test_arch() { fi MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") - mkdir -p "$build_dir" - if [ "$test_mode" = "system" ] && [ ! -f "${build_dir}/.config" ]; then - swallow_output "${MAKE[@]}" defconfig - fi case "$test_mode" in 'system') test_target=run @@ -173,6 +169,9 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" + + mkdir -p "$build_dir" + swallow_output "${MAKE[@]}" defconfig swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" V=1 cp run.out run.out."${arch}" "${MAKE[@]}" report | grep passed -- 2.48.1
[PATCH v2 2/5] selftests/nolibc: drop call to prepare target
The "prepare" target does not need to be run manually. kbuild knows when to use it on its own and the target is not even documented. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index ba044c8a042ce345ff90bdd35569de4b5acd117d..464165e3d9175d283ec0ed14765df29427b6de38 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -261,7 +261,7 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) kernel: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -- 2.48.1
[PATCH v2 4/5] selftests/nolibc: execute defconfig before other targets
Some targets use the test kernel configuration. Executing defconfig in the same make invocation as those targets results in errors as the configuration may be in an inconsistent state during reconfiguration. Avoid this by introducing ordering dependencies between the defconfig and some other targets. Signed-off-by: Thomas Weißschuh --- tools/testing/selftests/nolibc/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index d3afb71b4c6b7fc51b89f034c826692e76122864..b74fa74e5ce296f032bec76ce9b3f5a3debe2b40 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -217,7 +217,7 @@ all: run sysroot: sysroot/$(ARCH)/include -sysroot/$(ARCH)/include: +sysroot/$(ARCH)/include: | defconfig $(Q)rm -rf sysroot/$(ARCH) sysroot/sysroot $(QUIET_MKDIR)mkdir -p sysroot $(Q)$(MAKE) -C $(srctree) outputmakefile @@ -263,10 +263,10 @@ initramfs: nolibc-test defconfig: $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) -kernel: +kernel: | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -kernel-standalone: initramfs +kernel-standalone: initramfs | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null # run the tests after building the kernel -- 2.48.1
Re: [PATCH net-next v18 20/25] ovpn: implement peer add/get/dump/delete via netlink
2025-01-22, 00:26:50 +0100, Antonio Quartulli wrote: > On 20/01/2025 15:52, Antonio Quartulli wrote: > > On 17/01/2025 12:48, Sabrina Dubroca wrote: > > [...] > > > 8< > > > > > > diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c > > > index 72357bb5f30b..19aa4ee6d468 100644 > > > --- a/drivers/net/ovpn/netlink.c > > > +++ b/drivers/net/ovpn/netlink.c > > > @@ -733,6 +733,9 @@ int ovpn_nl_peer_del_doit(struct sk_buff *skb, > > > struct genl_info *info) > > > netdev_dbg(ovpn->dev, "del peer %u\n", peer->id); > > > ret = ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_USERSPACE); > > > + if (ret >= 0 && peer->sock) > > > + wait_for_completion(&peer->sock_detach); > > > + > > > ovpn_peer_put(peer); > > > return ret; > > > diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c > > > index b032390047fe..6120521d0c32 100644 > > > --- a/drivers/net/ovpn/peer.c > > > +++ b/drivers/net/ovpn/peer.c > > > @@ -92,6 +92,7 @@ struct ovpn_peer *ovpn_peer_new(struct ovpn_priv > > > *ovpn, u32 id) > > > ovpn_peer_stats_init(&peer->vpn_stats); > > > ovpn_peer_stats_init(&peer->link_stats); > > > INIT_WORK(&peer->keepalive_work, ovpn_peer_keepalive_send); > > > + init_completion(&peer->sock_detach); > > > ret = dst_cache_init(&peer->dst_cache, GFP_KERNEL); > > > if (ret < 0) { > > > diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h > > > index 7a062cc5a5a4..8c54bf5709ef 100644 > > > --- a/drivers/net/ovpn/peer.h > > > +++ b/drivers/net/ovpn/peer.h > > > @@ -112,6 +112,7 @@ struct ovpn_peer { > > > struct rcu_head rcu; > > > struct work_struct remove_work; > > > struct work_struct keepalive_work; > > > + struct completion sock_detach; > > > }; > > > /** > > > diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c > > > index a5c3bc834a35..7cefac42c3be 100644 > > > --- a/drivers/net/ovpn/socket.c > > > +++ b/drivers/net/ovpn/socket.c > > > @@ -31,6 +31,8 @@ static void ovpn_socket_release_kref(struct kref *kref) > > > sockfd_put(sock->sock); > > > kfree_rcu(sock, rcu); > > > + > > > + complete(&sock->peer->sock_detach); > > I am moving this line to ovpn_socket_put(), right after kref_put() so that > every peer going through the socket release will get their complete() > invoked. > > If the peer happens to be the last one owning the socket, kref_put() will > first do the detach and only then complete() gets called. > > This requires ovpn_socket_release/put() to take the peer as argument, but > that's ok. > > This way we should achieve what we needed. This seems like it would, thanks. -- Sabrina
Re: [PATCH net-next v18 20/25] ovpn: implement peer add/get/dump/delete via netlink
2025-01-22, 01:40:47 +0100, Antonio Quartulli wrote: > On 17/01/2025 12:48, Sabrina Dubroca wrote: > [...] > > With the delayed socket release (which is similar to what was in v11, > > but now with refcounting on the netdevice which should make > > rtnl_link_unregister in ovpn_cleanup wait [*]), we may return to > > userspace as if the peer was gone, but the socket hasn't been detached > > yet. > > > > A userspace application that tries to remove the peer and immediately > > re-create it with the same socket could get EBUSY if the workqueue > > hasn't done its job yet. That would be quite confusing to the > > application. > > > > So I would add a completion to wait here until the socket has been > > fully detached. Something like below. > > > > [*] I don't think the current refcounting fully protects against that, > > I'll comment on 05/25 > > Sabrina, after the other changes I acknowledged, do you still have comments > for 5/25? The call_rcu vs _put was all I had for this. Note that you have to wait until ~Feb 4th before you can resubmit (since net-next is currently closed). I'll take another look at this revision next week, since I've only checked a few specific things (mainly related to peer and socket destruction) so far. Thanks. -- Sabrina
Re: [PATCH net-next v18 20/25] ovpn: implement peer add/get/dump/delete via netlink
On 22/01/2025 09:51, Sabrina Dubroca wrote: 2025-01-22, 01:40:47 +0100, Antonio Quartulli wrote: On 17/01/2025 12:48, Sabrina Dubroca wrote: [...] With the delayed socket release (which is similar to what was in v11, but now with refcounting on the netdevice which should make rtnl_link_unregister in ovpn_cleanup wait [*]), we may return to userspace as if the peer was gone, but the socket hasn't been detached yet. A userspace application that tries to remove the peer and immediately re-create it with the same socket could get EBUSY if the workqueue hasn't done its job yet. That would be quite confusing to the application. So I would add a completion to wait here until the socket has been fully detached. Something like below. [*] I don't think the current refcounting fully protects against that, I'll comment on 05/25 Sabrina, after the other changes I acknowledged, do you still have comments for 5/25? The call_rcu vs _put was all I had for this. Ok! Note that you have to wait until ~Feb 4th before you can resubmit (since net-next is currently closed). I'll take another look at this revision next week, since I've only checked a few specific things (mainly related to peer and socket destruction) so far. Alright! In the meantime I'll take these days to perform more tests (possibly extending the selftest suite a little bit). If you want to compare this patchset with the very latest code (including your suggestions), you can check the main branch at: https://github.com/openvpn/linux-kernel-ovpn Thanks a lot! Best Regards, -- Antonio Quartulli OpenVPN Inc.