Re: [PATCH bpf-next v6 4/5] bpf: verifier: Support eliding map lookup nullness
On Thu, 2024-12-19 at 21:09 -0700, Daniel Xu wrote: lgtm, but please see a note below. [...] > +/* Returns constant key value if possible, else negative error */ > +static s64 get_constant_map_key(struct bpf_verifier_env *env, > + struct bpf_reg_state *key, > + u32 key_size) > +{ > + struct bpf_func_state *state = func(env, key); > + struct bpf_reg_state *reg; > + int slot, spi, off; > + int spill_size = 0; > + int zero_size = 0; > + int stack_off; > + int i, err; > + u8 *stype; > + > + if (!env->bpf_capable) > + return -EOPNOTSUPP; > + if (key->type != PTR_TO_STACK) > + return -EOPNOTSUPP; > + if (!tnum_is_const(key->var_off)) > + return -EOPNOTSUPP; > + > + stack_off = key->off + key->var_off.value; > + slot = -stack_off - 1; > + spi = slot / BPF_REG_SIZE; > + off = slot % BPF_REG_SIZE; > + stype = state->stack[spi].slot_type; > + > + /* First handle precisely tracked STACK_ZERO */ > + for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--) > + zero_size++; > + if (zero_size >= key_size) > + return 0; > + > + /* Check that stack contains a scalar spill of expected size */ > + if (!is_spilled_scalar_reg(&state->stack[spi])) > + return -EOPNOTSUPP; > + for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--) > + spill_size++; > + if (spill_size != key_size) > + return -EOPNOTSUPP; > + > + reg = &state->stack[spi].spilled_ptr; > + if (!tnum_is_const(reg->var_off)) > + /* Stack value not statically known */ > + return -EOPNOTSUPP; > + > + /* We are relying on a constant value. So mark as precise > + * to prevent pruning on it. > + */ > + bt_set_frame_slot(&env->bt, env->cur_state->curframe, spi); I think env->cur_state->curframe is not always correct here. It should be key->frameno, as key might point a few stack frames up. > + err = mark_chain_precision_batch(env); > + if (err < 0) > + return err; > + > + return reg->var_off.value; > +} > + > static int check_func_arg(struct bpf_verifier_env *env, u32 arg, > struct bpf_call_arg_meta *meta, > const struct bpf_func_proto *fn, [...]
Re: [PATCH v7 2/2] selftests: tmpfs: Add kselftest support to tmpfs
On 1/2/25 03:04, Shivam Chaudhary wrote: Replace direct error handling with 'ksft_test_result_*' macros for better reporting. Test logs: Before change: - Without root error: unshare, errno 1 - With root No, output After change: - Without root TAP version 13 1..1 ok 2 # SKIP This test needs root to run! Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0 - With root TAP version 13 1..1 ok 1 Test : Success Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Shivam Chaudhary --- .../selftests/tmpfs/bug-link-o-tmpfile.c | 28 +-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c index 657b64857e82..290f11a81d2b 100644 --- a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c +++ b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c @@ -41,39 +41,39 @@ int main(void) if (unshare(CLONE_NEWNS) == -1) { if (errno == ENOSYS || errno == EPERM) { - fprintf(stderr, "error: unshare, errno %d\n", errno); - return 4; + ksft_exit_skip("unshare() error: unshare, errno %d\n", errno); + } else { + ksft_exit_fail_msg("unshare() error: unshare, errno %d\n", errno); } - fprintf(stderr, "error: unshare, errno %d\n", errno); - return 1; } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { - fprintf(stderr, "error: mount '/', errno %d\n", errno); - return 1; + ksft_exit_fail_msg("mount() error: Root filesystem private mount: Fail %d\n", errno); } /* Our heroes: 1 root inode, 1 O_TMPFILE inode, 1 permanent inode. */ if (mount(NULL, "/tmp", "tmpfs", 0, "nr_inodes=3") == -1) { - fprintf(stderr, "error: mount tmpfs, errno %d\n", errno); - return 1; + ksft_exit_fail_msg("mount() error: Mounting tmpfs on /tmp: Fail %d\n", errno); } fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); if (fd == -1) { - fprintf(stderr, "error: open 1, errno %d\n", errno); - return 1; + ksft_exit_fail_msg("openat() error: Open first temporary file: Fail %d\n", errno); } + if (linkat(fd, "", AT_FDCWD, "/tmp/1", AT_EMPTY_PATH) == -1) { - fprintf(stderr, "error: linkat, errno %d\n", errno); - return 1; + ksft_exit_fail_msg("linkat() error: Linking the temporary file: Fail %d\n", errno); + /* Ensure fd is closed on failure */ + close(fd); } close(fd); fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); if (fd == -1) { - fprintf(stderr, "error: open 2, errno %d\n", errno); - return 1; + ksft_exit_fail_msg("openat() error: Opening the second temporary file: Fail %d\n", errno); } + ksft_test_result_pass("Test : Success\n"); There is no need to print success here. + ksft_exit_pass(); return 0; } thanks, -- Shuah
Re: [PATCH v5 0/6] vhost: Add support of kthread API
I tested this series v5 with virtio-net regression tests, everything works fine. Tested-by: Lei Yang On Mon, Dec 30, 2024 at 8:46 PM Cindy Lu wrote: > > In commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads"), > the vhost now uses vhost_task and operates as a child of the > owner thread. This aligns with containerization principles. > However, this change has caused confusion for some legacy > userspace applications. Therefore, we are reintroducing > support for the kthread API. > > In this series, a new UAPI is implemented to allow > userspace applications to configure their thread mode. > > Changelog v2: > 1. Change the module_param's name to enforce_inherit_owner, and the default > value is true. > 2. Change the UAPI's name to VHOST_SET_INHERIT_FROM_OWNER. > > Changelog v3: > 1. Change the module_param's name to inherit_owner_default, and the default > value is true. > 2. Add a structure for task function; the worker will select a different > mode based on the value inherit_owner. > 3. device will have their own inherit_owner in struct vhost_dev > 4. Address other comments > > Changelog v4: > 1. remove the module_param, only keep the UAPI > 2. remove the structure for task function; change to use the function > pointer in vhost_worker > 3. fix the issue in vhost_worker_create and vhost_dev_ioctl > 4. Address other comments > > Changelog v5: > 1. Change wakeup and stop function pointers in struct vhost_worker to void. > 2. merging patches 4, 5, 6 in a single patch > 3. Fix spelling issues and address other comments. > > Tested with QEMU with kthread mode/task mode/kthread+task mode > > Cindy Lu (6): > vhost: Add a new parameter in vhost_dev to allow user select kthread > vhost: Add the vhost_worker to support kthread > vhost: Add the cgroup related function > vhost: Add worker related functions to support kthread > vhost: Add new UAPI to support change to task mode > vhost_scsi: Add check for inherit_owner status > > drivers/vhost/scsi.c | 8 ++ > drivers/vhost/vhost.c | 178 + > drivers/vhost/vhost.h | 4 + > include/uapi/linux/vhost.h | 19 > 4 files changed, 192 insertions(+), 17 deletions(-) > > -- > 2.45.0 > >
Re: [PATCH 1/3] remoteproc: k3-r5: Fix checks in k3_r5_rproc_{mbox_callback/kick}
On Tue, Dec 24, 2024 at 02:44:55PM +0530, Beleswar Padhi wrote: > Commit f3f11cfe8907 ("remoteproc: k3-r5: Acquire mailbox handle during > probe routine") introduced a check in the "k3_r5_rproc_mbox_callback()" > and "k3_r5_rproc_kick()" callbacks to exit if the remote core's state > was "RPROC_DETACHED". However, this caused issues in IPC-only mode, as > the default state of the core is set to "RPROC_DETACHED", and the > transition to "RPROC_ATTACHED" happens only after the "__rproc_attach()" > function has invoked "rproc_start_subdevices()". > > The "rproc_start_subdevices()" function triggers the probe of Virtio > RPMsg subdevices, which require the mailbox callbacks to be functional. > To resolve this, a new variable, "is_attach_ongoing", is introduced to > distinguish between core states: when a core is actually detached and > when it is in the process of being attached. The callbacks are updated > to return early only if the core is actually detached and not during an > ongoing attach operation in IPC-only mode. > > Reported-by: Siddharth Vadapalli > Closes: > https://lore.kernel.org/all/20240916083131.2801755-1-s-vadapa...@ti.com/ > Fixes: f3f11cfe8907 ("remoteproc: k3-r5: Acquire mailbox handle during probe > routine") > Signed-off-by: Beleswar Padhi Reviewed-by: Siddharth Vadapalli Regards, Siddharth.
Re: [PATCH 2/3] remoteproc: k3-dsp: Fix checks in k3_dsp_rproc_{mbox_callback/kick}
On Tue, Dec 24, 2024 at 02:44:56PM +0530, Beleswar Padhi wrote: > Commit ea1d6fb5b571 ("remoteproc: k3-dsp: Acquire mailbox handle during > probe routine") introduced a check in the "k3_dsp_rproc_mbox_callback()" > and "k3_dsp_rproc_kick()" callbacks to exit if the remote core's state > was "RPROC_DETACHED". However, this caused issues in IPC-only mode, as > the default state of the core is set to "RPROC_DETACHED", and the > transition to "RPROC_ATTACHED" happens only after the "__rproc_attach()" > function has invoked "rproc_start_subdevices()". > > The "rproc_start_subdevices()" function triggers the probe of Virtio > RPMsg subdevices, which require the mailbox callbacks to be functional. > To resolve this, a new variable, "is_attach_ongoing", is introduced to > distinguish between core states: when a core is actually detached and > when it is in the process of being attached. The callbacks are updated > to return early only if the core is actually detached and not during an > ongoing attach operation in IPC-only mode. > > Reported-by: Siddharth Vadapalli > Closes: > https://lore.kernel.org/all/20240916083131.2801755-1-s-vadapa...@ti.com/ > Fixes: ea1d6fb5b571 ("remoteproc: k3-dsp: Acquire mailbox handle during probe > routine") > Signed-off-by: Beleswar Padhi Reviewed-by: Siddharth Vadapalli Regards, Siddharth.
Re: [PATCH v2 28/28] cfi: Use RCU while invoking __module_address().
Hi Elliot, On Mon, Dec 30, 2024 at 7:33 PM Elliot Berman wrote: > > On Fri, Dec 20, 2024 at 06:41:42PM +0100, Sebastian Andrzej Siewior wrote: > > __module_address() can be invoked within a RCU section, there is no > > requirement to have preemption disabled. > > > > I'm not sure if using rcu_read_lock() will introduce the regression that > > has been fixed in commit 14c4c8e41511a ("cfi: Use > > rcu_read_{un}lock_sched_notrace"). > > > > You can replace the rcu_read_lock_sched_notrace() with guard(rcu)(). > Regular rcu lock doesn't generate function traces, so the recursive loop > isn't possible. > > I've tested: > - the current kernel (no recursive loop) > - Revert back to rcu_read_lock_sched() (fails) Which kernel version did you test? I assume something pre-KCFI as arm64 doesn't use this code since v6.1. > - Your series as-is (no recurisve loop) Note that this patch only adds a comment to is_module_cfi_trap(), so I wouldn't expect a functional change. Sami
Re: [PATCH v7 1/2] selftests: tmpfs: Add Test-skip if not run as root
On 1/2/25 03:04, Shivam Chaudhary wrote: Add 'ksft_exit_skip()', if not run as root, with an appropriate Warning. Add 'ksft_print_header()' and 'ksft_set_plan()' to structure test outputs more effectively. Test logs: Before Change: - Without root error: unshare, errno 1 - With root No, output After change: - Without root TAP version 13 1..1 ok 2 # SKIP This test needs root to run! Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0 - With root TAP version 13 1..1 How are you running this test to see the before ad after results? thanks, -- Shuah
[RFC PATCH RESEND v2 0/2] Add file seal to prevent future exec mappings
* Resending because I accidentally forgot to include Lorenzo in the "to" list. Android uses the ashmem driver [1] for creating shared memory regions between processes. The ashmem driver exposes an ioctl command for processes to restrict the permissions an ashmem buffer can be mapped with. Buffers are created with the ability to be mapped as readable, writable, and executable. Processes remove the ability to map some ashmem buffers as executable to ensure that those buffers cannot be used to inject malicious code for another process to run. Other buffers retain their ability to be mapped as executable, as these buffers can be used for just-in-time (JIT) compilation. So there is a need to be able to remove the ability to map a buffer as executable on a per-buffer basis. Android is currently trying to migrate towards replacing its ashmem driver usage with memfd. Part of the transition involved introducing a library that serves to abstract away how shared memory regions are allocated (i.e. ashmem vs memfd). This allows clients to use a single interface for restricting how a buffer can be mapped without having to worry about how it is handled for ashmem (through the ioctl command mentioned earlier) or memfd (through file seals). While memfd has support for preventing buffers from being mapped as writable beyond a certain point in time (thanks to F_SEAL_FUTURE_WRITE), it does not have a similar interface to prevent buffers from being mapped as executable beyond a certain point. However, that could be implemented as a file seal (F_SEAL_FUTURE_EXEC) which works similarly to F_SEAL_FUTURE_WRITE. F_SEAL_FUTURE_WRITE was chosen as a template for how this new seal should behave, instead of F_SEAL_WRITE, for the following reasons: 1. Having the new seal behave like F_SEAL_FUTURE_WRITE matches the behavior that was present with ashmem. This aids in seamlessly transitioning clients away from ashmem to memfd. 2. Making the new seal behave like F_SEAL_WRITE would mean that no mappings that could become executable in the future (i.e. via mprotect()) can exist when the seal is applied. However, there are known cases (e.g. CursorWindow [2]) where restrictions are applied on how a buffer can be mapped after a mapping has already been made. That mapping may have VM_MAYEXEC set, which would not allow the seal to be applied successfully. Therefore, the F_SEAL_FUTURE_EXEC seal was designed to have the same semantics as F_SEAL_FUTURE_WRITE. Note: this series depends on Lorenzo's work [3], [4], [5] from Andrew Morton's mm-unstable branch [6], which reworks memfd's file seal checks, allowing for newer file seals to be implemented in a cleaner fashion. Changes from v1 ==> v2: - Changed the return code to be -EPERM instead of -EACCES when attempting to map an exec sealed file with PROT_EXEC to align to mmap()'s man page. Thank you Kalesh Singh for spotting this! - Rebased on top of Lorenzo's work to cleanup memfd file seal checks in mmap() ([3], [4], and [5]). Thank you for this Lorenzo! - Changed to deny PROT_EXEC mappings only if the mapping is shared, instead of for both shared and private mappings, after discussing this with Lorenzo. Opens: - Lorenzo brought up that this patch may negatively impact the usage of MFD_NOEXEC_SCOPE_NOEXEC_ENFORCED [7]. However, it is not clear to me why that is the case. At the moment, my intent is for the executable permissions of the file to be disjoint from the ability to create executable mappings. Links: [1] https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/drivers/staging/android/ashmem.c [2] https://developer.android.com/reference/android/database/CursorWindow [3] https://lore.kernel.org/all/cover.1732804776.git.lorenzo.stoa...@oracle.com/ [4] https://lkml.kernel.org/r/20241206212846.210835-1-lorenzo.stoa...@oracle.com [5] https://lkml.kernel.org/r/7dee6c5d-480b-4c24-b98e-6fa47dbd8a23@lucifer.local [6] https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git/tree/?h=mm-unstable [7] https://lore.kernel.org/all/3a53b154-1e46-45fb-a559-65afa7a8a788@lucifer.local/ Links to previous versions: v1: https://lore.kernel.org/all/20241206010930.3871336-1-isaacmanjar...@google.com/ Isaac J. Manjarres (2): mm/memfd: Add support for F_SEAL_FUTURE_EXEC to memfd selftests/memfd: Add tests for F_SEAL_FUTURE_EXEC include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 39 ++- tools/testing/selftests/memfd/memfd_test.c | 79 ++ 3 files changed, 118 insertions(+), 1 deletion(-) -- 2.47.1.613.gc27f4b7a9f-goog
[RFC PATCH RESEND v2 2/2] selftests/memfd: Add tests for F_SEAL_FUTURE_EXEC
Add tests to ensure that F_SEAL_FUTURE_EXEC behaves as expected. Signed-off-by: Isaac J. Manjarres --- tools/testing/selftests/memfd/memfd_test.c | 79 ++ 1 file changed, 79 insertions(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index c0c53451a16d..abc213a5ce99 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -31,6 +31,7 @@ #define STACK_SIZE 65536 #define F_SEAL_EXEC0x0020 +#define F_SEAL_FUTURE_EXEC 0x0040 #define F_WX_SEALS (F_SEAL_SHRINK | \ F_SEAL_GROW | \ @@ -318,6 +319,37 @@ static void *mfd_assert_mmap_private(int fd) return p; } +static void *mfd_fail_mmap_exec(int fd) +{ + void *p; + + p = mmap(NULL, +mfd_def_size, +PROT_EXEC, +MAP_SHARED, +fd, +0); + if (p != MAP_FAILED) { + printf("mmap() didn't fail as expected\n"); + abort(); + } + + return p; +} + +static void mfd_fail_mprotect_exec(void *p) +{ + int ret; + + ret = mprotect(p, + mfd_def_size, + PROT_EXEC); + if (!ret) { + printf("mprotect didn't fail as expected\n"); + abort(); + } +} + static int mfd_assert_open(int fd, int flags, mode_t mode) { char buf[512]; @@ -998,6 +1030,52 @@ static void test_seal_future_write(void) close(fd); } +/* + * Test SEAL_FUTURE_EXEC_MAPPING + * Test whether SEAL_FUTURE_EXEC_MAPPING actually prevents executable mappings. + */ +static void test_seal_future_exec_mapping(void) +{ + int fd; + void *p; + + + printf("%s SEAL-FUTURE-EXEC-MAPPING\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_future_exec_mapping", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + /* +* PROT_READ | PROT_WRITE mappings create VMAs with VM_MAYEXEC set. +* However, F_SEAL_FUTURE_EXEC applies to subsequent mappings, +* so it should still succeed even if this mapping is active when the +* seal is applied. +*/ + p = mfd_assert_mmap_shared(fd); + + mfd_assert_has_seals(fd, 0); + + mfd_assert_add_seals(fd, F_SEAL_FUTURE_EXEC); + mfd_assert_has_seals(fd, F_SEAL_FUTURE_EXEC); + + mfd_fail_mmap_exec(fd); + + munmap(p, mfd_def_size); + + /* Ensure that new mappings without PROT_EXEC work. */ + p = mfd_assert_mmap_shared(fd); + + /* +* Ensure that mappings created after the seal was applied cannot be +* made executable via mprotect(). +*/ + mfd_fail_mprotect_exec(p); + + munmap(p, mfd_def_size); + close(fd); +} + static void test_seal_write_map_read_shared(void) { int fd; @@ -1639,6 +1717,7 @@ int main(int argc, char **argv) test_seal_shrink(); test_seal_grow(); test_seal_resize(); + test_seal_future_exec_mapping(); if (pid_ns_supported()) { test_sysctl_simple(); -- 2.47.1.613.gc27f4b7a9f-goog
[PATCH v3 0/2] fix reading ESP during coredump
Hi, In /proc/PID/stat, there is the kstkesp field which is the stack pointer of a thread. While the thread is active, this field reads zero. But during a coredump, it should have a valid value. However, at the moment, kstkesp is zero even during coredump. The first commit fixes this problem, and the second commit adds a selftest to detect if this problem appears again in the future. v2..v3 https://lore.kernel.org/lkml/cover.1735550994.git.nam...@linutronix.de/ - Move stackdump file to local directory [Kees] - Always cleanup the stackdump file after the test [Kees] - Remove unused empty function v1..v2 https://lore.kernel.org/lkml/cover.1730883229.git.nam...@linutronix.de/ - Change the fix patch to use PF_POSTCOREDUMP [Oleg] Nam Cao (2): fs/proc: do_task_stat: Fix ESP not readable during coredump selftests: coredump: Add stackdump test fs/proc/array.c | 2 +- tools/testing/selftests/coredump/Makefile | 7 + tools/testing/selftests/coredump/README.rst | 50 ++ tools/testing/selftests/coredump/stackdump| 14 ++ .../selftests/coredump/stackdump_test.c | 151 ++ 5 files changed, 223 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/coredump/Makefile create mode 100644 tools/testing/selftests/coredump/README.rst create mode 100755 tools/testing/selftests/coredump/stackdump create mode 100644 tools/testing/selftests/coredump/stackdump_test.c -- 2.39.5
[PATCH v3 1/2] fs/proc: do_task_stat: Fix ESP not readable during coredump
The field "eip" (instruction pointer) and "esp" (stack pointer) of a task can be read from /proc/PID/stat. These fields can be interesting for coredump. However, these fields were disabled by commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat"), because it is generally unsafe to do so. But it is safe for a coredumping process, and therefore exceptions were made: - for a coredumping thread by commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping"). - for all other threads in a coredumping process by commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads"). The above two commits check the PF_DUMPCORE flag to determine a coredump thread and the PF_EXITING flag for the other threads. Unfortunately, commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") moved coredump to happen earlier and before PF_EXITING is set. Thus, checking PF_EXITING is no longer the correct way to determine threads in a coredumping process. Instead of PF_EXITING, use PF_POSTCOREDUMP to determine the other threads. Checking of PF_EXITING was added for coredumping, so it probably can now be removed. But it doesn't hurt to keep. Fixes: 92307383082d ("coredump: Don't perform any cleanups before dumping core") Cc: sta...@vger.kernel.org Cc: Eric W. Biederman Acked-by: Oleg Nesterov Acked-by: Kees Cook Signed-off-by: Nam Cao --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 55ed3510d2bb..d6a0369caa93 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -500,7 +500,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * a program is not able to use ptrace(2) in that case. It is * safe because the task has stopped executing permanently. */ - if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { + if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) { if (try_get_task_stack(task)) { eip = KSTK_EIP(task); esp = KSTK_ESP(task); -- 2.39.5
[PATCH v3 2/2] selftests: coredump: Add stackdump test
Add a test which checks that the kstkesp field in /proc/pid/stat can be read for all threads of a coredumping process. For full details including the motivation for this test and how it works, see the README file added by this commit. Reviewed-by: John Ogness Signed-off-by: Nam Cao --- tools/testing/selftests/coredump/Makefile | 7 + tools/testing/selftests/coredump/README.rst | 50 ++ tools/testing/selftests/coredump/stackdump| 14 ++ .../selftests/coredump/stackdump_test.c | 151 ++ 4 files changed, 222 insertions(+) create mode 100644 tools/testing/selftests/coredump/Makefile create mode 100644 tools/testing/selftests/coredump/README.rst create mode 100755 tools/testing/selftests/coredump/stackdump create mode 100644 tools/testing/selftests/coredump/stackdump_test.c diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile new file mode 100644 index ..ed210037b29d --- /dev/null +++ b/tools/testing/selftests/coredump/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS = $(KHDR_INCLUDES) + +TEST_GEN_PROGS := stackdump_test +TEST_FILES := stackdump + +include ../lib.mk diff --git a/tools/testing/selftests/coredump/README.rst b/tools/testing/selftests/coredump/README.rst new file mode 100644 index ..164a7aa181c8 --- /dev/null +++ b/tools/testing/selftests/coredump/README.rst @@ -0,0 +1,50 @@ +coredump selftest += + +Background context +-- + +`coredump` is a feature which dumps a process's memory space when the process terminates +unexpectedly (e.g. due to segmentation fault), which can be useful for debugging. By default, +`coredump` dumps the memory to the file named `core`, but this behavior can be changed by writing a +different file name to `/proc/sys/kernel/core_pattern`. Furthermore, `coredump` can be piped to a +user-space program by writing the pipe symbol (`|`) followed by the command to be executed to +`/proc/sys/kernel/core_pattern`. For the full description, see `man 5 core`. + +The piped user program may be interested in reading the stack pointers of the crashed process. The +crashed process's stack pointers can be read from `procfs`: it is the `kstkesp` field in +`/proc/$PID/stat`. See `man 5 proc` for all the details. + +The problem +--- +While a thread is active, the stack pointer is unsafe to read and therefore the `kstkesp` field +reads zero. But when the thread is dead (e.g. during a coredump), this field should have valid +value. + +However, this was broken in the past and `kstkesp` was zero even during coredump: + +* commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") changed kstkesp to + always be zero + +* commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping") fixed it for the + coredumping thread. However, other threads in a coredumping process still had the problem. + +* commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads") fixed + for all threads in a coredumping process. + +* commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") broke it again + for the other threads in a coredumping process. + +The problem has been fixed now, but considering the history, it may appear again in the future. + +The goal of this test +- +This test detects problem with reading `kstkesp` during coredump by doing the following: + +#. Tell the kernel to execute the "stackdump" script when a coredump happens. This script + reads the stack pointers of all threads of crashed processes. + +#. Spawn a child process who creates some threads and then crashes. + +#. Read the output from the "stackdump" script, and make sure all stack pointer values are + non-zero. diff --git a/tools/testing/selftests/coredump/stackdump b/tools/testing/selftests/coredump/stackdump new file mode 100755 index ..96714ce42d12 --- /dev/null +++ b/tools/testing/selftests/coredump/stackdump @@ -0,0 +1,14 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +CRASH_PROGRAM_ID=$1 +STACKDUMP_FILE=$2 + +TMP=$(mktemp) + +for t in /proc/$CRASH_PROGRAM_ID/task/*; do + tid=$(basename $t) + cat /proc/$tid/stat | awk '{print $29}' >> $TMP +done + +mv $TMP $STACKDUMP_FILE diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c new file mode 100644 index ..137b2364a082 --- /dev/null +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define STACKDUMP_FILE "stack_values" +#define STACKDUMP_SCRIPT "stackdump" +#define NUM_THREAD_SPAWN 128 + +static void *do_nothing(void *) +{ + while (1) + pause(); +} + +static void crashi
[RFC PATCH RESEND v2 1/2] mm/memfd: Add support for F_SEAL_FUTURE_EXEC to memfd
Android currently uses the ashmem driver [1] for creating shared memory regions between processes. Ashmem buffers can initially be mapped with PROT_READ, PROT_WRITE, and PROT_EXEC. Processes can then use the ASHMEM_SET_PROT_MASK ioctl command to restrict--never add--the permissions that the buffer can be mapped with. Processes can remove the ability to map ashmem buffers as executable to ensure that those buffers cannot be exploited to run unintended code. For instance, suppose process A allocates a memfd that is meant to be read and written by itself and another process, call it B. Process A shares the buffer with process B, but process B injects code into the buffer, and compromises process A, such that it makes A map the buffer with PROT_EXEC. This provides an opportunity for process A to run the code that process B injected into the buffer. If process A had the ability to seal the buffer against future executable mappings before sharing the buffer with process B, this attack would not be possible. Android is currently trying to replace ashmem with memfd. However, memfd does not have a provision to permanently remove the ability to map a buffer as executable, and leaves itself open to the type of attack described earlier. However, this should be something that can be achieved via a new file seal. There are known usecases (e.g. CursorWindow [2]) where a process maps a buffer with read/write permissions before restricting the buffer to being mapped as read-only for future mappings. The resulting VMA from the writable mapping has VM_MAYEXEC set, meaning that mprotect() can change the mapping to be executable. Therefore, implementing the seal similar to F_SEAL_WRITE would not be appropriate, since it would not work with the CursorWindow usecase. This is because the CursorWindow process restricts the mapping permissions to read-only after the writable mapping is created. So, adding a file seal for executable mappings that operates like F_SEAL_WRITE would fail. Therefore, add support for F_SEAL_FUTURE_EXEC, which is handled similarly to F_SEAL_FUTURE_WRITE. This ensures that CursorWindow can continue to create a writable mapping initially, and then restrict the permissions on the buffer to be mappable as read-only by using both F_SEAL_FUTURE_WRITE and F_SEAL_FUTURE_EXEC. After the seal is applied, any calls to mmap() with PROT_EXEC will fail. [1] https://cs.android.com/android/kernel/superproject/+/common-android-mainline:common/drivers/staging/android/ashmem.c [2] https://developer.android.com/reference/android/database/CursorWindow Signed-off-by: Isaac J. Manjarres --- include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 39 +- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6e6907e63bfc..ef066e524777 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -49,6 +49,7 @@ #define F_SEAL_WRITE 0x0008 /* prevent writes */ #define F_SEAL_FUTURE_WRITE0x0010 /* prevent future writes while mapped */ #define F_SEAL_EXEC0x0020 /* prevent chmod modifying exec bits */ +#define F_SEAL_FUTURE_EXEC 0x0040 /* prevent future executable mappings */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/mm/memfd.c b/mm/memfd.c index 5f5a23c9051d..cfd62454df5e 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -184,6 +184,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) } #define F_ALL_SEALS (F_SEAL_SEAL | \ +F_SEAL_FUTURE_EXEC |\ F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ @@ -357,14 +358,50 @@ static int check_write_seal(unsigned long *vm_flags_ptr) return 0; } +static inline bool is_exec_sealed(unsigned int seals) +{ + return seals & F_SEAL_FUTURE_EXEC; +} + +static int check_exec_seal(unsigned long *vm_flags_ptr) +{ + unsigned long vm_flags = *vm_flags_ptr; + unsigned long mask = vm_flags & (VM_SHARED | VM_EXEC); + + /* Executability is not a concern for private mappings. */ + if (!(mask & VM_SHARED)) + return 0; + + /* +* New PROT_EXEC and MAP_SHARED mmaps are not allowed when exec seal +* is active. +*/ + if (mask & VM_EXEC) + return -EPERM; + + /* +* Prevent mprotect() from making an exec-sealed mapping executable in +* the future. +*/ + *vm_flags_ptr &= ~VM_MAYEXEC; + + return 0; +} + int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); unsigned int seals = seals_ptr ? *seals_ptr : 0; - if (is_write_sealed(seals)) + if (is_write_sealed(seals)) { err = check_write_seal(vm_flags_ptr); + if (err) + return
Re: [PATCH v2 28/28] cfi: Use RCU while invoking __module_address().
Hi Petr, On Mon, Dec 30, 2024 at 1:13 PM Petr Pavlu wrote: > > On 12/20/24 18:41, Sebastian Andrzej Siewior wrote: > > __module_address() can be invoked within a RCU section, there is no > > requirement to have preemption disabled. > > > > I'm not sure if using rcu_read_lock() will introduce the regression that > > has been fixed in commit 14c4c8e41511a ("cfi: Use > > rcu_read_{un}lock_sched_notrace"). > > > > Cc: Elliot Berman > > Cc: Kees Cook > > Cc: Nathan Chancellor > > Cc: Sami Tolvanen > > Cc: Steven Rostedt > > Cc: l...@lists.linux.dev > > Signed-off-by: Sebastian Andrzej Siewior > > --- > > kernel/cfi.c | 4 > > 1 file changed, 4 insertions(+) > > > > diff --git a/kernel/cfi.c b/kernel/cfi.c > > index 08caad7767176..c8f2b5a51b2e6 100644 > > --- a/kernel/cfi.c > > +++ b/kernel/cfi.c > > @@ -71,6 +71,10 @@ static bool is_module_cfi_trap(unsigned long addr) > > struct module *mod; > > bool found = false; > > > > + /* > > + * XXX this could be RCU protected but would it introcude the > > regression > > + * fixed in 14c4c8e41511a ("cfi: Use rcu_read_{un}lock_sched_notrace") > > + */ > > rcu_read_lock_sched_notrace(); > > > > mod = __module_address(addr); > > I think that since 89245600941e ("cfi: Switch to -fsanitize=kcfi"), this > can be a call to rcu_read_lock_sched(), or in your case rcu_read_lock(). > The recursive case where __cfi_slowpath_diag() could end up calling > itself is no longer present, as all that logic is gone. I then don't see > another reason this should use the notrace variant. > > @Sami, could you please confirm this? Switching is_module_cfi_trap() to use rcu_read_lock() in this series should be fine. KCFI checks don't perform function calls, so there's no risk of recursion, and this function is only called during the error handling path. Sami
Re: watchdog: BUG: soft lockup in note_gp_changes in kernel/rcu/tree.c
On Thu, Jan 02, 2025 at 10:59:27AM +0800, Kun Hu wrote: > Hello, > > When using our customed fuzzer tool to fuzz the latest Linux kernel, the > following crash > was triggered. > > HEAD commit: dbfac60febfa806abb2d384cb6441e77335d2799 > git tree: upstream > Console output: > https://drive.google.com/file/d/1D3EDxDxPi0t7m_Z4Uc4FuL26DnHs7yTa/view?usp=sharing > Kernel config: > https://drive.google.com/file/d/1m1mk_YusR-tyusNHFuRbzdj8KUzhkeHC/view?usp=sharing > C reproducer: / > Syzlang reproducer: / > > We observed a crash at line 1333 in note_gp_changes, likely caused by a race > condition involving rcu_gp_kthread_wake and note_gp_changes. The issue > appears to involve insufficient or incorrect synchronization, as indicated by > the involvement of _raw_spin_unlock_irqrestore in spinlock.c. Specifically, > this may lead to invalid accesses to rcu_state.gp_kthread or related flags > (e.g., gp_flags), potentially resulting in unexpected behavior in > swake_up_one_online. > > Could you please help check if this needs to be addressed? This is a new one on me. This is running in a guest OS. Might the underlying hypervisor be overloaded? That could result in vCPU preemption and thus in this sort of soft lockup. Also, when I check out the above commit (which is v6.13-rc4), I find that line 1333 is the close curly brace of note_gp_changes(). Of course, it is possible that the address-to-symbol translation failed (please check!), but in the absence of such failure, there is no way that I know of that incorrect synchronization could cause a soft lockup at that location. Other things besides vCPU preemption that could cause a soft lockup at that location include corrupted kernel text, corrupted kernel stack, and incessant interrupts. Other thoughts? Thanx, Paul > If you fix this issue, please add the following tag to the commit: > Reported-by: Kun Hu , Jiaji Qin > > -- > watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [syz.0.745:6431] > Modules linked in: > irq event stamp: 312081 > hardirqs last enabled at (312080): [] > irqentry_exit+0x3b/0x90 kernel/entry/common.c:357 > hardirqs last disabled at (312081): [] > sysvec_apic_timer_interrupt+0xf/0xb0 arch/x86/kernel/apic/apic.c:1049 > softirqs last enabled at (312032): [] softirq_handle_end > kernel/softirq.c:407 [inline] > softirqs last enabled at (312032): [] > handle_softirqs+0x544/0x870 kernel/softirq.c:589 > softirqs last disabled at (312035): [] __do_softirq > kernel/softirq.c:595 [inline] > softirqs last disabled at (312035): [] invoke_softirq > kernel/softirq.c:435 [inline] > softirqs last disabled at (312035): [] __irq_exit_rcu > kernel/softirq.c:662 [inline] > softirqs last disabled at (312035): [] > irq_exit_rcu+0xee/0x140 kernel/softirq.c:678 > CPU: 1 UID: 0 PID: 6431 Comm: syz.0.745 Not tainted 6.13.0-rc4 #1 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 > 04/01/2014 > RIP: 0010:note_gp_changes+0xd8/0x1d0 kernel/rcu/tree.c:1333 > Code: c0 0f 85 d4 00 00 00 0f b6 45 14 84 c0 75 27 48 85 db 0f 85 a5 00 00 00 > 9c 58 f6 c4 02 0f 85 ae 00 00 00 48 85 db 74 01 fb 5b <5d> 41 5c 41 5d 41 5e > e9 47 59 ff 07 4c 89 e7 e8 04 bb de 07 85 c0 > RSP: 0018:ffa0001e8e38 EFLAGS: 0206 > RAX: 0002 RBX: ff116a2bd240 RCX: 11eb7361 > RDX: RSI: 0101 RDI: > RBP: ff116a2bd240 R08: 0001 R09: 0001 > R10: fbfff1eb7aaa R11: 8f5bd557 R12: 8d726380 > R13: 0202 R14: 000296e9 R15: ff1118c8 > FS: () GS:ff116a28() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 7f69243b2260 CR3: 336f0002 CR4: 00771ef0 > PKRU: 8000 > Call Trace: > > rcu_check_quiescent_state kernel/rcu/tree.c:2451 [inline] > rcu_core+0x18c/0x16c0 kernel/rcu/tree.c:2807 > handle_softirqs+0x1ad/0x870 kernel/softirq.c:561 > __do_softirq kernel/softirq.c:595 [inline] > invoke_softirq kernel/softirq.c:435 [inline] > __irq_exit_rcu kernel/softirq.c:662 [inline] > irq_exit_rcu+0xee/0x140 kernel/softirq.c:678 > instr_sysvec_irq_work arch/x86/kernel/irq_work.c:17 [inline] > sysvec_irq_work+0xca/0xf0 arch/x86/kernel/irq_work.c:17 > > > asm_sysvec_irq_work+0x1a/0x20 arch/x86/include/asm/idtentry.h:738 > RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 > [inline] > RIP: 0010:_raw_spin_unlock_irqrestore+0x3c/0x70 kernel/locking/spinlock.c:194 > Code: 74 24 10 e8 16 b5 17 f8 48 89 ef e8 9e 35 18 f8 81 e3 00 02 00 00 75 29 > 9c 58 f6 c4 02 75 35 48 85 db 74 01 fb bf 01 00 00 00 8f 53 0c f8 65 8b > 05 50 0c 14 75 85 c0 74 0e 5b 5d e9 c8 9b 20 > RSP: 0018:ffa003e07938 EFLAGS: 0206 > RAX: 0006 RBX: 0200 RCX: 11eb7361 > RDX: RSI:
Re: [PATCH for-next v2] selftests/Makefile: override the srctree for out-of-tree builds
2024-12-23 16:33 UTC+0800 ~ Li Zhijian > Fixes an issue where out-of-tree kselftest builds fail when building > the BPF and bpftools components. The failure occurs because the top-level > Makefile passes a relative srctree path to its sub-Makefiles, which > leads to errors in locating necessary files. > > For example, the following error is encountered: > > ``` > $ make V=1 O=$build/ TARGETS=hid kselftest-all > ... > make -C ../tools/testing/selftests all > make[4]: Entering directory '/path/to/linux/tools/testing/selftests/hid' > make -C /path/to/linux/tools/testing/selftests/../../../tools/lib/bpf > OUTPUT=/path/to/linux/O/kselftest/hid/tools/build/libbpf/ \ > EXTRA_CFLAGS='-g -O0' \ > DESTDIR=/path/to/linux/O/kselftest/hid/tools prefix= all > install_headers > make[5]: Entering directory '/path/to/linux/tools/lib/bpf' > ... > make[5]: Entering directory '/path/to/linux/tools/bpf/bpftool' > Makefile:127: ../tools/build/Makefile.feature: No such file or directory > make[5]: *** No rule to make target '../tools/build/Makefile.feature'. Stop. > ``` > > To resolve this, override the srctree in the kselftests's top Makefile > when performing an out-of-tree build. This ensures that all sub-Makefiles > have the correct path to the source tree, preventing directory resolution > errors. > > Signed-off-by: Li Zhijian I simply tested with "make V=1 O=build/ TARGETS=hid kselftest-all", this approach fixes the build just as well as v1. Thanks! Tested-by: Quentin Monnet
Re: [PATCH v4 2/2] livepatch: Convert timeouts to secs_to_jiffies()
On Wed 2024-12-18 09:35:46, Easwar Hariharan wrote: > On 12/18/2024 12:48 AM, Christophe Leroy wrote: > > > > > > Le 18/12/2024 à 09:38, Petr Mladek a écrit : > >> On Tue 2024-12-17 23:09:59, Easwar Hariharan wrote: > >>> Commit b35108a51cf7 ("jiffies: Define secs_to_jiffies()") introduced > >>> secs_to_jiffies(). As the value here is a multiple of 1000, use > >>> secs_to_jiffies() instead of msecs_to_jiffies to avoid the > >>> multiplication. > >>> > >>> This is converted using scripts/coccinelle/misc/secs_to_jiffies.cocci > >>> with > >>> the following Coccinelle rules: > >>> > >>> @@ constant C; @@ > >>> > >>> - msecs_to_jiffies(C * 1000) > >>> + secs_to_jiffies(C) > >>> > >>> @@ constant C; @@ > >>> > >>> - msecs_to_jiffies(C * MSEC_PER_SEC) > >>> + secs_to_jiffies(C) > >>> > >>> While here, replace the schedule_delayed_work() call with a 0 timeout > >>> with an immediate schedule_work() call. > >>> > >>> --- a/samples/livepatch/livepatch-callbacks-busymod.c > >>> +++ b/samples/livepatch/livepatch-callbacks-busymod.c > >>> @@ -44,8 +44,7 @@ static void busymod_work_func(struct work_struct > >>> *work) > >>> static int livepatch_callbacks_mod_init(void) > >>> { > >>> pr_info("%s\n", __func__); > >>> - schedule_delayed_work(&work, > >>> - msecs_to_jiffies(1000 * 0)); > >>> + schedule_work(&work); > >> > >> Is it safe to use schedule_work() for struct delayed_work? > > > > Should be, but you are right it should then be a standard work not a > > delayed work. > > > > So probably the easiest is to keep > > > > schedule_delayed_work(&work, 0) > > > > And eventually changing it to a not delayed work could be a follow-up > > patch. > > > >> > > Thanks for the catch, Petr! This suggestion would effectively revert > this patch to the v3 version, albeit with some extra explanation in the > commit message. I'd propose just keeping the v3 in the next branch where > it is. > > Andrew, Petr, Christophe, what do you think? I am fine with keeping v3 in next. Best Regards, Petr
Re: [PATCH] sev-snp: parse MP tables for VMware hypervisor
On Thu, Dec 26, 2024 at 9:26 PM Kevin Loughlin wrote: > > On Thu, Dec 19, 2024 at 6:44 AM Ajay Kaher wrote: > > > > For VMware hypervisor, SEV-SNP enabled VM's could boot without UEFI. > > In this case, mpparse_find_mptable() has to be called to parse MP > > tables which contains boot information. > > > > Fixes: 0f4a1e80989a ("x86/sev: Skip ROM range scans and validation for > > SEV-SNP guests") > > Signed-off-by: Ajay Kaher > > Signed-off-by: Ye Li > > Tested-by: Ye Li > > --- > > arch/x86/kernel/cpu/vmware.c | 7 +++ > > 1 file changed, 7 insertions(+) > > > > diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c > > index 00189cd..3e2594d 100644 > > --- a/arch/x86/kernel/cpu/vmware.c > > +++ b/arch/x86/kernel/cpu/vmware.c > > @@ -26,6 +26,7 @@ > > #include > > #include > > #include > > +#include > > #include > > #include > > #include > > @@ -35,6 +36,8 @@ > > #include > > #include > > #include > > +#include > > +#include > > > > #undef pr_fmt > > #define pr_fmt(fmt)"vmware: " fmt > > @@ -429,6 +432,10 @@ static void __init vmware_platform_setup(void) > > pr_warn("Failed to get TSC freq from the hypervisor\n"); > > } > > > > + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED && > > + !efi_enabled(EFI_BOOT)) > > + x86_init.mpparse.find_mptable = mpparse_find_mptable; > > As far I know, Linux itself currently doesn't PVALIDATE the address > ranges scanned in mpparse_find_mptable(), and Linux accesses these > addresses as encrypted during early boot. Given this, am I correct > that the guest firmware that you're using is doing the PVALIDATE of > these ranges before starting Linux (else there would be a crash upon > scan)? And then presumably the firmware is also making sure that this > memory is encrypted so that Linux isn't reading unencrypted data as > encrypted (i.e., garbage)? Yes, Linux (as a guest) receives valid data. > If so, does that mean all the ROM region scans removed in [0] are > permissible for SEV-SNP guests booting on whichever guest firmware > this is? But you only want/need the mptable info here? Yes, VMware firmware validates MPTABLE info when 'non-EFI guest boots on VMware Hypervisor'. The other ROM regions removed in [0] were not encrypted and validated. This is a specific use case for the VMware platform. Linux today assumes SNP VMs will be booted with UEFI. > [0] 0f4a1e80989a ("x86/sev: Skip ROM range scans and validation for > SEV-SNP guests") > > More broadly, ideally the guest firmware would communicate to Linux > that these ranges are safe to access (perhaps via the e820 table), > rather than Linux making the assumption that the ranges are safe for > non-EFI SEV-SNP guest boots in this scenario. However, since you're > only changing vmware_platform_setup() for such boots, I don't think we > need to hold up this patch on that generality. This is a VMware specific scenario. VMware Hypervisor makes the MPTABLE memory available in e820 when 'non-EFI guest boots on VMware Hypervisor'. So it makes sense to do the changes only for vmware_platform_setup(). - Ajay smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH v7 02/16] rust: implement generic driver registration
Hi Gary, On Tue, Dec 24, 2024 at 07:58:21PM +, Gary Guo wrote: > On Thu, 19 Dec 2024 18:04:04 +0100 > Danilo Krummrich wrote: > > > Implement the generic `Registration` type and the `RegistrationOps` > > trait. > > > > The `Registration` structure is the common type that represents a driver > > registration and is typically bound to the lifetime of a module. However, > > it doesn't implement actual calls to the kernel's driver core to register > > drivers itself. > > > > Instead the `RegistrationOps` trait is provided to subsystems, which have > > to implement `RegistrationOps::register` and > > `RegistrationOps::unregister`. Subsystems have to provide an > > implementation for both of those methods where the subsystem specific > > variants to register / unregister a driver have to implemented. > > > > For instance, the PCI subsystem would call __pci_register_driver() from > > `RegistrationOps::register` and pci_unregister_driver() from > > `DrvierOps::unregister`. > > > > Co-developed-by: Wedson Almeida Filho > > Signed-off-by: Wedson Almeida Filho > > Signed-off-by: Danilo Krummrich > > Hi Danilo, > > I think there're soundness issues with this API, please see comments > inlined below. Just in case you did not note, this series has been applied to the driver-core tree already. > > Best, > Gary > > > --- > > MAINTAINERS | 1 + > > rust/kernel/driver.rs | 117 ++ > > rust/kernel/lib.rs| 1 + > > 3 files changed, 119 insertions(+) > > create mode 100644 rust/kernel/driver.rs > > > > diff --git a/MAINTAINERS b/MAINTAINERS > > index baf0eeb9a355..2ad58ed40079 100644 > > --- a/MAINTAINERS > > +++ b/MAINTAINERS > > @@ -7033,6 +7033,7 @@ F:include/linux/kobj* > > F: include/linux/property.h > > F: lib/kobj* > > F: rust/kernel/device.rs > > +F: rust/kernel/driver.rs > > > > DRIVERS FOR OMAP ADAPTIVE VOLTAGE SCALING (AVS) > > M: Nishanth Menon > > diff --git a/rust/kernel/driver.rs b/rust/kernel/driver.rs > > new file mode 100644 > > index ..c1957ee7bb7e > > --- /dev/null > > +++ b/rust/kernel/driver.rs > > @@ -0,0 +1,117 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > + > > +//! Generic support for drivers of different buses (e.g., PCI, Platform, > > Amba, etc.). > > +//! > > +//! Each bus / subsystem is expected to implement [`RegistrationOps`], > > which allows drivers to > > +//! register using the [`Registration`] class. > > + > > +use crate::error::{Error, Result}; > > +use crate::{init::PinInit, str::CStr, try_pin_init, types::Opaque, > > ThisModule}; > > +use core::pin::Pin; > > +use macros::{pin_data, pinned_drop}; > > + > > +/// The [`RegistrationOps`] trait serves as generic interface for > > subsystems (e.g., PCI, Platform, > > +/// Amba, etc.) to provide the corresponding subsystem specific > > implementation to register / > > +/// unregister a driver of the particular type (`RegType`). > > +/// > > +/// For instance, the PCI subsystem would set `RegType` to > > `bindings::pci_driver` and call > > +/// `bindings::__pci_register_driver` from `RegistrationOps::register` and > > +/// `bindings::pci_unregister_driver` from `RegistrationOps::unregister`. > > +pub trait RegistrationOps { > > +/// The type that holds information about the registration. This is > > typically a struct defined > > +/// by the C portion of the kernel. > > +type RegType: Default; > > + > > +/// Registers a driver. > > +/// > > +/// On success, `reg` must remain pinned and valid until the matching > > call to > > +/// [`RegistrationOps::unregister`]. > > This looks like an obligation for the caller, so this function should > be unsafe? > > > +fn register( > > +reg: &Opaque, > > +name: &'static CStr, > > +module: &'static ThisModule, > > +) -> Result; > > + > > +/// Unregisters a driver previously registered with > > [`RegistrationOps::register`]. > > Similarly this is an obligation for the caller. I think you're right. I didn't want this to be unsafe, and when I got rid of the raw pointer, and hence removed the unsafe, I did miss the fact you point out here. It's a bit unfortunate, especially, since `register` and `unregister` are only ever called from this file. I'll send a patch to fix it up, unless you want to send one yourself. > > > +fn unregister(reg: &Opaque); > > +} > > + > > +/// A [`Registration`] is a generic type that represents the registration > > of some driver type (e.g. > > +/// `bindings::pci_driver`). Therefore a [`Registration`] must be > > initialized with a type that > > +/// implements the [`RegistrationOps`] trait, such that the generic > > `T::register` and > > +/// `T::unregister` calls result in the subsystem specific registration > > calls. > > +/// > > +///Once the `Registration` structure is dropped, the driver is > > unregistered. > > +#[pin_data(PinnedDrop)] > > +pub struct Registration { > > +#[pin] > >
Re: [PATCH v7 02/18] gendwarfksyms: Add address matching
Hi, On Sat, Dec 28, 2024 at 4:29 PM Masahiro Yamada wrote: > > On Fri, Dec 20, 2024 at 6:07 AM Sami Tolvanen wrote: > > > > diff --git a/scripts/gendwarfksyms/symbols.c > > b/scripts/gendwarfksyms/symbols.c > > index 7adf2ed9b89b..98febb524dd5 100644 > > --- a/scripts/gendwarfksyms/symbols.c > > +++ b/scripts/gendwarfksyms/symbols.c > > @@ -6,8 +6,39 @@ > > #include "gendwarfksyms.h" > > > > #define SYMBOL_HASH_BITS 12 > > + > > +/* struct symbol_addr -> struct symbol */ > > +static HASHTABLE_DEFINE(symbol_addrs, 1 << SYMBOL_HASH_BITS); > > +/* name -> struct symbol */ > > I think this comment addition should belong to 01/18 instead of 02/18. Yeah, agreed. I'll send out v8 with these comments addressed. Sami
Re: [PATCH v7 01/18] tools: Add gendwarfksyms
Hi Masahiro, On Fri, Dec 27, 2024 at 6:02 AM Masahiro Yamada wrote: > > On Fri, Dec 20, 2024 at 6:07 AM Sami Tolvanen wrote: > > > +int main(int argc, char **argv) > > +{ > > + unsigned int n; > > + int opt; > > + > > + struct option opts[] = { { "debug", 0, NULL, 'd' }, > > You can add "static const" to opts[] here. > > I like the array initializer formatted like follows: > > static const struct option opts[] = { > { "debug", 0, NULL, 'd' }, > { "dump-dies", 0, &dump_dies, 1 }, > { "help", 0, NULL, 'h' }, > { 0, 0, NULL, 0 } > }; Sure, I'll change this. Sami
Re: [PATCH bpf-next v6 5/5] bpf: selftests: verifier: Add nullness elision tests
On Thu, 2024-12-19 at 21:09 -0700, Daniel Xu wrote: > Test that nullness elision works for common use cases. For example, we > want to check that both constant scalar spills and STACK_ZERO functions. > As well as when there's both const and non-const values of R2 leading up > to a lookup. And obviously some bound checks. > > Particularly tricky are spills both smaller or larger than key size. For > smaller, we need to ensure verifier doesn't let through a potential read > into unchecked bytes. For larger, endianness comes into play, as the > native endian value tracked in the verifier may not be the bytes the > kernel would have read out of the key pointer. So check that we disallow > both. > > Signed-off-by: Daniel Xu > --- Nit: a few cases are omitted from these tests: 1. a key is not a pointer to stack; 2. a key is not a constant pointer; 3. value on stack is not a spill; 4. precision mark for stack value; I think (1) and (4) are worth testing. For (4) I think matching log messages about marking stack slot precise should be sufficient (register and slot index numbers are not guaranteed since tests are written in C, but regular expressions could be used in __msg using '{{' '}}' brackets, e.g.: '{{r[0-9]}}'. An alternative would be to write a test case where precision mark matters (a branch before map lookup with invalid key value used in a branch that is verified second). Such test would have to be written in inline assembly to guarantee code verification order. This might be an overkill. [...]
[PATCH] rseq/selftests: Fix riscv rseq_offset_deref_addv inline asm
When working on OpenRISC support for restartable sequences I noticed and fixed these two issues with the riscv support bits. 1 The 'inc' argument to RSEQ_ASM_OP_R_DEREF_ADDV was being implicitly passed to the macro. Fix this by adding 'inc' to the list of macro arguments. 2 The inline asm input constraints for 'inc' and 'off' use "er", The riscv gcc port does not have an "e" constraint, this looks to be copied from the x86 port. Fix this by just using an "r" constraint. I have compile tested this only for riscv. However, the same fixes I use in the OpenRISC rseq selftests and everything passes with no issues. Signed-off-by: Stafford Horne --- tools/testing/selftests/rseq/rseq-riscv-bits.h | 6 +++--- tools/testing/selftests/rseq/rseq-riscv.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq-riscv-bits.h b/tools/testing/selftests/rseq/rseq-riscv-bits.h index de31a0143139..f02f411d550d 100644 --- a/tools/testing/selftests/rseq/rseq-riscv-bits.h +++ b/tools/testing/selftests/rseq/rseq-riscv-bits.h @@ -243,7 +243,7 @@ int RSEQ_TEMPLATE_IDENTIFIER(rseq_offset_deref_addv)(intptr_t *ptr, off_t off, i #ifdef RSEQ_COMPARE_TWICE RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]") #endif - RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, 3) + RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, inc, 3) RSEQ_INJECT_ASM(4) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ @@ -251,8 +251,8 @@ int RSEQ_TEMPLATE_IDENTIFIER(rseq_offset_deref_addv)(intptr_t *ptr, off_t off, i [current_cpu_id]"m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD), [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [ptr] "r" (ptr), - [off] "er" (off), - [inc] "er" (inc) + [off] "r" (off), + [inc] "r" (inc) RSEQ_INJECT_INPUT : "memory", RSEQ_ASM_TMP_REG_1 RSEQ_INJECT_CLOBBER diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h index 37e598d0a365..67d544aaa9a3 100644 --- a/tools/testing/selftests/rseq/rseq-riscv.h +++ b/tools/testing/selftests/rseq/rseq-riscv.h @@ -158,7 +158,7 @@ do { \ "bnez " RSEQ_ASM_TMP_REG_1 ", 222b\n" \ "333:\n" -#define RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, post_commit_label) \ +#define RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, inc, post_commit_label) \ "mv " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(ptr) "]\n" \ RSEQ_ASM_OP_R_ADD(off) \ REG_L RSEQ_ASM_TMP_REG_1 ", 0(" RSEQ_ASM_TMP_REG_1 ")\n"\ -- 2.47.0
Re: [PATCH v7 04/16] rust: add rcu abstraction
On Tue, Dec 24, 2024 at 08:54:50PM +, Gary Guo wrote: > On Thu, 19 Dec 2024 18:04:06 +0100 > Danilo Krummrich wrote: > > > From: Wedson Almeida Filho > > > > Add a simple abstraction to guard critical code sections with an rcu > > read lock. > > > > Reviewed-by: Boqun Feng > > Signed-off-by: Wedson Almeida Filho > > Co-developed-by: Danilo Krummrich > > Signed-off-by: Danilo Krummrich > > --- > > MAINTAINERS | 1 + > > rust/helpers/helpers.c | 1 + > > rust/helpers/rcu.c | 13 > > rust/kernel/sync.rs | 1 + > > rust/kernel/sync/rcu.rs | 47 + > > 5 files changed, 63 insertions(+) > > create mode 100644 rust/helpers/rcu.c > > create mode 100644 rust/kernel/sync/rcu.rs > > [resend to the list] > > > > > diff --git a/MAINTAINERS b/MAINTAINERS > > index 3cfb68650347..0cc69e282889 100644 > > --- a/MAINTAINERS > > +++ b/MAINTAINERS > > @@ -19690,6 +19690,7 @@ T: git > > git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev > > F: Documentation/RCU/ > > F: include/linux/rcu* > > F: kernel/rcu/ > > +F: rust/kernel/sync/rcu.rs > > X: Documentation/RCU/torture.rst > > X: include/linux/srcu*.h > > X: kernel/rcu/srcu*.c > > diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c > > index dcf827a61b52..060750af6524 100644 > > --- a/rust/helpers/helpers.c > > +++ b/rust/helpers/helpers.c > > @@ -20,6 +20,7 @@ > > #include "page.c" > > #include "pid_namespace.c" > > #include "rbtree.c" > > +#include "rcu.c" > > #include "refcount.c" > > #include "security.c" > > #include "signal.c" > > diff --git a/rust/helpers/rcu.c b/rust/helpers/rcu.c > > new file mode 100644 > > index ..f1cec6583513 > > --- /dev/null > > +++ b/rust/helpers/rcu.c > > @@ -0,0 +1,13 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > + > > +#include > > + > > +void rust_helper_rcu_read_lock(void) > > +{ > > + rcu_read_lock(); > > +} > > + > > +void rust_helper_rcu_read_unlock(void) > > +{ > > + rcu_read_unlock(); > > +} > > diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs > > index 1eab7ebf25fd..0654008198b2 100644 > > --- a/rust/kernel/sync.rs > > +++ b/rust/kernel/sync.rs > > @@ -12,6 +12,7 @@ > > pub mod lock; > > mod locked_by; > > pub mod poll; > > +pub mod rcu; > > > > pub use arc::{Arc, ArcBorrow, UniqueArc}; > > pub use condvar::{new_condvar, CondVar, CondVarTimeoutResult}; > > diff --git a/rust/kernel/sync/rcu.rs b/rust/kernel/sync/rcu.rs > > new file mode 100644 > > index ..b51d9150ffe2 > > --- /dev/null > > +++ b/rust/kernel/sync/rcu.rs > > @@ -0,0 +1,47 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > + > > +//! RCU support. > > +//! > > +//! C header: > > [`include/linux/rcupdate.h`](srctree/include/linux/rcupdate.h) > > + > > +use crate::{bindings, types::NotThreadSafe}; > > + > > +/// Evidence that the RCU read side lock is held on the current thread/CPU. > > +/// > > +/// The type is explicitly not `Send` because this property is > > per-thread/CPU. > > +/// > > +/// # Invariants > > +/// > > +/// The RCU read side lock is actually held while instances of this guard > > exist. > > +pub struct Guard(NotThreadSafe); > > + > > +impl Guard { > > +/// Acquires the RCU read side lock and returns a guard. > > +pub fn new() -> Self { > > +// SAFETY: An FFI call with no additional requirements. > > +unsafe { bindings::rcu_read_lock() }; > > +// INVARIANT: The RCU read side lock was just acquired above. > > +Self(NotThreadSafe) > > +} > > + > > +/// Explicitly releases the RCU read side lock. > > +pub fn unlock(self) {} > > I don't think there's need for this, `drop(rcu_guard)` is equally > clear. I don't mind one or the other, feel free to send a patch to remove it. :) > > There was a debate in Rust community about explicit lock methods, but > the conclusion was to not have it, > see https://github.com/rust-lang/rust/issues/81872. > > > +} > > + > > +impl Default for Guard { > > +fn default() -> Self { > > +Self::new() > > +} > > +} > > I don't think anyone would like to implicit acquire an RCU guard! I > believe you included this because clippy is yelling, but in this case > you shouldn't listen to clippy. Either suppress the warning or rename > `new` to `lock`. I picked up this patch from Wedson, so I can't tell for sure. I don't see any other reason for this though, so we could remove it. > > > + > > +impl Drop for Guard { > > +fn drop(&mut self) { > > +// SAFETY: By the type invariants, the RCU read side is locked, so > > it is ok to unlock it. > > +unsafe { bindings::rcu_read_unlock() }; > > +} > > +} > > + > > +/// Acquires the RCU read side lock. > > +pub fn read_lock() -> Guard { > > +Guard::new() > > +} >
[PATCH v7 2/2] selftests: tmpfs: Add kselftest support to tmpfs
Replace direct error handling with 'ksft_test_result_*' macros for better reporting. Test logs: Before change: - Without root error: unshare, errno 1 - With root No, output After change: - Without root TAP version 13 1..1 ok 2 # SKIP This test needs root to run! Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0 - With root TAP version 13 1..1 ok 1 Test : Success Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Shivam Chaudhary --- .../selftests/tmpfs/bug-link-o-tmpfile.c | 28 +-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c index 657b64857e82..290f11a81d2b 100644 --- a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c +++ b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c @@ -41,39 +41,39 @@ int main(void) if (unshare(CLONE_NEWNS) == -1) { if (errno == ENOSYS || errno == EPERM) { - fprintf(stderr, "error: unshare, errno %d\n", errno); - return 4; + ksft_exit_skip("unshare() error: unshare, errno %d\n", errno); + } else { + ksft_exit_fail_msg("unshare() error: unshare, errno %d\n", errno); } - fprintf(stderr, "error: unshare, errno %d\n", errno); - return 1; } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { - fprintf(stderr, "error: mount '/', errno %d\n", errno); - return 1; + ksft_exit_fail_msg("mount() error: Root filesystem private mount: Fail %d\n", errno); } /* Our heroes: 1 root inode, 1 O_TMPFILE inode, 1 permanent inode. */ if (mount(NULL, "/tmp", "tmpfs", 0, "nr_inodes=3") == -1) { - fprintf(stderr, "error: mount tmpfs, errno %d\n", errno); - return 1; + ksft_exit_fail_msg("mount() error: Mounting tmpfs on /tmp: Fail %d\n", errno); } fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); if (fd == -1) { - fprintf(stderr, "error: open 1, errno %d\n", errno); - return 1; + ksft_exit_fail_msg("openat() error: Open first temporary file: Fail %d\n", errno); } + if (linkat(fd, "", AT_FDCWD, "/tmp/1", AT_EMPTY_PATH) == -1) { - fprintf(stderr, "error: linkat, errno %d\n", errno); - return 1; + ksft_exit_fail_msg("linkat() error: Linking the temporary file: Fail %d\n", errno); + /* Ensure fd is closed on failure */ + close(fd); } close(fd); fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); if (fd == -1) { - fprintf(stderr, "error: open 2, errno %d\n", errno); - return 1; + ksft_exit_fail_msg("openat() error: Opening the second temporary file: Fail %d\n", errno); } + ksft_test_result_pass("Test : Success\n"); + ksft_exit_pass(); return 0; } -- 2.34.1
[PATCH v7 1/2] selftests: tmpfs: Add Test-skip if not run as root
Add 'ksft_exit_skip()', if not run as root, with an appropriate Warning. Add 'ksft_print_header()' and 'ksft_set_plan()' to structure test outputs more effectively. Test logs: Before Change: - Without root error: unshare, errno 1 - With root No, output After change: - Without root TAP version 13 1..1 ok 2 # SKIP This test needs root to run! Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0 - With root TAP version 13 1..1 Signed-off-by: Shivam Chaudhary --- tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c | 12 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c index b5c3ddb90942..657b64857e82 100644 --- a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c +++ b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c @@ -23,10 +23,22 @@ #include #include +#include "../kselftest.h" + int main(void) { int fd; + // Setting up kselftest framework + ksft_print_header(); + ksft_set_plan(1); + + // Check if test is run as root + if (geteuid()) { + ksft_exit_skip("This test needs root to run!\n"); + return 1; + } + if (unshare(CLONE_NEWNS) == -1) { if (errno == ENOSYS || errno == EPERM) { fprintf(stderr, "error: unshare, errno %d\n", errno); -- 2.34.1
[PATCH v7 0/2] selftests: tmpfs: Add kselftest support
This version 7 patch series replace direct error handling methods with ksft macros, which provide better reporting.Currently, when the tmpfs test runs, it does not display any output if it passes,and if it fails (particularly when not run as root),it simply exits without any warning or message. This series of patch adds: 1. Add 'ksft_print_header()' and 'ksft_set_plan()' to structure test outputs more effectively. 2. skip if not run as root. 3. Replace direct error handling with 'ksft_test_result_*', macros for better reporting. v6->v7: - Improve the handling of failure macros. v6 v1: https://lore.kernel.org/all/20241219152929.4005003-2-cvam0...@gmail.com/ v6 v2: https://lore.kernel.org/all/20241219152929.4005003-3-cvam0...@gmail.com/ v5->v6: - Skip if not run as root. v5 v1: https://lore.kernel.org/all/20241112143056.565122-2-cvam0...@gmail.com/ v5 v2: https://lore.kernel.org/all/20241112143056.565122-3-cvam0...@gmail.com/ v4->v5: - Remove unnecessary pass messages. - Remove unnecessary use of KSFT_SKIP. - Add appropriate use of ksft_exit_fail_msg. v4 v1: https://lore.kernel.org/all/8db9feab-0600-440b-b4b2-042695a10...@linuxfoundation.org/ v4 v2: https://lore.kernel.org/all/63d5e3bb-9817-4a34-98fe-823a9cac7...@linuxfoundation.org/ v3->v4: - Start a patchset - Split patch into smaller patches to make it easy to review. Patch1 Replace 'ksft_test_result_skip' with 'KSFT_SKIP' during root run check. Patch2 Replace 'ksft_test_result_fail' with 'KSFT_SKIP' where fail does not make sense, or failure could be due to not unsupported APIs with appropriate warnings. v3: https://lore.kernel.org/all/20241028185756.111832-1-cvam0...@gmail.com/ v2->v3: - Remove extra ksft_set_plan() - Remove function for unshare() - Fix the comment style v2: https://lore.kernel.org/all/20241026191621.2860376-1-cvam0...@gmail.com/ v1->v2: - Make the commit message more clear. v1: https://lore.kernel.org/all/20241024200228.1075840-1-cvam0...@gmail.com/T/#u thanks Shivam Shivam Chaudhary (2): selftests: tmpfs: Add Test-skip if not run as root selftests: tmpfs: Add kselftest support to tmpfs .../selftests/tmpfs/bug-link-o-tmpfile.c | 40 --- 1 file changed, 26 insertions(+), 14 deletions(-) -- 2.34.1
Re: [PATCH v4 3/3] selftests: pci_endpoint: Migrate to Kselftest framework
Hello Mani, Vinod, On Thu, Jan 02, 2025 at 12:34:04PM +0530, Manivannan Sadhasivam wrote: > On Tue, Dec 31, 2024 at 08:33:57PM +0100, Niklas Cassel wrote: > > > > I have some patches that adds DMA_MEMCPY to dw-edma, but I'm not sure if > > the DWC eDMA hardware supports having both src and dst as PCI addresses, or > > if only one of them can be a PCI address (with the other one being a local > > address). > > > > If only one of them can be a PCI address, then I'm not sure if your > > suggested patch is correct. > > > > I don't see why that would be an issue. DMA_MEMCPY is independent of PCI/local > addresses. If a dmaengine driver support doing MEMCPY, then the dma cap should > be sufficient. As you said, if a controller supports both SLAVE and MEMCPY, > the > test currently errors out, which is wrong. While I am okay with your suggested change to pci-epf-test.c: > >- if (epf_test->dma_private) { > >+ if (!dma_has_cap(DMA_MEMCPY, > >epf_test->dma_chan_tx->device->cap_mask)) { Since this will ensure that a DMA driver implementing DMA_MEMCPY, which cannot be shared (has DMA_PRIVATE set), will not error out. What I'm trying to explain is that in: https://lore.kernel.org/linux-pci/Z2BW4CjdE1p50AhC@vaman/ https://lore.kernel.org/linux-pci/20241217090129.6dodrgi4tn7l3cod@thinkpad/ Vinod (any you) suggested that we should add support for prep_memcpy() (which implies also setting cap DMA_MEMCPY) in the dw-edma DMA driver. However, from section "6.3 Using the DMA" in the DWC databook, the DWC eDMA hardware only supports: - Transfer (copy) of a block of data from local memory to remote memory. - Transfer (copy) of a block of data from remote memory to local memory. Currently, we have: https://github.com/torvalds/linux/blob/v6.13-rc5/include/linux/dmaengine.h#L843-L844 https://github.com/torvalds/linux/blob/v6.13-rc5/drivers/dma/dw-edma/dw-edma-core.c#L215-L231 Where we can expose per-channel capabilities, so we set MEM_TO_DEV/DEV_TO_MEM per channel, however, these are returned in a struct dma_slave_caps *caps, so this is AFAICT only for DMA_SLAVE, not for DMA_MEMCPY. Looking at: https://github.com/torvalds/linux/blob/v6.13-rc5/include/linux/dmaengine.h#L975-L979 it seems that DMA_MEMCPY is always assumed to be MEM_TO_MEM. To me, it seems that we would either need a new dma_transaction_type (e.g. DMA_COPY) where we can set dir: MEM_TO_DEV, DEV_TO_MEM, or DEV_TO_DEV. (dw-edma would not support DEV_TO_DEV.) Or, if we should stick with DMA_MEMCPY, we would need another way of telling client drivers that only src or dst can be a remote address. Until this is solved, I think I will stop my work on adding DMA_MEMCPY to the dw-edma driver. Kind regards, Niklas
Re: [PATCH v7 08/16] rust: add devres abstraction
On Tue, Dec 24, 2024 at 09:53:23PM +, Gary Guo wrote: > On Thu, 19 Dec 2024 18:04:10 +0100 > Danilo Krummrich wrote: > > > Add a Rust abstraction for the kernel's devres (device resource > > management) implementation. > > > > The Devres type acts as a container to manage the lifetime and > > accessibility of device bound resources. Therefore it registers a > > devres callback and revokes access to the resource on invocation. > > > > Users of the Devres abstraction can simply free the corresponding > > resources in their Drop implementation, which is invoked when either the > > Devres instance goes out of scope or the devres callback leads to the > > resource being revoked, which implies a call to drop_in_place(). > > > > Signed-off-by: Danilo Krummrich > > --- > > MAINTAINERS| 1 + > > rust/helpers/device.c | 10 +++ > > rust/helpers/helpers.c | 1 + > > rust/kernel/devres.rs | 178 + > > rust/kernel/lib.rs | 1 + > > 5 files changed, 191 insertions(+) > > create mode 100644 rust/helpers/device.c > > create mode 100644 rust/kernel/devres.rs > > > > > > > > +pub struct Devres(Arc>); > > + > > +impl DevresInner { > > +fn new(dev: &Device, data: T, flags: Flags) -> > > Result>> { > > +let inner = Arc::pin_init( > > +pin_init!( DevresInner { > > +data <- Revocable::new(data), > > +}), > > +flags, > > +)?; > > + > > +// Convert `Arc` into a raw pointer and make devres > > own this reference until > > +// `Self::devres_callback` is called. > > +let data = inner.clone().into_raw(); > > + > > +// SAFETY: `devm_add_action` guarantees to call > > `Self::devres_callback` once `dev` is > > +// detached. > > +let ret = unsafe { > > +bindings::devm_add_action(dev.as_raw(), > > Some(Self::devres_callback), data as _) > > +}; > > + > > +if ret != 0 { > > +// SAFETY: We just created another reference to `inner` in > > order to pass it to > > +// `bindings::devm_add_action`. If `bindings::devm_add_action` > > fails, we have to drop > > +// this reference accordingly. > > +let _ = unsafe { Arc::from_raw(data) }; > > +return Err(Error::from_errno(ret)); > > +} > > + > > +Ok(inner) > > +} > > + > > +#[allow(clippy::missing_safety_doc)] > > +unsafe extern "C" fn devres_callback(ptr: *mut kernel::ffi::c_void) { > > +let ptr = ptr as *mut DevresInner; > > +// Devres owned this memory; now that we received the callback, > > drop the `Arc` and hence the > > +// reference. > > +// SAFETY: Safe, since we leaked an `Arc` reference to > > devm_add_action() in > > +// `DevresInner::new`. > > +let inner = unsafe { Arc::from_raw(ptr) }; > > + > > +inner.data.revoke(); > > +} > > +} > > + > > +impl Devres { > > +/// Creates a new [`Devres`] instance of the given `data`. The `data` > > encapsulated within the > > +/// returned `Devres` instance' `data` will be revoked once the device > > is detached. > > +pub fn new(dev: &Device, data: T, flags: Flags) -> Result { > > +let inner = DevresInner::new(dev, data, flags)?; > > + > > +Ok(Devres(inner)) > > +} > > + > > +/// Same as [`Devres::new`], but does not return a `Devres` instance. > > Instead the given `data` > > +/// is owned by devres and will be revoked / dropped, once the device > > is detached. > > +pub fn new_foreign_owned(dev: &Device, data: T, flags: Flags) -> > > Result { > > +let _ = DevresInner::new(dev, data, flags)?; > > + > > +Ok(()) > > +} > > +} > > + > > +impl Deref for Devres { > > +type Target = Revocable; > > + > > +fn deref(&self) -> &Self::Target { > > +&self.0.data > > +} > > +} > > + > > +impl Drop for Devres { > > +fn drop(&mut self) { > > +// Revoke the data, such that it gets dropped already and the > > actual resource is freed. > > +// > > +// `DevresInner` has to stay alive until the devres callback has > > been called. This is > > +// necessary since we don't know when `Devres` is dropped and > > calling > > +// `devm_remove_action()` instead could race with > > `devres_release_all()`. > > IIUC, the outcome of that race is the `WARN` if > devres_release_all takes the spinlock first and has already remvoed the > action? Yes, this was one issue. But I think there was another when you have a class `Registration` that is owned by a `Devres`, which holds private data that is encapsulated in a `Devres` too. I have this case in Nova where the DRM device' private data holds the PCI bar, and the DRM device registration has a reference of the corresponding DRM device. But maybe this also was something else. I will double check and if I can confir
Re: [PATCH v2 00/28] module: Use RCU instead of RCU-sched.
Hi Sebastian, Le 20/12/2024 à 18:41, Sebastian Andrzej Siewior a écrit : Hi, This is an updated version of the inital post after PeterZ made me aware that there are users outside of the module directory. The goal is replace the mix auf rcu_read_lock(), rcu_read_lock_sched() and preempt_disable() with just rcu_read_lock(). I've splitted it into smaller chunks which can be applied/ reviewed independently. I'm just not sure about the cfi patch (28/28) so I added just a comment instead. v1…v2: https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F20241205215102.hRywUW2A%40linutronix.de%2F&data=05%7C02%7Cchristophe.leroy%40csgroup.eu%7Cbad0e7f9344a47371a0808dd211fdefd%7C8b87af7d86474dc78df45f69a2011bb5%7C0%7C0%7C638703142988931970%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=a%2FV5g5zRqceJWedAAZJ4zt6XJpZ0Yqm%2BqKPrsItuEXs%3D&reserved=0 - Splitted into smaller patches. - Converted all users. How did you generate that cover letter ? It should contain the full list of files modified by the series, so that I can see if any of them is of interest to me without going into each patch. This is done automatically when you use 'git format-patch --cover-letter'. Christophe
Re: [PATCH v7 08/16] rust: add devres abstraction
On Thu, Jan 02, 2025 at 11:30:11AM +0100, Danilo Krummrich wrote: > On Tue, Dec 24, 2024 at 09:53:23PM +, Gary Guo wrote: > > On Thu, 19 Dec 2024 18:04:10 +0100 > > Danilo Krummrich wrote: > > > > > Add a Rust abstraction for the kernel's devres (device resource > > > management) implementation. > > > > > > The Devres type acts as a container to manage the lifetime and > > > accessibility of device bound resources. Therefore it registers a > > > devres callback and revokes access to the resource on invocation. > > > > > > Users of the Devres abstraction can simply free the corresponding > > > resources in their Drop implementation, which is invoked when either the > > > Devres instance goes out of scope or the devres callback leads to the > > > resource being revoked, which implies a call to drop_in_place(). > > > > > > Signed-off-by: Danilo Krummrich > > > --- > > > MAINTAINERS| 1 + > > > rust/helpers/device.c | 10 +++ > > > rust/helpers/helpers.c | 1 + > > > rust/kernel/devres.rs | 178 + > > > rust/kernel/lib.rs | 1 + > > > 5 files changed, 191 insertions(+) > > > create mode 100644 rust/helpers/device.c > > > create mode 100644 rust/kernel/devres.rs > > > > > > > > > > > > +pub struct Devres(Arc>); > > > + > > > +impl DevresInner { > > > +fn new(dev: &Device, data: T, flags: Flags) -> > > > Result>> { > > > +let inner = Arc::pin_init( > > > +pin_init!( DevresInner { > > > +data <- Revocable::new(data), > > > +}), > > > +flags, > > > +)?; > > > + > > > +// Convert `Arc` into a raw pointer and make devres > > > own this reference until > > > +// `Self::devres_callback` is called. > > > +let data = inner.clone().into_raw(); > > > + > > > +// SAFETY: `devm_add_action` guarantees to call > > > `Self::devres_callback` once `dev` is > > > +// detached. > > > +let ret = unsafe { > > > +bindings::devm_add_action(dev.as_raw(), > > > Some(Self::devres_callback), data as _) > > > +}; > > > + > > > +if ret != 0 { > > > +// SAFETY: We just created another reference to `inner` in > > > order to pass it to > > > +// `bindings::devm_add_action`. If > > > `bindings::devm_add_action` fails, we have to drop > > > +// this reference accordingly. > > > +let _ = unsafe { Arc::from_raw(data) }; > > > +return Err(Error::from_errno(ret)); > > > +} > > > + > > > +Ok(inner) > > > +} > > > + > > > +#[allow(clippy::missing_safety_doc)] > > > +unsafe extern "C" fn devres_callback(ptr: *mut kernel::ffi::c_void) { > > > +let ptr = ptr as *mut DevresInner; > > > +// Devres owned this memory; now that we received the callback, > > > drop the `Arc` and hence the > > > +// reference. > > > +// SAFETY: Safe, since we leaked an `Arc` reference to > > > devm_add_action() in > > > +// `DevresInner::new`. > > > +let inner = unsafe { Arc::from_raw(ptr) }; > > > + > > > +inner.data.revoke(); > > > +} > > > +} > > > + > > > +impl Devres { > > > +/// Creates a new [`Devres`] instance of the given `data`. The > > > `data` encapsulated within the > > > +/// returned `Devres` instance' `data` will be revoked once the > > > device is detached. > > > +pub fn new(dev: &Device, data: T, flags: Flags) -> Result { > > > +let inner = DevresInner::new(dev, data, flags)?; > > > + > > > +Ok(Devres(inner)) > > > +} > > > + > > > +/// Same as [`Devres::new`], but does not return a `Devres` > > > instance. Instead the given `data` > > > +/// is owned by devres and will be revoked / dropped, once the > > > device is detached. > > > +pub fn new_foreign_owned(dev: &Device, data: T, flags: Flags) -> > > > Result { > > > +let _ = DevresInner::new(dev, data, flags)?; > > > + > > > +Ok(()) > > > +} > > > +} > > > + > > > +impl Deref for Devres { > > > +type Target = Revocable; > > > + > > > +fn deref(&self) -> &Self::Target { > > > +&self.0.data > > > +} > > > +} > > > + > > > +impl Drop for Devres { > > > +fn drop(&mut self) { > > > +// Revoke the data, such that it gets dropped already and the > > > actual resource is freed. > > > +// > > > +// `DevresInner` has to stay alive until the devres callback has > > > been called. This is > > > +// necessary since we don't know when `Devres` is dropped and > > > calling > > > +// `devm_remove_action()` instead could race with > > > `devres_release_all()`. > > > > IIUC, the outcome of that race is the `WARN` if > > devres_release_all takes the spinlock first and has already remvoed the > > action? > > Yes, this was one issue. But I think there was another when you have a class > `Registration` that is
Re: [RFC PATCH 1/2] ptp: add PTP_SYS_OFFSET_STAT for xtstamping with status
On Wed, Dec 25, 2024 at 04:42:14PM -0800, Richard Cochran wrote: > On Mon, Dec 23, 2024 at 07:13:46PM +0100, Peter Hilber wrote: > > > The precise synchronization of the VM guest with its immediate > > environment can also be important; a VM guest may depend the decision > > about leap second smearing on its environment. > > I thought that the whole point of using a VM is to isolate the guests > from each other and the host. What you describe is a promiscuous > coupling between guest and host, and the kernel shouldn't be in > the business of supporting such behavior. > Why? There is already ptp_kvm etc. in the kernel. Would it be more acceptable to just announce leap seconds, but not whether to smear? > > Also, the administrative > > configuration choice may change over the lifetime of a system. > > Right, which is why we should keep those choices out of kernel space. > Kernel provides mechanism, not policy. > As discussed, the policy would be forwarded, not determined, by the kernel. If the policy would be forwarded via NTP (by the NTP server smearing or not smearing leap seconds), this would have the following disadvantages: - need to use an NTP server just for announcing leap seconds - redundancy of serving time both via NTP and via PTP clocks (for better precision) - no awareness about leap seconds in case of smearing. > > The intent is to also support (embedded) VM clients which are themselves > > not necessarily internetworked, which do not get a lot of maintenance, > > and which are not guaranteed to get an update within the typically less > > than 6 months between leap second announcement and occurrence. > > Again, I don't think the kernel should be the solution to guests that > lack networking. Instead, the place to fix the problem is at the > root, namely in the guests. > I do not understand. Is the point that guests should decide through another channel about leap second smearing? > > I agree that a device driver should not determine clock quality metrics. > > The intent is that the driver forwards metrics, if such are advertised > > by the device. These metrics should describe the accuracy etc. of the > > device itself. > > Overall, I don't trust devices to tell the truth about their > qualities. But putting that aside, we would need to see some kind of > commonality in hardware implementation to advertise their metrics. > However, AFAICT there is no such industry practice on the market. > I hope there will be some feedback from third parties (at least related to virtualization). > > The patch message should document this more clearly. The > > metrics can be determined e.g. by virtualization host user space > > software. The device driver would just expose the device metrics to user > > space. > > Again, host user space shouldn't misuse the kernel to share random > metrics with guest user space. Isn't there another way to share such > info from host to guest? > For sure. But the aim of this proposal is to have an interoperable time synchronization solution for VMs through a Virtio device. So the idea is to include metrics, if a consensus on their usefulness can be reached. AFAIU it is difficult to bypass the kernel for Virtio devices. Thanks for the discussion, Peter
Re: [RFC PATCH 1/2] ptp: add PTP_SYS_OFFSET_STAT for xtstamping with status
On Thu, Jan 02, 2025 at 05:11:01PM +0100, Peter Hilber wrote: > Would it be more acceptable to just announce leap seconds, but not > whether to smear? Up until now, leap second announcements were handled in user space, and the kernel played no role. > I do not understand. Is the point that guests should decide through > another channel about leap second smearing? Yes, that would make more sense to me. > I hope there will be some feedback from third parties (at least related > to virtualization). +1 I'm no VM expert, but I'd like to avoid tacking things onto the kernel PTP layer, unless there is a really strong justification. > For sure. But the aim of this proposal is to have an interoperable time > synchronization solution for VMs through a Virtio device. So the idea is > to include metrics, if a consensus on their usefulness can be reached. > AFAIU it is difficult to bypass the kernel for Virtio devices. Providing clock metrics only makes sense when there is some choice to be made based on those metrics. If the "limited" VM guests don't even have networking, then they have no choice but to accept the time from the VM host, right? In which case, the metrics do not provide any benefit to the guest. Or what am I missing? Thanks, Richard
Re: [PATCH v7 03/16] rust: implement `IdArray`, `IdTable` and `RawDeviceId`
On Tue, Dec 24, 2024 at 08:10:02PM +, Gary Guo wrote: > On Thu, 19 Dec 2024 18:04:05 +0100 > Danilo Krummrich wrote: > > > Most subsystems use some kind of ID to match devices and drivers. Hence, > > we have to provide Rust drivers an abstraction to register an ID table > > for the driver to match. > > > > Generally, those IDs are subsystem specific and hence need to be > > implemented by the corresponding subsystem. However, the `IdArray`, > > `IdTable` and `RawDeviceId` types provide a generalized implementation > > that makes the life of subsystems easier to do so. > > > > Co-developed-by: Wedson Almeida Filho > > Signed-off-by: Wedson Almeida Filho > > Co-developed-by: Gary Guo > > Signed-off-by: Gary Guo > > Co-developed-by: Fabien Parent > > Signed-off-by: Fabien Parent > > Signed-off-by: Danilo Krummrich > > Thank you for converting my prototype to a working patch. There's a nit below. > > > --- > > MAINTAINERS | 1 + > > rust/kernel/device_id.rs | 165 +++ > > rust/kernel/lib.rs | 6 ++ > > 3 files changed, 172 insertions(+) > > create mode 100644 rust/kernel/device_id.rs > > > > diff --git a/MAINTAINERS b/MAINTAINERS > > index 2ad58ed40079..3cfb68650347 100644 > > --- a/MAINTAINERS > > +++ b/MAINTAINERS > > @@ -7033,6 +7033,7 @@ F:include/linux/kobj* > > F: include/linux/property.h > > F: lib/kobj* > > F: rust/kernel/device.rs > > +F: rust/kernel/device_id.rs > > F: rust/kernel/driver.rs > > > > DRIVERS FOR OMAP ADAPTIVE VOLTAGE SCALING (AVS) > > diff --git a/rust/kernel/device_id.rs b/rust/kernel/device_id.rs > > new file mode 100644 > > index ..e5859217a579 > > --- /dev/null > > +++ b/rust/kernel/device_id.rs > > > > > > > > + > > +impl RawIdArray { > > +#[doc(hidden)] > > +pub const fn size(&self) -> usize { > > +core::mem::size_of::() > > +} > > +} > > + > > This is not necessary, see below. > > > > > > > + > > +/// Create device table alias for modpost. > > +#[macro_export] > > +macro_rules! module_device_table { > > +($table_type: literal, $module_table_name:ident, $table_name:ident) => > > { > > +#[rustfmt::skip] > > +#[export_name = > > +concat!("__mod_device_table__", $table_type, > > +"__", module_path!(), > > +"_", line!(), > > +"_", stringify!($table_name)) > > +] > > +static $module_table_name: [core::mem::MaybeUninit; > > $table_name.raw_ids().size()] = > > +unsafe { core::mem::transmute_copy($table_name.raw_ids()) }; > > const_size_of_val will be stable in Rust 1.85, so we can start using the > feature and > > static $module_table_name: [core::mem::MaybeUninit; > core::mem::size_of_val($table_name.raw_ids())] = > unsafe { core::mem::transmute_copy($table_name.raw_ids()) }; > > should work. Thanks for the note, this indeed saves a few lines. Since the series has been applied already, feel free to send a patch. > > > +}; > > +} > > diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs > > index 7818407f9aac..66149ac5c0c9 100644 > > --- a/rust/kernel/lib.rs > > +++ b/rust/kernel/lib.rs > > @@ -18,6 +18,11 @@ > > #![feature(inline_const)] > > #![feature(lint_reasons)] > > #![feature(unsize)] > > +// Stable in Rust 1.83 > > +#![feature(const_maybe_uninit_as_mut_ptr)] > > +#![feature(const_mut_refs)] > > +#![feature(const_ptr_write)] > > +#![feature(const_refs_to_cell)] > > > > // Ensure conditional compilation based on the kernel configuration works; > > // otherwise we may silently break things like initcall handling. > > @@ -35,6 +40,7 @@ > > mod build_assert; > > pub mod cred; > > pub mod device; > > +pub mod device_id; > > pub mod driver; > > pub mod error; > > #[cfg(CONFIG_RUST_FW_LOADER_ABSTRACTIONS)] >
[PATCH] selftests: livepatch: test if ftrace can trace a livepatched function
This new test makes sure that ftrace can trace a function that was introduced by a livepatch. Signed-off-by: Filipe Xavier --- tools/testing/selftests/livepatch/test-ftrace.sh | 37 1 file changed, 37 insertions(+) diff --git a/tools/testing/selftests/livepatch/test-ftrace.sh b/tools/testing/selftests/livepatch/test-ftrace.sh index fe14f248913acbec46fb6c0fec38a2fc84209d39..5f0d5308c88669e84210393ce7b8aa138b694ebd 100755 --- a/tools/testing/selftests/livepatch/test-ftrace.sh +++ b/tools/testing/selftests/livepatch/test-ftrace.sh @@ -61,4 +61,41 @@ livepatch: '$MOD_LIVEPATCH': unpatching complete % rmmod $MOD_LIVEPATCH" +# - verify livepatch can load +# - check traces if have a patched function +# - unload livepatch and reset trace + +start_test "livepatch trace patched function and check that the live patch remains in effect" + +TRACE_FILE="$SYSFS_DEBUG_DIR/tracing/trace" +FUNCTION_NAME="livepatch_cmdline_proc_show" + +load_lp $MOD_LIVEPATCH + +echo $FUNCTION_NAME > $SYSFS_DEBUG_DIR/tracing/set_ftrace_filter +echo "function" > $SYSFS_DEBUG_DIR/tracing/current_tracer +echo "" > $TRACE_FILE + +if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]] ; then + echo -e "FAIL\n\n" + die "livepatch kselftest(s) failed" +fi + +grep -q $FUNCTION_NAME $TRACE_FILE +FOUND=$? + +disable_lp $MOD_LIVEPATCH +unload_lp $MOD_LIVEPATCH + +# Reset tracing +echo "nop" > $SYSFS_DEBUG_DIR/tracing/current_tracer +echo "" > $SYSFS_DEBUG_DIR/tracing/set_ftrace_filter +echo "" > $TRACE_FILE + +if [ "$FOUND" -eq 1 ]; then + echo -e "FAIL\n\n" + die "livepatch kselftest(s) failed" +fi + + exit 0 --- base-commit: fc033cf25e612e840e545f8d5ad2edd6ba613ed5 change-id: 20250101-ftrace-selftest-livepatch-161fb77dbed8 Best regards, -- Filipe Xavier