On 21 Jul 2025, at 7:46, wang lian wrote: > Add tests for process_madvise(), focusing on verifying behavior under > various conditions including valid usage and error cases. > > Signed-off-by: wang lian <lianux...@gmail.com> > Suggested-by: Lorenzo Stoakes <lorenzo.stoa...@oracle.com> > Suggested-by: David Hildenbrand <da...@redhat.com> > Suggested-by: Zi Yan <z...@nvidia.com> > Suggested-by: Mark Brown <broo...@kernel.org> > Acked-by: SeongJae Park <s...@kernel.org> > > --- > Changelog v6: > - Refactor child process and pidfd management to use the kselftest > fixture's setup and teardown mechanism. This ensures that child > processes are reliably terminated and file descriptors are closed, even > when a test is aborted by an ASSERT or SKIP macro. This resolves the > issue where a failed assertion could lead to a leaked child process. > > Changelog v5: > https://lore.kernel.org/lkml/20250714122533.3135-1-lianux...@gmail.com/ > - Refactor the remote_collapse test to concentrate on its primary goal > confirming the successful remote invocation of process_madvise() on a child > process. > - Split the validation logic for invalid pidfds out of the remote test and > into two new > (`exited_process_pidfd` and `bad_pidfd`). > - Based mm-new branch, can ensure clean application > > > Changelog v4: > https://lore.kernel.org/lkml/20250710112249.58722-1-lianux...@gmail.com/ > - Refine resource cleanup logic in test teardown to be more robust. > - Improve remote_collapse test to correctly handle different THP > (Transparent Huge Page) policies ('always', 'madvise', 'never'), > including handling race conditions with khugepaged. > - Resolve build errors > > Changelog v3: > https://lore.kernel.org/lkml/20250703044326.65061-1-lianux...@gmail.com/ > - Rebased onto the latest mm-stable branch to ensure clean application. > - Refactor common signal handling logic into vm_util to reduce code > duplication. > - Improve test robustness and diagnostics based on community feedback. > - Address minor code style and script corrections. > > Changelog v2: > https://lore.kernel.org/lkml/20250630140957.4000-1-lianux...@gmail.com/ > - Drop MADV_DONTNEED tests based on feedback. > - Focus solely on process_madvise() syscall. > - Improve error handling and structure. > - Add future-proof flag test. > - Style and comment cleanups. > > -V1: https://lore.kernel.org/lkml/20250621133003.4733-1-lianux...@gmail.com/ > tools/testing/selftests/mm/.gitignore | 1 + > tools/testing/selftests/mm/Makefile | 1 + > tools/testing/selftests/mm/process_madv.c | 302 ++++++++++++++++++++++ > tools/testing/selftests/mm/run_vmtests.sh | 5 + > 4 files changed, 309 insertions(+) > create mode 100644 tools/testing/selftests/mm/process_madv.c > > diff --git a/tools/testing/selftests/mm/.gitignore > b/tools/testing/selftests/mm/.gitignore > index f2dafa0b700b..e7b23a8a05fe > --- a/tools/testing/selftests/mm/.gitignore > +++ b/tools/testing/selftests/mm/.gitignore > @@ -21,6 +21,7 @@ on-fault-limit > transhuge-stress > pagemap_ioctl > pfnmap > +process_madv > *.tmp* > protection_keys > protection_keys_32 > diff --git a/tools/testing/selftests/mm/Makefile > b/tools/testing/selftests/mm/Makefile > index ae6f994d3add..d13b3cef2a2b 100644 > --- a/tools/testing/selftests/mm/Makefile > +++ b/tools/testing/selftests/mm/Makefile > @@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test > TEST_GEN_FILES += on-fault-limit > TEST_GEN_FILES += pagemap_ioctl > TEST_GEN_FILES += pfnmap > +TEST_GEN_FILES += process_madv > TEST_GEN_FILES += thuge-gen > TEST_GEN_FILES += transhuge-stress > TEST_GEN_FILES += uffd-stress > diff --git a/tools/testing/selftests/mm/process_madv.c > b/tools/testing/selftests/mm/process_madv.c > new file mode 100644 > index 000000000000..8a83eac3bfab > --- /dev/null > +++ b/tools/testing/selftests/mm/process_madv.c > @@ -0,0 +1,302 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > + > +#define _GNU_SOURCE > +#include "../kselftest_harness.h" > +#include <errno.h> > +#include <setjmp.h> > +#include <signal.h> > +#include <stdbool.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <linux/mman.h> > +#include <sys/syscall.h> > +#include <unistd.h> > +#include <sched.h> > +#include "vm_util.h" > + > +#include "../pidfd/pidfd.h" > + > +FIXTURE(process_madvise) > +{ > + unsigned long page_size; > + pid_t child_pid; > + int remote_pidfd; > + int pidfd; > +}; > + > +FIXTURE_SETUP(process_madvise) > +{ > + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); > + self->pidfd = PIDFD_SELF; > + self->remote_pidfd = -1; > + self->child_pid = -1; > +}; > + > +FIXTURE_TEARDOWN_PARENT(process_madvise) > +{ > + /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */ > + if (self->child_pid > 0) { > + kill(self->child_pid, SIGKILL); > + waitpid(self->child_pid, NULL, 0); > + } > + > + if (self->remote_pidfd >= 0) > + close(self->remote_pidfd); > +} > + > +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, > + size_t vlen, int advice, unsigned int flags) > +{ > + return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags); > +} > + > +/* > + * This test uses PIDFD_SELF to target the current process. The main > + * goal is to verify the basic behavior of process_madvise() with > + * a vector of non-contiguous memory ranges, not its cross-process > + * capabilities. > + */ > +TEST_F(process_madvise, basic) > +{ > + const unsigned long pagesize = self->page_size; > + const int madvise_pages = 4; > + struct iovec vec[madvise_pages]; > + int pidfd = self->pidfd; > + ssize_t ret; > + char *map; > + > + /* > + * Create a single large mapping. We will pick pages from this > + * mapping to advise on. This ensures we test non-contiguous iovecs. > + */ > + map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > + if (map == MAP_FAILED) > + SKIP(return, "mmap failed, not enough memory.\n"); > + > + /* Fill the entire region with a known pattern. */ > + memset(map, 'A', pagesize * 10); > + > + /* > + * Setup the iovec to point to 4 non-contiguous pages > + * within the mapping. > + */ > + vec[0].iov_base = &map[0 * pagesize]; > + vec[0].iov_len = pagesize; > + vec[1].iov_base = &map[3 * pagesize]; > + vec[1].iov_len = pagesize; > + vec[2].iov_base = &map[5 * pagesize]; > + vec[2].iov_len = pagesize; > + vec[3].iov_base = &map[8 * pagesize]; > + vec[3].iov_len = pagesize; > + > + ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0); > + if (ret == -1 && errno == EPERM) > + SKIP(return, > + "process_madvise() unsupported or permission denied, > try running as root.\n"); > + else if (errno == EINVAL) > + SKIP(return, > + "process_madvise() unsupported or parameter invalid, > please check arguments.\n"); > + > + /* The call should succeed and report the total bytes processed. */ > + ASSERT_EQ(ret, madvise_pages * pagesize); > + > + /* Check that advised pages are now zero. */ > + for (int i = 0; i < madvise_pages; i++) { > + char *advised_page = (char *)vec[i].iov_base; > + > + /* Content must be 0, not 'A'. */ > + ASSERT_EQ(*advised_page, '\0'); > + } > + > + /* Check that an un-advised page in between is still 'A'. */ > + char *unadvised_page = &map[1 * pagesize]; > + > + for (int i = 0; i < pagesize; i++) > + ASSERT_EQ(unadvised_page[i], 'A'); > + > + /* Cleanup. */ > + ASSERT_EQ(munmap(map, pagesize * 10), 0); > +} > + > +/* > + * This test deterministically validates process_madvise() with MADV_COLLAPSE > + * on a remote process, other advices are difficult to verify reliably. > + * > + * The test verifies that a memory region in a child process, > + * focus on process_madv remote result, only check addresses and lengths. > + * The correctness of the MADV_COLLAPSE can be found in the relevant test > examples in khugepaged. > + */ > +TEST_F(process_madvise, remote_collapse) > +{ > + const unsigned long pagesize = self->page_size; > + long huge_page_size; > + int pipe_info[2]; > + ssize_t ret; > + struct iovec vec; > + > + struct child_info { > + pid_t pid; > + void *map_addr; > + } info; > + > + huge_page_size = default_huge_page_size();
You should use read_pmd_pagesize() here, since default_huge_page_size() is used for hugetlb. See Documentation/admin-guide/mm/hugetlbpage.rst. MADV_COLLAPSE is for THP. The test passes if HugeTLB size is 2M and fails otherwise. > + if (huge_page_size <= 0) > + SKIP(return, "Could not determine a valid huge page size.\n"); > + > + ASSERT_EQ(pipe(pipe_info), 0); > + > + self->child_pid = fork(); > + ASSERT_NE(self->child_pid, -1); > + > + if (self->child_pid == 0) { > + char *map; > + size_t map_size = 2 * huge_page_size; > + > + close(pipe_info[0]); > + > + map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > + ASSERT_NE(map, MAP_FAILED); > + > + /* Fault in as small pages */ > + for (size_t i = 0; i < map_size; i += pagesize) > + map[i] = 'A'; > + > + /* Send info and pause */ > + info.pid = getpid(); > + info.map_addr = map; > + ret = write(pipe_info[1], &info, sizeof(info)); > + ASSERT_EQ(ret, sizeof(info)); > + close(pipe_info[1]); > + > + pause(); > + exit(0); > + } > + > + close(pipe_info[1]); > + > + /* Receive child info */ > + ret = read(pipe_info[0], &info, sizeof(info)); > + if (ret <= 0) { > + waitpid(self->child_pid, NULL, 0); > + SKIP(return, "Failed to read child info from pipe.\n"); > + } > + ASSERT_EQ(ret, sizeof(info)); > + close(pipe_info[0]); > + self->child_pid = info.pid; > + > + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); > + ASSERT_GE(self->remote_pidfd, 0); > + > + vec.iov_base = info.map_addr; > + vec.iov_len = huge_page_size; > + > + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, > 0); > + if (ret == -1) { > + if (errno == EINVAL) > + SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n"); > + else if (errno == EPERM) > + SKIP(return, > + "No process_madvise() permissions, try > running as root.\n"); > + return; > + } > + > + ASSERT_EQ(ret, huge_page_size); > +} > + > +/* > + * Test process_madvise() with a pidfd for a process that has already > + * exited to ensure correct error handling. > + */ > +TEST_F(process_madvise, exited_process_pidfd) > +{ > + struct iovec vec; > + ssize_t ret; > + int pidfd; > + > + vec.iov_base = (void *)0x1234; > + vec.iov_len = 4096; Here an invalid address range is provided, since pid is checked before address ranges are checked. BTW, the size of iovec array cannot be bigger than IOV_MAX. It might be worth testing as well, if you want to. With default_huge_page_size() -> read_pmd_pagesize() fix, feel free to add: Reviewed-by: Zi Yan <z...@nvidia.com> Tested-by: Zi Yan <z...@nvidia.com> I am able to compile and run the test on arm64. Thanks. Best Regards, Yan, Zi