Add tests for process_madvise(), focusing on verifying behavior under
various conditions including valid usage and error cases.

Signed-off-by: wang lian <lianux...@gmail.com>
Suggested-by: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
Suggested-by: David Hildenbrand <da...@redhat.com>
Suggested-by: Zi Yan <z...@nvidia.com>
Suggested-by: Mark Brown <broo...@kernel.org>
Acked-by: SeongJae Park <s...@kernel.org>

---
Changelog v6:
- Refactor child process and pidfd management to use the kselftest
  fixture's setup and teardown mechanism. This ensures that child
  processes are reliably terminated and file descriptors are closed, even
  when a test is aborted by an ASSERT or SKIP macro. This resolves the
  issue where a failed assertion could lead to a leaked child process.

Changelog v5: 
https://lore.kernel.org/lkml/20250714122533.3135-1-lianux...@gmail.com/
- Refactor the remote_collapse test to concentrate on its primary goal:
  confirming the successful remote invocation of process_madvise() on a
  child process.
- Split the validation logic for invalid pidfds out of the remote test and
  into two new tests (`exited_process_pidfd` and `bad_pidfd`).
- Rebase onto the mm-new branch to ensure clean application.


Changelog v4: 
https://lore.kernel.org/lkml/20250710112249.58722-1-lianux...@gmail.com/
- Refine resource cleanup logic in test teardown to be more robust.
- Improve remote_collapse test to correctly handle different THP
  (Transparent Huge Page) policies ('always', 'madvise', 'never'),
  including handling race conditions with khugepaged.
- Resolve build errors

Changelog v3: 
https://lore.kernel.org/lkml/20250703044326.65061-1-lianux...@gmail.com/
- Rebased onto the latest mm-stable branch to ensure clean application.
- Refactor common signal handling logic into vm_util to reduce code duplication.
- Improve test robustness and diagnostics based on community feedback.
- Address minor code style and script corrections.

Changelog v2: 
https://lore.kernel.org/lkml/20250630140957.4000-1-lianux...@gmail.com/
- Drop MADV_DONTNEED tests based on feedback.
- Focus solely on process_madvise() syscall.
- Improve error handling and structure.
- Add future-proof flag test.
- Style and comment cleanups.

-V1: https://lore.kernel.org/lkml/20250621133003.4733-1-lianux...@gmail.com/
 tools/testing/selftests/mm/.gitignore     |   1 +
 tools/testing/selftests/mm/Makefile       |   1 +
 tools/testing/selftests/mm/process_madv.c | 302 ++++++++++++++++++++++
 tools/testing/selftests/mm/run_vmtests.sh |   5 +
 4 files changed, 309 insertions(+)
 create mode 100644 tools/testing/selftests/mm/process_madv.c

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index f2dafa0b700b..e7b23a8a05fe 
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -21,6 +21,7 @@ on-fault-limit
 transhuge-stress
 pagemap_ioctl
 pfnmap
+process_madv
 *.tmp*
 protection_keys
 protection_keys_32
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index ae6f994d3add..d13b3cef2a2b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
 TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += pagemap_ioctl
 TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += process_madv
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
new file mode 100644
index 000000000000..8a83eac3bfab
--- /dev/null
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sched.h>
+#include "vm_util.h"
+
+#include "../pidfd/pidfd.h"
+
+/*
+ * Per-test state for the process_madvise tests. The fields are initialised
+ * by FIXTURE_SETUP and released by FIXTURE_TEARDOWN_PARENT.
+ */
+FIXTURE(process_madvise)
+{
+       unsigned long page_size;        /* sysconf(_SC_PAGESIZE), cached by setup */
+       pid_t child_pid;                /* forked child to kill and reap, or -1 */
+       int remote_pidfd;               /* pidfd closed by teardown, or -1 */
+       int pidfd;                      /* PIDFD_SELF, for the self-targeting tests */
+};
+
+/*
+ * Cache the page size and mark the child/pidfd slots as unused (-1), so the
+ * teardown only cleans up what a test actually created.
+ */
+FIXTURE_SETUP(process_madvise)
+{
+       self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+       self->pidfd = PIDFD_SELF;
+       self->remote_pidfd = -1;
+       self->child_pid = -1;
+}
+
+/*
+ * Cleanup for whatever a test left behind: kill and reap the forked child,
+ * then close the remote pidfd. This teardown also runs when a test bails
+ * out through SKIP or ASSERT.
+ */
+FIXTURE_TEARDOWN_PARENT(process_madvise)
+{
+       const pid_t child = self->child_pid;
+       const int fd = self->remote_pidfd;
+
+       if (child > 0) {
+               kill(child, SIGKILL);
+               waitpid(child, NULL, 0);
+       }
+
+       if (fd >= 0)
+               close(fd);
+}
+
+/*
+ * Thin wrapper that issues the raw process_madvise() system call and hands
+ * back syscall()'s return value unchanged.
+ */
+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
+                                  size_t vlen, int advice, unsigned int flags)
+{
+       long res;
+
+       res = syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
+       return res;
+}
+
+/*
+ * This test uses PIDFD_SELF to target the current process. The main
+ * goal is to verify the basic behavior of process_madvise() with
+ * a vector of non-contiguous memory ranges, not its cross-process
+ * capabilities.
+ */
+TEST_F(process_madvise, basic)
+{
+       const unsigned long pagesize = self->page_size;
+       const size_t map_size = pagesize * 10;
+       const int madvise_pages = 4;
+       struct iovec vec[madvise_pages];
+       int pidfd = self->pidfd;
+       char *unadvised_page;
+       ssize_t ret;
+       char *map;
+
+       /*
+        * Create a single large mapping. We will pick pages from this
+        * mapping to advise on. This ensures we test non-contiguous iovecs.
+        */
+       map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (map == MAP_FAILED)
+               SKIP(return, "mmap failed, not enough memory.\n");
+
+       /* Fill the entire region with a known pattern. */
+       memset(map, 'A', map_size);
+
+       /*
+        * Setup the iovec to point to 4 non-contiguous pages
+        * within the mapping.
+        */
+       vec[0].iov_base = &map[0 * pagesize];
+       vec[0].iov_len = pagesize;
+       vec[1].iov_base = &map[3 * pagesize];
+       vec[1].iov_len = pagesize;
+       vec[2].iov_base = &map[5 * pagesize];
+       vec[2].iov_len = pagesize;
+       vec[3].iov_base = &map[8 * pagesize];
+       vec[3].iov_len = pagesize;
+
+       ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
+       /*
+        * errno is only meaningful when the call failed; on success it may
+        * still hold a stale value, so every errno check is guarded by ret.
+        */
+       if (ret == -1 && errno == EPERM)
+               SKIP(return,
+                          "process_madvise() unsupported or permission denied, try running as root.\n");
+       else if (ret == -1 && errno == EINVAL)
+               SKIP(return,
+                          "process_madvise() unsupported or parameter invalid, please check arguments.\n");
+
+       /* The call should succeed and report the total bytes processed. */
+       ASSERT_EQ(ret, madvise_pages * pagesize);
+
+       /* Check that advised pages are now zero. */
+       for (int i = 0; i < madvise_pages; i++) {
+               char *advised_page = (char *)vec[i].iov_base;
+
+               /* Content must be 0, not 'A'. */
+               ASSERT_EQ(*advised_page, '\0');
+       }
+
+       /* Check that an un-advised page in between is still 'A'. */
+       unadvised_page = &map[1 * pagesize];
+
+       for (unsigned long i = 0; i < pagesize; i++)
+               ASSERT_EQ(unadvised_page[i], 'A');
+
+       /* Cleanup. */
+       ASSERT_EQ(munmap(map, map_size), 0);
+}
+
+/*
+ * This test validates process_madvise() with MADV_COLLAPSE on a remote
+ * process; other advice values are difficult to verify reliably.
+ *
+ * The test targets a memory region in a child process and focuses on the
+ * result of the remote process_madvise() call: only the address and length
+ * handling is checked. The correctness of MADV_COLLAPSE itself is covered
+ * by the relevant khugepaged tests.
+ */
+TEST_F(process_madvise, remote_collapse)
+{
+       const unsigned long pagesize = self->page_size;
+       long huge_page_size;
+       int pipe_info[2];
+       ssize_t ret;
+       struct iovec vec;
+
+       struct child_info {
+               pid_t pid;
+               void *map_addr;
+       } info;
+
+       huge_page_size = default_huge_page_size();
+       if (huge_page_size <= 0)
+               SKIP(return, "Could not determine a valid huge page size.\n");
+
+       ASSERT_EQ(pipe(pipe_info), 0);
+
+       self->child_pid = fork();
+       ASSERT_NE(self->child_pid, -1);
+
+       if (self->child_pid == 0) {
+               char *map;
+               size_t map_size = 2 * huge_page_size;
+
+               close(pipe_info[0]);
+
+               map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+               ASSERT_NE(map, MAP_FAILED);
+
+               /* Fault in as small pages */
+               for (size_t i = 0; i < map_size; i += pagesize)
+                       map[i] = 'A';
+
+               /* Send info and pause */
+               info.pid = getpid();
+               info.map_addr = map;
+               ret = write(pipe_info[1], &info, sizeof(info));
+               ASSERT_EQ(ret, sizeof(info));
+               close(pipe_info[1]);
+
+               /* Stay alive until the fixture teardown kills us. */
+               pause();
+               exit(0);
+       }
+
+       close(pipe_info[1]);
+
+       /* Receive child info */
+       ret = read(pipe_info[0], &info, sizeof(info));
+       close(pipe_info[0]);
+
+       /*
+        * On failure, leave the child to the fixture teardown, which kills
+        * and reaps it. Reaping it here as well would make the teardown
+        * signal a PID that is no longer ours, and would block forever if
+        * the child were still alive.
+        */
+       if (ret <= 0)
+               SKIP(return, "Failed to read child info from pipe.\n");
+       ASSERT_EQ(ret, sizeof(info));
+       self->child_pid = info.pid;
+
+       self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+       ASSERT_GE(self->remote_pidfd, 0);
+
+       vec.iov_base = info.map_addr;
+       vec.iov_len = huge_page_size;
+
+       ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, 0);
+       if (ret == -1) {
+               if (errno == EINVAL)
+                       SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
+               else if (errno == EPERM)
+                       SKIP(return,
+                                  "No process_madvise() permissions, try running as root.\n");
+               /*
+                * NOTE(review): any other error ends the test here without
+                * a failed assertion - presumably because MADV_COLLAPSE can
+                * legitimately fail depending on the THP configuration;
+                * confirm this is intended.
+                */
+               return;
+       }
+
+       ASSERT_EQ(ret, huge_page_size);
+}
+
+/*
+ * Test process_madvise() with a pidfd for a process that has already
+ * exited to ensure correct error handling.
+ */
+TEST_F(process_madvise, exited_process_pidfd)
+{
+       struct iovec vec;
+       ssize_t ret;
+
+       vec.iov_base = (void *)0x1234;
+       vec.iov_len = 4096;
+
+       /*
+        * Using a pidfd for a process that has already exited should fail
+        * with ESRCH.
+        */
+       self->child_pid = fork();
+       ASSERT_NE(self->child_pid, -1);
+
+       if (self->child_pid == 0)
+               exit(0);
+
+       /*
+        * Keep the pidfd in the fixture rather than in a local, so that the
+        * fixture teardown closes it even when an ASSERT below aborts the
+        * test.
+        */
+       self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
+       ASSERT_GE(self->remote_pidfd, 0);
+
+       /* Wait for the child to ensure it has terminated. */
+       ASSERT_EQ(waitpid(self->child_pid, NULL, 0), self->child_pid);
+
+       /*
+        * The child has been reaped, so its PID is no longer ours. Clear it
+        * so the fixture teardown does not send SIGKILL to it.
+        */
+       self->child_pid = -1;
+
+       ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED, 0);
+       ASSERT_EQ(ret, -1);
+       ASSERT_EQ(errno, ESRCH);
+}
+
+/*
+ * Feed process_madvise() file descriptors that are not pidfds and check
+ * that each call is rejected with EBADF.
+ */
+TEST_F(process_madvise, bad_pidfd)
+{
+       /*
+        * -1 is not a valid fd number at all; stdin is a valid fd that is
+        * not a pidfd. Both are expected to fail the same way.
+        */
+       const int bad_fds[] = { -1, STDIN_FILENO };
+       struct iovec iov = {
+               .iov_base = (void *)0x1234,
+               .iov_len = 4096,
+       };
+
+       for (size_t i = 0; i < sizeof(bad_fds) / sizeof(bad_fds[0]); i++) {
+               ssize_t rc;
+
+               rc = sys_process_madvise(bad_fds[i], &iov, 1, MADV_DONTNEED, 0);
+               ASSERT_EQ(rc, -1);
+               ASSERT_EQ(errno, EBADF);
+       }
+}
+
+/*
+ * Pass a non-zero flags argument to process_madvise() and expect EINVAL.
+ * Only a flags value of 0 is supported at present; this test is kept for
+ * the future, e.g. in case synchronous flags are added.
+ */
+TEST_F(process_madvise, flag)
+{
+       const unsigned long len = self->page_size;
+       const unsigned int bogus_flag = 0x80000000;
+       struct iovec iov;
+       char *region;
+       ssize_t rc;
+
+       region = mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (region == MAP_FAILED)
+               SKIP(return, "mmap failed, not enough memory.\n");
+
+       iov.iov_base = region;
+       iov.iov_len = len;
+
+       rc = sys_process_madvise(self->pidfd, &iov, 1, MADV_DONTNEED, bogus_flag);
+       ASSERT_EQ(rc, -1);
+       ASSERT_EQ(errno, EINVAL);
+
+       /* Cleanup. */
+       ASSERT_EQ(munmap(region, len), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index a38c984103ce..471e539d82b8 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -65,6 +65,8 @@ separated by spaces:
        test pagemap_scan IOCTL
 - pfnmap
        tests for VM_PFNMAP handling
+- process_madv
+       test for process_madv
 - cow
        test copy-on-write semantics
 - thp
@@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions
 # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
 CATEGORY="madv_populate" run_test ./madv_populate
 
+# PROCESS_MADV test
+CATEGORY="process_madv" run_test ./process_madv
+
 CATEGORY="vma_merge" run_test ./merge
 
 if [ -x ./memfd_secret ]
-- 
2.43.0


Reply via email to