** Summary changed: - xfs large folio soft lockups + mm/folios: xfs hangs with hung task timeouts with corrupted folio pointer lists
** Tags added: sts ** Description changed: - Placeholder bug for xfs large folio softlockup issues. + [Impact] + + A long running, and incredibly difficult to reproduce large folio issue leads to + hung task timeouts in the xfs subsystem with the following stack trace: + + CPU: 0 PID: 226487 Comm: xfs_io Tainted: G L 6.5.0-41-generic #41~22.04.2-Ubuntu + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + RIP: 0010:xas_descend+0x25/0xd0 + Code: 90 90 90 90 90 55 48 89 e5 41 56 41 55 49 89 fd 41 54 49 89 f4 53 48 83 ec 08 0f b6 0e 48 8b 5f 08 80 f9 3f 0f 87 5d 2f 07 00 <48> d3 eb 83 e3 3f 89 d8 48 83 c0 04 49 8b 44 c4 08 4d 89 65 18 48 + RSP: 0018:ffffaf9b44927a68 EFLAGS: 00000293 + RAX: ffff8d61568f36d2 RBX: 00000000000005c0 RCX: 0000000000000006 + RDX: 0000000000000002 RSI: ffff8d61568f36d0 RDI: ffffaf9b44927b10 + RBP: ffffaf9b44927a90 R08: 0000000000000000 R09: 0000000000000000 + R10: ffff8d6159120938 R11: 0000000000000000 R12: ffff8d61568f36d0 + R13: ffffaf9b44927b10 R14: ffffaf9b44927e30 R15: ffffaf9b44927e08 + FS: 00007bcf4ce2c840(0000) GS:ffff8d61be400000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007bcf4ca80df8 CR3: 0000000008524005 CR4: 0000000000370ef0 + Call Trace: + <IRQ> + ? show_regs+0x6d/0x80 + ? watchdog_timer_fn+0x1d8/0x240 + ? __pfx_watchdog_timer_fn+0x10/0x10 + ? __hrtimer_run_queues+0x10f/0x2a0 + ? kvm_clock_get_cycles+0x18/0x40 + ? hrtimer_interrupt+0xf6/0x250 + ? __sysvec_apic_timer_interrupt+0x5f/0x140 + ? sysvec_apic_timer_interrupt+0x8d/0xd0 + </IRQ> + <TASK> + ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 + ? xas_descend+0x25/0xd0 + xas_load+0x4c/0x60 + __xas_next+0xa9/0x150 + filemap_get_read_batch+0x1a3/0x2e0 + filemap_get_pages+0xa9/0x3b0 + ? touch_atime+0x44/0x1c0 + filemap_read+0xe7/0x430 + generic_file_read_iter+0xbb/0x110 + ? down_read+0x12/0xc0 + xfs_file_buffered_read+0x57/0xe0 [xfs] + xfs_file_read_iter+0xb6/0x1c0 [xfs] + ? 
security_file_permission+0x5f/0x70 + vfs_read+0x20a/0x360 + __x64_sys_pread64+0xa6/0xd0 + x64_sys_call+0x1e01/0x20b0 + do_syscall_64+0x55/0x90 + ? do_syscall_64+0x61/0x90 + ? syscall_exit_to_user_mode+0x37/0x60 + ? do_syscall_64+0x61/0x90 + entry_SYSCALL_64_after_hwframe+0x73/0xdd + RIP: 0033:0x7bcf4cd1278f + Code: 08 89 3c 24 48 89 4c 24 18 e8 7d e2 f7 ff 4c 8b 54 24 18 48 8b 54 24 10 41 89 c0 48 8b 74 24 08 8b 3c 24 b8 11 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 04 24 e8 bd e2 f7 ff 48 8b + RSP: 002b:00007fff220ed560 EFLAGS: 00000293 ORIG_RAX: 0000000000000011 + RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007bcf4cd1278f + RDX: 0000000000010000 RSI: 0000623b5c5f2000 RDI: 0000000000000003 + RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 + R10: 00000000005c0000 R11: 0000000000000293 R12: 00000000005c0000 + R13: 000000001fa40000 R14: 00000000005c0000 R15: 0000000000000000 + </TASK> + watchdog: BUG: soft lockup - CPU#1 stuck for 417s! [xfs_io:226486] + + The transaction never recovers, and the system must be force restarted. Doing + this can lose data not yet written to disk. + + There is no workaround, other than to build your kernel disabling large folio + support for xfs. + + [Fix] + + The below patches fix the issue by more-or-less calling xas_reset() after + xas_split_alloc(), which ensures the folio pointer list doesn't get corrupted + if a race condition occurs. 
+ + commit a4864671ca0bf51c8e78242951741df52c06766f + Author: Kairui Song <kas...@tencent.com> + Date: Tue Apr 16 01:18:55 2024 +0800 + Subject: lib/xarray: introduce a new helper xas_get_order + Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a4864671ca0bf51c8e78242951741df52c06766f + + commit de60fd8ddeda2b41fbe11df11733838c5f684616 + Author: Kairui Song <kas...@tencent.com> + Date: Tue Apr 16 01:18:53 2024 +0800 + Subject: mm/filemap: return early if failed to allocate memory for split + Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de60fd8ddeda2b41fbe11df11733838c5f684616 + + commit 6758c1128ceb45d1a35298912b974eb4895b7dd9 + Author: Kairui Song <kas...@tencent.com> + Date: Tue Apr 16 01:18:56 2024 +0800 + Subject: mm/filemap: optimize filemap folio adding + Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6758c1128ceb45d1a35298912b974eb4895b7dd9 + + These all landed in 6.10-rc1. For Noble, we will use the versions from upstream + -stable 6.6.y directly from Greg KH. They contain minor backports from the + mainline variants, but cherry pick directly to 6.8 noble. + + [Testcase] + + You will need a disk attached to your VM, or a bare metal system with multiple + disks. 
+ + $ sudo lsblk + $ sudo mkfs.xfs /dev/vdc + + $ echo >> xfsfallout.bash << EOF + #!/bin/bash + sudo mount /dev/vdc /mnt + for x in {0..8}; do sudo fallocate -l100m /mnt/file${x}; sudo ./reader /mnt/file${x} & done + EOF + + $ echo >> reader.c << EOF + /* + * gcc -Wall -o reader reader.c -lpthread + */ + #define _GNU_SOURCE + + #include <stdio.h> + #include <stdlib.h> + #include <fcntl.h> + #include <sys/types.h> + #include <sys/stat.h> + #include <sys/mman.h> + #include <sys/sendfile.h> + #include <unistd.h> + #include <errno.h> + #include <err.h> + #include <pthread.h> + + struct thread_data { + int fd; + size_t size; + }; + + static void *drop_pages(void *arg) + { + struct thread_data *td = arg; + int ret; + unsigned long nr_pages = td->size / 4096; + unsigned int seed = 0x55443322; + off_t offset; + unsigned long nr_drops = 0; + + while (1) { + offset = rand_r(&seed) % nr_pages; + offset = offset * 4096; + ret = posix_fadvise(td->fd, offset, 4096, POSIX_FADV_DONTNEED); + if (ret < 0) + err(1, "fadvise dontneed"); + + /* every once and a while, drop everything */ + if (nr_drops > nr_pages / 2) { + ret = posix_fadvise(td->fd, 0, td->size, POSIX_FADV_DONTNEED); + if (ret < 0) + err(1, "fadvise dontneed"); + fprintf(stderr, "+"); + nr_drops = 0; + } + nr_drops++; + } + return NULL; + } + + #define READ_BUF (2 * 1024 * 1024) + static void *read_pages(void *arg) + { + struct thread_data *td = arg; + char buf[READ_BUF]; + ssize_t ret; + loff_t offset; + + while (1) { + offset = 0; + while(offset < td->size) { + ret = pread(td->fd, buf, READ_BUF, offset); + if (ret < 0) + err(1, "read"); + if (ret == 0) + break; + offset += ret; + } + } + return NULL; + } + + int main(int ac, char **av) + { + int fd; + int ret; + struct stat st; + struct thread_data td; + pthread_t drop_tid; + pthread_t drop2_tid; + pthread_t read_tid; + + if (ac != 2) + err(1, "usage: reader filename\n"); + + fd = open(av[1], O_RDONLY, 0600); + if (fd < 0) + err(1, "unable to open %s", av[1]); + + 
ret = fstat(fd, &st); + if (ret < 0) + err(1, "stat"); + + td.fd = fd; + td.size = st.st_size; + + ret = pthread_create(&drop_tid, NULL, drop_pages, &td); + if (ret) + err(1, "pthread_create"); + ret = pthread_create(&drop2_tid, NULL, drop_pages, &td); + if (ret) + err(1, "pthread_create"); + ret = pthread_create(&read_tid, NULL, read_pages, &td); + if (ret) + err(1, "pthread_create"); + + pthread_join(drop_tid, NULL); + pthread_join(drop2_tid, NULL); + pthread_join(read_tid, NULL); + } + EOF + + $ sudo apt install build-essential + $ gcc -Wall -o reader reader.c -lpthread + $ ./xfsfallout.bash + + The kernel should hang in approximately 5 minutes or less. + + There is a test kernel available in the following ppas: + + https://launchpad.net/~mruffell/+archive/ubuntu/lp2085495-test + + If you install this, running the testcase should not hang the kernel, even + running the testcase for hours on end. + + [Where problems could occur] + + [Other info] + + Very detailed upstream mailing list thread: + https://lore.kernel.org/linux-mm/20240913-ortsausgang-baustart-1dae9a18254d@brauner/T/ ** Description changed: + BugLink: https://bugs.launchpad.net/bugs/2085495 + [Impact] A long running, and incredibly difficult to reproduce large folio issue leads to hung task timeouts in the xfs subsystem with the following stack trace: CPU: 0 PID: 226487 Comm: xfs_io Tainted: G L 6.5.0-41-generic #41~22.04.2-Ubuntu Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:xas_descend+0x25/0xd0 Code: 90 90 90 90 90 55 48 89 e5 41 56 41 55 49 89 fd 41 54 49 89 f4 53 48 83 ec 08 0f b6 0e 48 8b 5f 08 80 f9 3f 0f 87 5d 2f 07 00 <48> d3 eb 83 e3 3f 89 d8 48 83 c0 04 49 8b 44 c4 08 4d 89 65 18 48 RSP: 0018:ffffaf9b44927a68 EFLAGS: 00000293 RAX: ffff8d61568f36d2 RBX: 00000000000005c0 RCX: 0000000000000006 RDX: 0000000000000002 RSI: ffff8d61568f36d0 RDI: ffffaf9b44927b10 RBP: ffffaf9b44927a90 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8d6159120938 R11: 
0000000000000000 R12: ffff8d61568f36d0 R13: ffffaf9b44927b10 R14: ffffaf9b44927e30 R15: ffffaf9b44927e08 FS: 00007bcf4ce2c840(0000) GS:ffff8d61be400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007bcf4ca80df8 CR3: 0000000008524005 CR4: 0000000000370ef0 Call Trace: - <IRQ> - ? show_regs+0x6d/0x80 - ? watchdog_timer_fn+0x1d8/0x240 - ? __pfx_watchdog_timer_fn+0x10/0x10 - ? __hrtimer_run_queues+0x10f/0x2a0 - ? kvm_clock_get_cycles+0x18/0x40 - ? hrtimer_interrupt+0xf6/0x250 - ? __sysvec_apic_timer_interrupt+0x5f/0x140 - ? sysvec_apic_timer_interrupt+0x8d/0xd0 - </IRQ> - <TASK> - ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 - ? xas_descend+0x25/0xd0 - xas_load+0x4c/0x60 - __xas_next+0xa9/0x150 - filemap_get_read_batch+0x1a3/0x2e0 - filemap_get_pages+0xa9/0x3b0 - ? touch_atime+0x44/0x1c0 - filemap_read+0xe7/0x430 - generic_file_read_iter+0xbb/0x110 - ? down_read+0x12/0xc0 - xfs_file_buffered_read+0x57/0xe0 [xfs] - xfs_file_read_iter+0xb6/0x1c0 [xfs] - ? security_file_permission+0x5f/0x70 - vfs_read+0x20a/0x360 - __x64_sys_pread64+0xa6/0xd0 - x64_sys_call+0x1e01/0x20b0 - do_syscall_64+0x55/0x90 - ? do_syscall_64+0x61/0x90 - ? syscall_exit_to_user_mode+0x37/0x60 - ? do_syscall_64+0x61/0x90 - entry_SYSCALL_64_after_hwframe+0x73/0xdd + <IRQ> + ? show_regs+0x6d/0x80 + ? watchdog_timer_fn+0x1d8/0x240 + ? __pfx_watchdog_timer_fn+0x10/0x10 + ? __hrtimer_run_queues+0x10f/0x2a0 + ? kvm_clock_get_cycles+0x18/0x40 + ? hrtimer_interrupt+0xf6/0x250 + ? __sysvec_apic_timer_interrupt+0x5f/0x140 + ? sysvec_apic_timer_interrupt+0x8d/0xd0 + </IRQ> + <TASK> + ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 + ? xas_descend+0x25/0xd0 + xas_load+0x4c/0x60 + __xas_next+0xa9/0x150 + filemap_get_read_batch+0x1a3/0x2e0 + filemap_get_pages+0xa9/0x3b0 + ? touch_atime+0x44/0x1c0 + filemap_read+0xe7/0x430 + generic_file_read_iter+0xbb/0x110 + ? down_read+0x12/0xc0 + xfs_file_buffered_read+0x57/0xe0 [xfs] + xfs_file_read_iter+0xb6/0x1c0 [xfs] + ? 
security_file_permission+0x5f/0x70 + vfs_read+0x20a/0x360 + __x64_sys_pread64+0xa6/0xd0 + x64_sys_call+0x1e01/0x20b0 + do_syscall_64+0x55/0x90 + ? do_syscall_64+0x61/0x90 + ? syscall_exit_to_user_mode+0x37/0x60 + ? do_syscall_64+0x61/0x90 + entry_SYSCALL_64_after_hwframe+0x73/0xdd RIP: 0033:0x7bcf4cd1278f Code: 08 89 3c 24 48 89 4c 24 18 e8 7d e2 f7 ff 4c 8b 54 24 18 48 8b 54 24 10 41 89 c0 48 8b 74 24 08 8b 3c 24 b8 11 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 04 24 e8 bd e2 f7 ff 48 8b RSP: 002b:00007fff220ed560 EFLAGS: 00000293 ORIG_RAX: 0000000000000011 RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007bcf4cd1278f RDX: 0000000000010000 RSI: 0000623b5c5f2000 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000005c0000 R11: 0000000000000293 R12: 00000000005c0000 R13: 000000001fa40000 R14: 00000000005c0000 R15: 0000000000000000 - </TASK> + </TASK> watchdog: BUG: soft lockup - CPU#1 stuck for 417s! [xfs_io:226486] The transaction never recovers, and the system must be force restarted. Doing this can lose data not yet written to disk. There is no workaround, other than to build your kernel disabling large folio support for xfs. [Fix] - The below patches fix the issue by more-or-less calling xas_reset() after + The below patches fix the issue by more-or-less calling xas_reset() after xas_split_alloc(), which ensures the folio pointer list doesn't get corrupted if a race condition occurs. 
commit a4864671ca0bf51c8e78242951741df52c06766f Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:55 2024 +0800 Subject: lib/xarray: introduce a new helper xas_get_order Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a4864671ca0bf51c8e78242951741df52c06766f commit de60fd8ddeda2b41fbe11df11733838c5f684616 Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:53 2024 +0800 Subject: mm/filemap: return early if failed to allocate memory for split Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de60fd8ddeda2b41fbe11df11733838c5f684616 commit 6758c1128ceb45d1a35298912b974eb4895b7dd9 Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:56 2024 +0800 Subject: mm/filemap: optimize filemap folio adding Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6758c1128ceb45d1a35298912b974eb4895b7dd9 These all landed in 6.10-rc1. For Noble, we will use the versions from upstream -stable 6.6.y directly from Greg KH. They contain minor backports from the mainline variants, but cherry pick directly to 6.8 noble. [Testcase] You will need a disk attached to your VM, or a bare metal system with multiple disks. 
$ sudo lsblk $ sudo mkfs.xfs /dev/vdc $ echo >> xfsfallout.bash << EOF #!/bin/bash sudo mount /dev/vdc /mnt for x in {0..8}; do sudo fallocate -l100m /mnt/file${x}; sudo ./reader /mnt/file${x} & done EOF $ echo >> reader.c << EOF /* - * gcc -Wall -o reader reader.c -lpthread - */ + * gcc -Wall -o reader reader.c -lpthread + */ #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include <sys/sendfile.h> #include <unistd.h> #include <errno.h> #include <err.h> #include <pthread.h> struct thread_data { - int fd; - size_t size; + int fd; + size_t size; }; static void *drop_pages(void *arg) { - struct thread_data *td = arg; - int ret; - unsigned long nr_pages = td->size / 4096; - unsigned int seed = 0x55443322; - off_t offset; - unsigned long nr_drops = 0; - - while (1) { - offset = rand_r(&seed) % nr_pages; - offset = offset * 4096; - ret = posix_fadvise(td->fd, offset, 4096, POSIX_FADV_DONTNEED); - if (ret < 0) - err(1, "fadvise dontneed"); - - /* every once and a while, drop everything */ - if (nr_drops > nr_pages / 2) { - ret = posix_fadvise(td->fd, 0, td->size, POSIX_FADV_DONTNEED); - if (ret < 0) - err(1, "fadvise dontneed"); - fprintf(stderr, "+"); - nr_drops = 0; - } - nr_drops++; - } - return NULL; + struct thread_data *td = arg; + int ret; + unsigned long nr_pages = td->size / 4096; + unsigned int seed = 0x55443322; + off_t offset; + unsigned long nr_drops = 0; + + while (1) { + offset = rand_r(&seed) % nr_pages; + offset = offset * 4096; + ret = posix_fadvise(td->fd, offset, 4096, POSIX_FADV_DONTNEED); + if (ret < 0) + err(1, "fadvise dontneed"); + + /* every once and a while, drop everything */ + if (nr_drops > nr_pages / 2) { + ret = posix_fadvise(td->fd, 0, td->size, POSIX_FADV_DONTNEED); + if (ret < 0) + err(1, "fadvise dontneed"); + fprintf(stderr, "+"); + nr_drops = 0; + } + nr_drops++; + } + return NULL; } #define READ_BUF (2 * 1024 * 1024) static void 
*read_pages(void *arg) { - struct thread_data *td = arg; - char buf[READ_BUF]; - ssize_t ret; - loff_t offset; - - while (1) { - offset = 0; - while(offset < td->size) { - ret = pread(td->fd, buf, READ_BUF, offset); - if (ret < 0) - err(1, "read"); - if (ret == 0) - break; - offset += ret; - } - } - return NULL; + struct thread_data *td = arg; + char buf[READ_BUF]; + ssize_t ret; + loff_t offset; + + while (1) { + offset = 0; + while(offset < td->size) { + ret = pread(td->fd, buf, READ_BUF, offset); + if (ret < 0) + err(1, "read"); + if (ret == 0) + break; + offset += ret; + } + } + return NULL; } int main(int ac, char **av) { - int fd; - int ret; - struct stat st; - struct thread_data td; - pthread_t drop_tid; - pthread_t drop2_tid; - pthread_t read_tid; - - if (ac != 2) - err(1, "usage: reader filename\n"); - - fd = open(av[1], O_RDONLY, 0600); - if (fd < 0) - err(1, "unable to open %s", av[1]); - - ret = fstat(fd, &st); - if (ret < 0) - err(1, "stat"); - - td.fd = fd; - td.size = st.st_size; - - ret = pthread_create(&drop_tid, NULL, drop_pages, &td); - if (ret) - err(1, "pthread_create"); - ret = pthread_create(&drop2_tid, NULL, drop_pages, &td); - if (ret) - err(1, "pthread_create"); - ret = pthread_create(&read_tid, NULL, read_pages, &td); - if (ret) - err(1, "pthread_create"); - - pthread_join(drop_tid, NULL); - pthread_join(drop2_tid, NULL); - pthread_join(read_tid, NULL); + int fd; + int ret; + struct stat st; + struct thread_data td; + pthread_t drop_tid; + pthread_t drop2_tid; + pthread_t read_tid; + + if (ac != 2) + err(1, "usage: reader filename\n"); + + fd = open(av[1], O_RDONLY, 0600); + if (fd < 0) + err(1, "unable to open %s", av[1]); + + ret = fstat(fd, &st); + if (ret < 0) + err(1, "stat"); + + td.fd = fd; + td.size = st.st_size; + + ret = pthread_create(&drop_tid, NULL, drop_pages, &td); + if (ret) + err(1, "pthread_create"); + ret = pthread_create(&drop2_tid, NULL, drop_pages, &td); + if (ret) + err(1, "pthread_create"); + ret = 
pthread_create(&read_tid, NULL, read_pages, &td); + if (ret) + err(1, "pthread_create"); + + pthread_join(drop_tid, NULL); + pthread_join(drop2_tid, NULL); + pthread_join(read_tid, NULL); } EOF $ sudo apt install build-essential $ gcc -Wall -o reader reader.c -lpthread $ ./xfsfallout.bash The kernel should hang in approximately 5 minutes or less. There is a test kernel available in the following ppas: https://launchpad.net/~mruffell/+archive/ubuntu/lp2085495-test If you install this, running the testcase should not hang the kernel, even running the testcase for hours on end. [Where problems could occur] [Other info] Very detailed upstream mailing list thread: https://lore.kernel.org/linux-mm/20240913-ortsausgang-baustart-1dae9a18254d@brauner/T/ ** Description changed: BugLink: https://bugs.launchpad.net/bugs/2085495 [Impact] A long running, and incredibly difficult to reproduce large folio issue leads to hung task timeouts in the xfs subsystem with the following stack trace: CPU: 0 PID: 226487 Comm: xfs_io Tainted: G L 6.5.0-41-generic #41~22.04.2-Ubuntu Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:xas_descend+0x25/0xd0 Code: 90 90 90 90 90 55 48 89 e5 41 56 41 55 49 89 fd 41 54 49 89 f4 53 48 83 ec 08 0f b6 0e 48 8b 5f 08 80 f9 3f 0f 87 5d 2f 07 00 <48> d3 eb 83 e3 3f 89 d8 48 83 c0 04 49 8b 44 c4 08 4d 89 65 18 48 RSP: 0018:ffffaf9b44927a68 EFLAGS: 00000293 RAX: ffff8d61568f36d2 RBX: 00000000000005c0 RCX: 0000000000000006 RDX: 0000000000000002 RSI: ffff8d61568f36d0 RDI: ffffaf9b44927b10 RBP: ffffaf9b44927a90 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8d6159120938 R11: 0000000000000000 R12: ffff8d61568f36d0 R13: ffffaf9b44927b10 R14: ffffaf9b44927e30 R15: ffffaf9b44927e08 FS: 00007bcf4ce2c840(0000) GS:ffff8d61be400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007bcf4ca80df8 CR3: 0000000008524005 CR4: 0000000000370ef0 Call Trace: <IRQ> ? show_regs+0x6d/0x80 ? 
watchdog_timer_fn+0x1d8/0x240 ? __pfx_watchdog_timer_fn+0x10/0x10 ? __hrtimer_run_queues+0x10f/0x2a0 ? kvm_clock_get_cycles+0x18/0x40 ? hrtimer_interrupt+0xf6/0x250 ? __sysvec_apic_timer_interrupt+0x5f/0x140 ? sysvec_apic_timer_interrupt+0x8d/0xd0 </IRQ> <TASK> ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 ? xas_descend+0x25/0xd0 xas_load+0x4c/0x60 __xas_next+0xa9/0x150 filemap_get_read_batch+0x1a3/0x2e0 filemap_get_pages+0xa9/0x3b0 ? touch_atime+0x44/0x1c0 filemap_read+0xe7/0x430 generic_file_read_iter+0xbb/0x110 ? down_read+0x12/0xc0 xfs_file_buffered_read+0x57/0xe0 [xfs] xfs_file_read_iter+0xb6/0x1c0 [xfs] ? security_file_permission+0x5f/0x70 vfs_read+0x20a/0x360 __x64_sys_pread64+0xa6/0xd0 x64_sys_call+0x1e01/0x20b0 do_syscall_64+0x55/0x90 ? do_syscall_64+0x61/0x90 ? syscall_exit_to_user_mode+0x37/0x60 ? do_syscall_64+0x61/0x90 entry_SYSCALL_64_after_hwframe+0x73/0xdd RIP: 0033:0x7bcf4cd1278f Code: 08 89 3c 24 48 89 4c 24 18 e8 7d e2 f7 ff 4c 8b 54 24 18 48 8b 54 24 10 41 89 c0 48 8b 74 24 08 8b 3c 24 b8 11 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 04 24 e8 bd e2 f7 ff 48 8b RSP: 002b:00007fff220ed560 EFLAGS: 00000293 ORIG_RAX: 0000000000000011 RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007bcf4cd1278f RDX: 0000000000010000 RSI: 0000623b5c5f2000 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000005c0000 R11: 0000000000000293 R12: 00000000005c0000 R13: 000000001fa40000 R14: 00000000005c0000 R15: 0000000000000000 </TASK> watchdog: BUG: soft lockup - CPU#1 stuck for 417s! [xfs_io:226486] The transaction never recovers, and the system must be force restarted. Doing this can lose data not yet written to disk. There is no workaround, other than to build your kernel disabling large folio support for xfs. [Fix] The below patches fix the issue by more-or-less calling xas_reset() after xas_split_alloc(), which ensures the folio pointer list doesn't get corrupted if a race condition occurs. 
commit a4864671ca0bf51c8e78242951741df52c06766f Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:55 2024 +0800 Subject: lib/xarray: introduce a new helper xas_get_order Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a4864671ca0bf51c8e78242951741df52c06766f commit de60fd8ddeda2b41fbe11df11733838c5f684616 Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:53 2024 +0800 Subject: mm/filemap: return early if failed to allocate memory for split Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de60fd8ddeda2b41fbe11df11733838c5f684616 commit 6758c1128ceb45d1a35298912b974eb4895b7dd9 Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:56 2024 +0800 Subject: mm/filemap: optimize filemap folio adding Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6758c1128ceb45d1a35298912b974eb4895b7dd9 These all landed in 6.10-rc1. For Noble, we will use the versions from upstream -stable 6.6.y directly from Greg KH. They contain minor backports from the mainline variants, but cherry pick directly to 6.8 noble. + Only 6.1 or later is affected, so only noble needs the patches. + [Testcase] You will need a disk attached to your VM, or a bare metal system with multiple disks. 
$ sudo lsblk $ sudo mkfs.xfs /dev/vdc $ echo >> xfsfallout.bash << EOF #!/bin/bash sudo mount /dev/vdc /mnt for x in {0..8}; do sudo fallocate -l100m /mnt/file${x}; sudo ./reader /mnt/file${x} & done EOF $ echo >> reader.c << EOF /* * gcc -Wall -o reader reader.c -lpthread */ #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include <sys/sendfile.h> #include <unistd.h> #include <errno.h> #include <err.h> #include <pthread.h> struct thread_data { int fd; size_t size; }; static void *drop_pages(void *arg) { struct thread_data *td = arg; int ret; unsigned long nr_pages = td->size / 4096; unsigned int seed = 0x55443322; off_t offset; unsigned long nr_drops = 0; while (1) { offset = rand_r(&seed) % nr_pages; offset = offset * 4096; ret = posix_fadvise(td->fd, offset, 4096, POSIX_FADV_DONTNEED); if (ret < 0) err(1, "fadvise dontneed"); /* every once and a while, drop everything */ if (nr_drops > nr_pages / 2) { ret = posix_fadvise(td->fd, 0, td->size, POSIX_FADV_DONTNEED); if (ret < 0) err(1, "fadvise dontneed"); fprintf(stderr, "+"); nr_drops = 0; } nr_drops++; } return NULL; } #define READ_BUF (2 * 1024 * 1024) static void *read_pages(void *arg) { struct thread_data *td = arg; char buf[READ_BUF]; ssize_t ret; loff_t offset; while (1) { offset = 0; while(offset < td->size) { ret = pread(td->fd, buf, READ_BUF, offset); if (ret < 0) err(1, "read"); if (ret == 0) break; offset += ret; } } return NULL; } int main(int ac, char **av) { int fd; int ret; struct stat st; struct thread_data td; pthread_t drop_tid; pthread_t drop2_tid; pthread_t read_tid; if (ac != 2) err(1, "usage: reader filename\n"); fd = open(av[1], O_RDONLY, 0600); if (fd < 0) err(1, "unable to open %s", av[1]); ret = fstat(fd, &st); if (ret < 0) err(1, "stat"); td.fd = fd; td.size = st.st_size; ret = pthread_create(&drop_tid, NULL, drop_pages, &td); if (ret) err(1, "pthread_create"); ret = 
pthread_create(&drop2_tid, NULL, drop_pages, &td); if (ret) err(1, "pthread_create"); ret = pthread_create(&read_tid, NULL, read_pages, &td); if (ret) err(1, "pthread_create"); pthread_join(drop_tid, NULL); pthread_join(drop2_tid, NULL); pthread_join(read_tid, NULL); } EOF $ sudo apt install build-essential $ gcc -Wall -o reader reader.c -lpthread $ ./xfsfallout.bash The kernel should hang in approximately 5 minutes or less. There is a test kernel available in the following ppas: https://launchpad.net/~mruffell/+archive/ubuntu/lp2085495-test If you install this, running the testcase should not hang the kernel, even running the testcase for hours on end. [Where problems could occur] [Other info] Very detailed upstream mailing list thread: https://lore.kernel.org/linux-mm/20240913-ortsausgang-baustart-1dae9a18254d@brauner/T/ -- You received this bug notification because you are a member of Kernel Packages, which is subscribed to linux in Ubuntu. https://bugs.launchpad.net/bugs/2085495 Title: mm/folios: xfs hangs with hung task timeouts with corrupted folio pointer lists Status in linux package in Ubuntu: Fix Released Status in linux source package in Noble: In Progress Bug description: BugLink: https://bugs.launchpad.net/bugs/2085495 [Impact] A long running, and incredibly difficult to reproduce large folio issue leads to hung task timeouts in the xfs subsystem with the following stack trace: CPU: 0 PID: 226487 Comm: xfs_io Tainted: G L 6.5.0-41-generic #41~22.04.2-Ubuntu Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:xas_descend+0x25/0xd0 Code: 90 90 90 90 90 55 48 89 e5 41 56 41 55 49 89 fd 41 54 49 89 f4 53 48 83 ec 08 0f b6 0e 48 8b 5f 08 80 f9 3f 0f 87 5d 2f 07 00 <48> d3 eb 83 e3 3f 89 d8 48 83 c0 04 49 8b 44 c4 08 4d 89 65 18 48 RSP: 0018:ffffaf9b44927a68 EFLAGS: 00000293 RAX: ffff8d61568f36d2 RBX: 00000000000005c0 RCX: 0000000000000006 RDX: 0000000000000002 RSI: ffff8d61568f36d0 RDI: ffffaf9b44927b10 RBP: ffffaf9b44927a90 
R08: 0000000000000000 R09: 0000000000000000 R10: ffff8d6159120938 R11: 0000000000000000 R12: ffff8d61568f36d0 R13: ffffaf9b44927b10 R14: ffffaf9b44927e30 R15: ffffaf9b44927e08 FS: 00007bcf4ce2c840(0000) GS:ffff8d61be400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007bcf4ca80df8 CR3: 0000000008524005 CR4: 0000000000370ef0 Call Trace: <IRQ> ? show_regs+0x6d/0x80 ? watchdog_timer_fn+0x1d8/0x240 ? __pfx_watchdog_timer_fn+0x10/0x10 ? __hrtimer_run_queues+0x10f/0x2a0 ? kvm_clock_get_cycles+0x18/0x40 ? hrtimer_interrupt+0xf6/0x250 ? __sysvec_apic_timer_interrupt+0x5f/0x140 ? sysvec_apic_timer_interrupt+0x8d/0xd0 </IRQ> <TASK> ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 ? xas_descend+0x25/0xd0 xas_load+0x4c/0x60 __xas_next+0xa9/0x150 filemap_get_read_batch+0x1a3/0x2e0 filemap_get_pages+0xa9/0x3b0 ? touch_atime+0x44/0x1c0 filemap_read+0xe7/0x430 generic_file_read_iter+0xbb/0x110 ? down_read+0x12/0xc0 xfs_file_buffered_read+0x57/0xe0 [xfs] xfs_file_read_iter+0xb6/0x1c0 [xfs] ? security_file_permission+0x5f/0x70 vfs_read+0x20a/0x360 __x64_sys_pread64+0xa6/0xd0 x64_sys_call+0x1e01/0x20b0 do_syscall_64+0x55/0x90 ? do_syscall_64+0x61/0x90 ? syscall_exit_to_user_mode+0x37/0x60 ? do_syscall_64+0x61/0x90 entry_SYSCALL_64_after_hwframe+0x73/0xdd RIP: 0033:0x7bcf4cd1278f Code: 08 89 3c 24 48 89 4c 24 18 e8 7d e2 f7 ff 4c 8b 54 24 18 48 8b 54 24 10 41 89 c0 48 8b 74 24 08 8b 3c 24 b8 11 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 04 24 e8 bd e2 f7 ff 48 8b RSP: 002b:00007fff220ed560 EFLAGS: 00000293 ORIG_RAX: 0000000000000011 RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007bcf4cd1278f RDX: 0000000000010000 RSI: 0000623b5c5f2000 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000005c0000 R11: 0000000000000293 R12: 00000000005c0000 R13: 000000001fa40000 R14: 00000000005c0000 R15: 0000000000000000 </TASK> watchdog: BUG: soft lockup - CPU#1 stuck for 417s! 
[xfs_io:226486] The transaction never recovers, and the system must be force restarted. Doing this can lose data not yet written to disk. There is no workaround, other than to build your kernel disabling large folio support for xfs. [Fix] The below patches fix the issue by more-or-less calling xas_reset() after xas_split_alloc(), which ensures the folio pointer list doesn't get corrupted if a race condition occurs. commit a4864671ca0bf51c8e78242951741df52c06766f Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:55 2024 +0800 Subject: lib/xarray: introduce a new helper xas_get_order Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a4864671ca0bf51c8e78242951741df52c06766f commit de60fd8ddeda2b41fbe11df11733838c5f684616 Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:53 2024 +0800 Subject: mm/filemap: return early if failed to allocate memory for split Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de60fd8ddeda2b41fbe11df11733838c5f684616 commit 6758c1128ceb45d1a35298912b974eb4895b7dd9 Author: Kairui Song <kas...@tencent.com> Date: Tue Apr 16 01:18:56 2024 +0800 Subject: mm/filemap: optimize filemap folio adding Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6758c1128ceb45d1a35298912b974eb4895b7dd9 These all landed in 6.10-rc1. For Noble, we will use the versions from upstream -stable 6.6.y directly from Greg KH. They contain minor backports from the mainline variants, but cherry pick directly to 6.8 noble. Only 6.1 or later is affected, so only noble needs the patches. [Testcase] You will need a disk attached to your VM, or a bare metal system with multiple disks. 
$ sudo lsblk $ sudo mkfs.xfs /dev/vdc $ cat >> xfsfallout.bash << 'EOF' #!/bin/bash sudo mount /dev/vdc /mnt for x in {0..8}; do sudo fallocate -l100m /mnt/file${x}; sudo ./reader /mnt/file${x} & done EOF $ cat >> reader.c << EOF /* * gcc -Wall -o reader reader.c -lpthread */ #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include <sys/sendfile.h> #include <unistd.h> #include <errno.h> #include <err.h> #include <pthread.h> struct thread_data { int fd; size_t size; }; static void *drop_pages(void *arg) { struct thread_data *td = arg; int ret; unsigned long nr_pages = td->size / 4096; unsigned int seed = 0x55443322; off_t offset; unsigned long nr_drops = 0; while (1) { offset = rand_r(&seed) % nr_pages; offset = offset * 4096; ret = posix_fadvise(td->fd, offset, 4096, POSIX_FADV_DONTNEED); if (ret < 0) err(1, "fadvise dontneed"); /* every once in a while, drop everything */ if (nr_drops > nr_pages / 2) { ret = posix_fadvise(td->fd, 0, td->size, POSIX_FADV_DONTNEED); if (ret < 0) err(1, "fadvise dontneed"); fprintf(stderr, "+"); nr_drops = 0; } nr_drops++; } return NULL; } #define READ_BUF (2 * 1024 * 1024) static void *read_pages(void *arg) { struct thread_data *td = arg; char buf[READ_BUF]; ssize_t ret; loff_t offset; while (1) { offset = 0; while(offset < td->size) { ret = pread(td->fd, buf, READ_BUF, offset); if (ret < 0) err(1, "read"); if (ret == 0) break; offset += ret; } } return NULL; } int main(int ac, char **av) { int fd; int ret; struct stat st; struct thread_data td; pthread_t drop_tid; pthread_t drop2_tid; pthread_t read_tid; if (ac != 2) err(1, "usage: reader filename\n"); fd = open(av[1], O_RDONLY, 0600); if (fd < 0) err(1, "unable to open %s", av[1]); ret = fstat(fd, &st); if (ret < 0) err(1, "stat"); td.fd = fd; td.size = st.st_size; ret = pthread_create(&drop_tid, NULL, drop_pages, &td); if (ret) err(1, "pthread_create"); ret = pthread_create(&drop2_tid,
NULL, drop_pages, &td); if (ret) err(1, "pthread_create"); ret = pthread_create(&read_tid, NULL, read_pages, &td); if (ret) err(1, "pthread_create"); pthread_join(drop_tid, NULL); pthread_join(drop2_tid, NULL); pthread_join(read_tid, NULL); } EOF $ sudo apt install build-essential $ gcc -Wall -o reader reader.c -lpthread $ chmod +x xfsfallout.bash $ ./xfsfallout.bash The kernel should hang in approximately 5 minutes or less. There is a test kernel available in the following PPA: https://launchpad.net/~mruffell/+archive/ubuntu/lp2085495-test If you install this, running the testcase should not hang the kernel, even when running the testcase for hours on end. [Where problems could occur] [Other info] Very detailed upstream mailing list thread: https://lore.kernel.org/linux-mm/20240913-ortsausgang-baustart-1dae9a18254d@brauner/T/ To manage notifications about this bug go to: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2085495/+subscriptions -- Mailing list: https://launchpad.net/~kernel-packages Post to : kernel-packages@lists.launchpad.net Unsubscribe : https://launchpad.net/~kernel-packages More help : https://help.launchpad.net/ListHelp