On 08/12/2016 09:16 AM, Oleg Nesterov wrote:
> Please drop two patches I sent before and try the new one below.

Hello Oleg,

Thanks for the patch. In addition to your patch I also applied the
attached two patches before I started testing. It took some time
before I could reproduce the hang in truncate_inode_pages_range().
To my surprise the following appeared in the system log instead of
a list of waiting tasks when I succeeded to reproduce this hang:

Aug 12 14:48:06 ion-dev-ib-ini systemd-udevd[500]: seq 11210 
'/devices/virtual/block/dm-0' is taking a long time
Aug 12 14:48:07 ion-dev-ib-ini systemd-udevd[500]: seq 11227 
'/devices/virtual/block/dm-1' is taking a long time
Aug 12 14:50:06 ion-dev-ib-ini systemd-udevd[500]: seq 11210 
'/devices/virtual/block/dm-0' killed
Aug 12 14:50:06 ion-dev-ib-ini kernel: do_generic_file_read / pid 17232: killed
Aug 12 14:50:06 ion-dev-ib-ini systemd[1]: Started Cleanup of Temporary 
Directories.
Aug 12 14:50:36 ion-dev-ib-ini kernel: __lock_page_impl / pid 17224 / m 0x2: 
timeout - continuing to wait for 17224
Aug 12 14:50:36 ion-dev-ib-ini kernel: __lock_page_impl / pid 17232 / m 0x2: 
timeout - continuing to wait for 17232
Aug 12 14:51:06 ion-dev-ib-ini kernel: __lock_page_impl / pid 17224 / m 0x2: 
timeout - continuing to wait for 17224
Aug 12 14:51:06 ion-dev-ib-ini kernel: __lock_page_impl / pid 17232 / m 0x2: 
timeout - continuing to wait for 17232
[ ... ]

Running echo w > /proc/sysrq-trigger showed me that both pid 17224 and
17232 were hanging in truncate_inode_pages_range(). Does this mean that
some code in mm or in the filesystem I was using for this test (ext4) does
not unlock all pages it should unlock if a fatal signal is received?

Please let me know if you would like me to repost this message on an
mm-related mailing list.

Thanks,

Bart.

The echo w > /proc/sysrq-trigger output:

sysrq: SysRq : Show Blocked State
  task                        PC stack   pid father
systemd-udevd   D ffff88039870b7e8     0 17224    500 0x00000006
Call Trace:
 [<ffffffff816219f7>] schedule+0x37/0x90
 [<ffffffff81626019>] schedule_timeout+0x249/0x470
 [<ffffffff81620dcf>] io_schedule_timeout+0x9f/0x110
 [<ffffffff81622204>] bit_wait_io_timeout+0x24/0x70
 [<ffffffff81621f89>] __wait_on_bit_lock+0x49/0xa0
 [<ffffffff81152be5>] __lock_page_impl+0xe5/0x160
 [<ffffffff81152c6e>] __lock_page+0xe/0x10
 [<ffffffff811666a6>] truncate_inode_pages_range+0x416/0x7c0
 [<ffffffff81166a60>] truncate_inode_pages+0x10/0x20
 [<ffffffff81214200>] kill_bdev+0x30/0x40
 [<ffffffff81215521>] __blkdev_put+0x71/0x360
 [<ffffffff81215859>] blkdev_put+0x49/0x170
 [<ffffffff812159a0>] blkdev_close+0x20/0x30
 [<ffffffff811d6058>] __fput+0xe8/0x1f0
 [<ffffffff811d6199>] ____fput+0x9/0x10
 [<ffffffff81084453>] task_work_run+0x83/0xb0
 [<ffffffff810661ee>] do_exit+0x3ee/0xc40
 [<ffffffff81066acb>] do_group_exit+0x4b/0xc0
 [<ffffffff81073f1a>] get_signal+0x2ca/0x940
 [<ffffffff8101bf43>] do_signal+0x23/0x660
 [<ffffffff810022b3>] exit_to_usermode_loop+0x73/0xb0
 [<ffffffff81002cb0>] syscall_return_slowpath+0xb0/0xc0
 [<ffffffff816274b3>] entry_SYSCALL_64_fastpath+0xa6/0xa8
systemd-udevd   D ffff88006ce6f7e8     0 17232    500 0x00000006
Call Trace:
 [<ffffffff816219f7>] schedule+0x37/0x90
 [<ffffffff81626019>] schedule_timeout+0x249/0x470
 [<ffffffff81620dcf>] io_schedule_timeout+0x9f/0x110
 [<ffffffff81622204>] bit_wait_io_timeout+0x24/0x70
 [<ffffffff81621f89>] __wait_on_bit_lock+0x49/0xa0
 [<ffffffff81152be5>] __lock_page_impl+0xe5/0x160
 [<ffffffff81152c6e>] __lock_page+0xe/0x10
 [<ffffffff811666a6>] truncate_inode_pages_range+0x416/0x7c0
 [<ffffffff81166a60>] truncate_inode_pages+0x10/0x20
 [<ffffffff81214200>] kill_bdev+0x30/0x40
 [<ffffffff81215521>] __blkdev_put+0x71/0x360
 [<ffffffff81215859>] blkdev_put+0x49/0x170
 [<ffffffff812159a0>] blkdev_close+0x20/0x30
 [<ffffffff811d6058>] __fput+0xe8/0x1f0
 [<ffffffff811d6199>] ____fput+0x9/0x10
 [<ffffffff81084453>] task_work_run+0x83/0xb0
 [<ffffffff810661ee>] do_exit+0x3ee/0xc40
 [<ffffffff81066acb>] do_group_exit+0x4b/0xc0
 [<ffffffff81073f1a>] get_signal+0x2ca/0x940
 [<ffffffff8101bf43>] do_signal+0x23/0x660
 [<ffffffff810022b3>] exit_to_usermode_loop+0x73/0xb0
 [<ffffffff81002cb0>] syscall_return_slowpath+0xb0/0xc0
 [<ffffffff816274b3>] entry_SYSCALL_64_fastpath+0xa6/0xa8

From af1cda43467c7fe2a6c76b11a6c25fcbec424ce3 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanass...@sandisk.com>
Date: Thu, 11 Aug 2016 16:38:32 -0700
Subject: [PATCH] mm: __lock_page() dbg

---
 include/linux/mm_types.h |  3 +++
 include/linux/pagemap.h  | 22 ++++++++++++++++++++--
 mm/filemap.c             | 44 ++++++++++++++++++++++++++++++++------------
 mm/ksm.c                 |  1 +
 mm/migrate.c             |  1 +
 mm/shmem.c               |  1 +
 mm/swap_state.c          |  2 ++
 mm/vmscan.c              |  1 +
 8 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ca3e517..59fdfeb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -23,6 +23,7 @@
 
 struct address_space;
 struct mem_cgroup;
+struct task_struct;
 
 #define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
 #define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
@@ -220,6 +221,8 @@ struct page {
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 	int _last_cpupid;
 #endif
+
+	struct task_struct *owner;
 }
 /*
  * The struct page can be forced to be double word aligned so that atomic ops
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 9735410..d332674 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -419,10 +419,25 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
 extern void unlock_page(struct page *page);
 
+static inline struct task_struct *get_page_lock_owner(struct page *page)
+{
+	return page->owner;
+}
+
+static inline void set_page_lock_owner(struct page *page, struct task_struct *t)
+{
+	page->owner = t;
+}
+
 static inline int trylock_page(struct page *page)
 {
+	int res;
+
 	page = compound_head(page);
-	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
+	res = !test_and_set_bit_lock(PG_locked, &page->flags);
+	if (likely(res))
+		set_page_lock_owner(page, current);
+	return res;
 }
 
 /*
@@ -641,9 +656,12 @@ static inline int add_to_page_cache(struct page *page,
 	int error;
 
 	__SetPageLocked(page);
+	set_page_lock_owner(page, current);
 	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
-	if (unlikely(error))
+	if (unlikely(error)) {
+		set_page_lock_owner(page, NULL);
 		__ClearPageLocked(page);
+	}
 	return error;
 }
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 530e75a..0ad8bf6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -699,11 +699,13 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 	int ret;
 
 	__SetPageLocked(page);
+	set_page_lock_owner(page, current);
 	ret = __add_to_page_cache_locked(page, mapping, offset,
 					 gfp_mask, &shadow);
-	if (unlikely(ret))
+	if (unlikely(ret)) {
+		set_page_lock_owner(page, NULL);
 		__ClearPageLocked(page);
-	else {
+	} else {
 		/*
 		 * The page might have been evicted from cache only
 		 * recently, in which case it should be activated like
@@ -831,6 +833,7 @@ void unlock_page(struct page *page)
 {
 	page = compound_head(page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	set_page_lock_owner(page, NULL);
 	clear_bit_unlock(PG_locked, &page->flags);
 	smp_mb__after_atomic();
 	wake_up_page(page, PG_locked);
@@ -925,27 +928,44 @@ void page_endio(struct page *page, int rw, int err)
 }
 EXPORT_SYMBOL_GPL(page_endio);
 
+int __lock_page_impl(struct page *page, int mode)
+{
+	struct page *page_head = compound_head(page);
+	DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
+	struct task_struct *owner;
+	int res;
+
+	for (;;) {
+		wait.key.timeout = jiffies + 30 * HZ;
+		res = __wait_on_bit_lock(page_waitqueue(page_head),
+					 &wait, bit_wait_io_timeout, mode);
+		if (res == 0) {
+			set_page_lock_owner(page, current);
+			break;
+		}
+		if (res == -EINTR)
+			break;
+		owner = get_page_lock_owner(page);
+		pr_info("%s / pid %d / m %#x: %s - continuing to wait for %d\n",
+			__func__, task_pid_nr(current), mode, res == -EAGAIN ?
+			"timeout" : "interrupted",
+			owner ? task_pid_nr(owner) : 0);
+	}
+	return res;
+}
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
  */
 void __lock_page(struct page *page)
 {
-	struct page *page_head = compound_head(page);
-	DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
-	__wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
-							TASK_UNINTERRUPTIBLE);
+	__lock_page_impl(page, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
 
 int __lock_page_killable(struct page *page)
 {
-	struct page *page_head = compound_head(page);
-	DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
-	return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
-					bit_wait_io, TASK_KILLABLE);
+	return __lock_page_impl(page, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 4786b41..20ca878 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1880,6 +1880,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 		SetPageDirty(new_page);
 		__SetPageUptodate(new_page);
 		__SetPageLocked(new_page);
+		set_page_lock_owner(new_page, current);
 	}
 
 	return new_page;
diff --git a/mm/migrate.c b/mm/migrate.c
index bd3fdc2..50e5bc1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1794,6 +1794,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 	/* Prepare a page as a migration target */
 	__SetPageLocked(new_page);
+	set_page_lock_owner(new_page, current);
 	__SetPageSwapBacked(new_page);
 
 	/* anon mapping, we can simply copy page->mapping to the new page: */
diff --git a/mm/shmem.c b/mm/shmem.c
index 171dee7..0af6bf7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1021,6 +1021,7 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 	page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
 	if (page) {
 		__SetPageLocked(page);
+		set_page_lock_owner(page, current);
 		__SetPageSwapBacked(page);
 	}
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c99463a..8522a8c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -361,6 +361,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 
 		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
 		__SetPageLocked(new_page);
+		set_page_lock_owner(new_page, current);
 		__SetPageSwapBacked(new_page);
 		err = __add_to_swap_cache(new_page, entry);
 		if (likely(!err)) {
@@ -373,6 +374,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			return new_page;
 		}
 		radix_tree_preload_end();
+		set_page_lock_owner(new_page, NULL);
 		__ClearPageLocked(new_page);
 		/*
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c4a2f45..67d7496 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1190,6 +1190,7 @@ lazyfree:
 		 * we obviously don't have to worry about waking up a process
 		 * waiting on the page lock, because there are no references.
 		 */
+		set_page_lock_owner(page, NULL);
 		__ClearPageLocked(page);
 free_it:
 		if (ret == SWAP_LZFREE)
-- 
2.9.2

From 32f250e0c8aa3d90f7fc8ac293060e2944d359a5 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanass...@sandisk.com>
Date: Thu, 11 Aug 2016 11:02:29 -0700
Subject: [PATCH] do_generic_file_read(): Fail immediately if killed

If a fatal signal has been received, fail immediately instead of
trying to read more data.
---
 mm/filemap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 20f3b1f..6e46fb5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1643,7 +1643,12 @@ find_page:
 			 * wait_on_page_locked is used to avoid unnecessarily
 			 * serialisations and why it's safe.
 			 */
-			wait_on_page_locked_killable(page);
+			error = wait_on_page_locked_killable(page);
+			if (error == -EINTR) {
+				put_page(page);
+				goto out;
+			}
+			error = 0;
 			if (PageUptodate(page))
 				goto page_ok;
 
-- 
2.9.2

Reply via email to