The IOCB_DONTCACHE writeback path in generic_write_sync() calls
filemap_flush_range() on every write, submitting writeback inline in
the writer's context.  Perf lock contention profiling shows the
performance problem is not lock contention but the writeback submission
work itself — walking the page tree and submitting I/O blocks the writer
for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
(dontcache).

Replace the inline filemap_flush_range() call with a flusher kick that
drains dirty pages in the background.  This moves writeback submission
completely off the writer's hot path.

To avoid flushing all of the wb's dirty data on every kick, add a
dedicated WB_start_dontcache bit and a wb_check_start_dontcache()
handler that uses the new NR_DONTCACHE_DIRTY counter to bound how many
pages to write back.  The flusher is given a budget of that many pages
and spends it starting from the oldest dirty inodes; it is not
restricted to dontcache inodes, so some ordinary buffered dirty data
may be written back as well.  This helps preserve I/O batching while
limiting the amount of expedited writeback.

Like WB_start_all, the WB_start_dontcache bit coalesces multiple
DONTCACHE writes into a single flusher wakeup without per-write
allocations.

Also add WB_REASON_DONTCACHE as a new writeback reason for tracing
visibility, and target the correct cgroup writeback domain via
unlocked_inode_to_wb_begin().

dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
~503 GB, compared to a v6.19-ish baseline):

  Single-client sequential write (MB/s):
                       baseline    patched     change
  buffered              1449.8     1440.1      -0.7%
  dontcache             1347.9     1461.5      +8.4%
  direct                1450.0     1440.1      -0.7%

  Single-client sequential write latency (us):
                       baseline    patched     change
  dontcache p50         3031.0    10551.3    +248.1%
  dontcache p99        74973.2    21626.9     -71.2%
  dontcache p99.9      85459.0    23199.7     -72.9%

  Single-client random write (MB/s):
                       baseline    patched     change
  dontcache              284.2      295.4      +3.9%

  Single-client random write p99.9 latency (us):
                       baseline    patched     change
  dontcache             2277.4      872.4     -61.7%

  Multi-writer aggregate throughput (MB/s):
                       baseline    patched     change
  buffered              1619.5     1611.2      -0.5%
  dontcache             1281.1     1629.4     +27.2%
  direct                1545.4     1609.4      +4.1%

  Mixed-mode noisy neighbor (dontcache writer + buffered readers):
                       baseline    patched     change
  writer (MB/s)         1297.6     1471.1     +13.4%
  readers avg (MB/s)     855.0      462.4     -45.9%

nfsd-io-bench results on same hardware (XFS on NVMe, NFSv3 via fio
NFS engine with libnfs, 1024 NFSD threads, pool_mode=pernode,
file size ~502 GB, compared to v6.19-ish baseline):

  Single-client sequential write (MB/s):
                       baseline    patched     change
  buffered              4844.2     4653.4      -3.9%
  dontcache             3028.3     3723.1     +22.9%
  direct                 957.6      987.8      +3.2%

  Single-client sequential write p99.9 latency (us):
                       baseline    patched     change
  dontcache            759169.0   175112.2     -76.9%

  Single-client random write (MB/s):
                       baseline    patched     change
  dontcache              590.0     1561.0    +164.6%

  Multi-writer aggregate throughput (MB/s):
                       baseline    patched     change
  buffered              9636.3     9422.9      -2.2%
  dontcache             1894.9     9442.6    +398.3%
  direct                 809.6      975.1     +20.4%

  Noisy neighbor (dontcache writer + random readers):
                       baseline    patched     change
  writer (MB/s)         1854.5     4063.6    +119.1%
  readers avg (MB/s)     131.2      101.6     -22.5%

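The NFS results show even larger improvements than the local benchmarks.
Multi-writer dontcache throughput improves nearly 5x, matching buffered
I/O. Dirty page footprint drops 85-95% in sequential workloads vs.
buffered.

The tables also show the cost of this approach: local sequential
dontcache p50 latency rises (3.0ms -> 10.6ms) even as tail latency
drops, and readers competing with a dontcache writer lose throughput
(-45.9% local, -22.5% over NFS).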

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <[email protected]>
---
 fs/fs-writeback.c                | 60 ++++++++++++++++++++++++++++++++++++++++
 include/linux/backing-dev-defs.h |  2 ++
 include/linux/fs.h               |  6 ++--
 include/trace/events/writeback.h |  3 +-
 4 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a65694cbfe68..377767db48f7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1334,6 +1334,18 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
        wb_wakeup(wb);
 }
 
+static void wb_start_dontcache_writeback(struct bdi_writeback *wb)
+{
+       if (!wb_has_dirty_io(wb))
+               return;
+
+       if (test_bit(WB_start_dontcache, &wb->state) ||
+           test_and_set_bit(WB_start_dontcache, &wb->state))
+               return;
+
+       wb_wakeup(wb);
+}
+
 /**
  * wb_start_background_writeback - start background writeback
  * @wb: bdi_writback to write from
@@ -2373,6 +2385,28 @@ static long wb_check_start_all(struct bdi_writeback *wb)
        return nr_pages;
 }
 
+static long wb_check_start_dontcache(struct bdi_writeback *wb)
+{
+       long nr_pages;
+
+       if (!test_bit(WB_start_dontcache, &wb->state))
+               return 0;
+
+       nr_pages = global_node_page_state(NR_DONTCACHE_DIRTY);
+       if (nr_pages) {
+               struct wb_writeback_work work = {
+                       .nr_pages       = wb_split_bdi_pages(wb, nr_pages),
+                       .sync_mode      = WB_SYNC_NONE,
+                       .range_cyclic   = 1,
+                       .reason         = WB_REASON_DONTCACHE,
+               };
+
+               nr_pages = wb_writeback(wb, &work);
+       }
+
+       clear_bit(WB_start_dontcache, &wb->state);
+       return nr_pages;
+}
 
 /*
  * Retrieve work items and do the writeback they describe
@@ -2394,6 +2428,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
         */
        wrote += wb_check_start_all(wb);
 
+       /*
+        * Check for dontcache writeback request
+        */
+       wrote += wb_check_start_dontcache(wb);
+
        /*
         * Check for periodic writeback, kupdated() style
         */
@@ -2468,6 +2507,27 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
        rcu_read_unlock();
 }
 
+/**
+ * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes
+ * @mapping:   address_space that was just written to
+ *
+ * Kick the writeback flusher thread to expedite writeback of dontcache
+ * dirty pages.  Uses a dedicated WB_start_dontcache bit so that the
+ * flusher writes back a number of pages bounded by NR_DONTCACHE_DIRTY
+ * (from any dirty inode on the wb) instead of all of its dirty pages.
+ */
+void filemap_dontcache_kick_writeback(struct address_space *mapping)
+{
+       struct inode *inode = mapping->host;
+       struct bdi_writeback *wb;
+       struct wb_lock_cookie cookie = {};
+
+       wb = unlocked_inode_to_wb_begin(inode, &cookie);
+       wb_start_dontcache_writeback(wb);
+       unlocked_inode_to_wb_end(inode, &cookie);
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback);
+
 /*
  * Wakeup the flusher threads to start writeback of all currently dirty pages
  */
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a06b93446d10..74f8a9977f5d 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -26,6 +26,7 @@ enum wb_state {
        WB_writeback_running,   /* Writeback is in progress */
        WB_has_dirty_io,        /* Dirty inodes on ->b_{dirty|io|more_io} */
        WB_start_all,           /* nr_pages == 0 (all) work pending */
+       WB_start_dontcache,     /* dontcache writeback pending */
 };
 
 enum wb_stat_item {
@@ -55,6 +56,7 @@ enum wb_reason {
         */
        WB_REASON_FORKER_THREAD,
        WB_REASON_FOREIGN_FLUSH,
+       WB_REASON_DONTCACHE,
 
        WB_REASON_MAX,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..df72b42a9e9b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2624,6 +2624,7 @@ extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);
 int filemap_flush_range(struct address_space *mapping, loff_t start,
                loff_t end);
+void filemap_dontcache_kick_writeback(struct address_space *mapping);
 
 static inline int file_write_and_wait(struct file *file)
 {
@@ -2657,10 +2658,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
                if (ret)
                        return ret;
        } else if (iocb->ki_flags & IOCB_DONTCACHE) {
-               struct address_space *mapping = iocb->ki_filp->f_mapping;
-
-               filemap_flush_range(mapping, iocb->ki_pos - count,
-                               iocb->ki_pos - 1);
+               filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping);
        }
 
        return count;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index bdac0d685a98..13ee076ccd16 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -44,7 +44,8 @@
        EM( WB_REASON_PERIODIC,                 "periodic")             \
        EM( WB_REASON_FS_FREE_SPACE,            "fs_free_space")        \
        EM( WB_REASON_FORKER_THREAD,            "forker_thread")        \
-       EMe(WB_REASON_FOREIGN_FLUSH,            "foreign_flush")
+       EM( WB_REASON_FOREIGN_FLUSH,            "foreign_flush")        \
+       EMe(WB_REASON_DONTCACHE,                "dontcache")
 
 WB_WORK_REASON
 

-- 
2.53.0


Reply via email to