On Mon, 2007-07-23 at 12:04 +0200, Jörn Engel wrote:
> I believe this whole thing is fundamentally flawed. The perfect
> readahead size depends on the device in question. If we set a single
> system-wide value depending on memory size, it may easily be too small
> and too large at the same time. Think hard disk and SSD.

Well, I think the filesystem should get the first shot at altering the
readahead heuristic (we should add a hook so it can then ask the
underlying device).

Something like:
===
Hook for filesystems to tweak readahead logic

It was suggested at Fengguang's OLS talk that filesystems can make a
useful contribution to the readahead logic: avoiding seeks and lock
boundaries (on cluster filesystems) and other such filesystem-specific
things. The alter_readahead hook should return the number of pages to
read.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>
Signed-off-by: Fengguang Wu <[EMAIL PROTECTED]>
---
 Documentation/filesystems/vfs.txt | 10 ++++++----
 include/linux/fs.h | 1 +
 mm/readahead.c | 12 ++++++++++++
 3 files changed, 19 insertions(+), 4 deletions(-)

--- linux-2.6.22-rc6-mm1.orig/include/linux/fs.h
+++ linux-2.6.22-rc6-mm1/include/linux/fs.h
@@ -1204,7 +1204,8 @@ struct file_operations {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
 	int (*revoke)(struct file *, struct address_space *);
+	unsigned long (*alter_readahead)(struct file *, pgoff_t, unsigned long, pgoff_t, unsigned long);
 };

 struct inode_operations {
--- linux-2.6.22-rc6-mm1.orig/mm/readahead.c
+++ linux-2.6.22-rc6-mm1/mm/readahead.c
@@ -394,7 +394,19 @@ ondemand_readahead(struct address_space
 	}

 readit:
+	/*
+	 * Give filesystem a chance to tweak readahead size (eg. to
+	 * contiguous block boundary). It should probably not change
+	 * by more than 50% otherwise it's really ignoring our
+	 * readahead advice.
+	 */
+	if (filp->f_op->alter_readahead) {
+		ra->size = filp->f_op->alter_readahead(filp, offset, req_size,
+						       ra->start, ra->size);
+		if (ra->async_size > ra->size)
+			ra->async_size = ra->size;
+	}
 	return ra_submit(ra, mapping, filp);
 }
--- linux-2.6.22-rc6-mm1.orig/Documentation/filesystems/vfs.txt
+++ linux-2.6.22-rc6-mm1/Documentation/filesystems/vfs.txt
@@ -777,11 +777,10 @@ struct file_operations {
 	int (*check_flags)(int);
 	int (*dir_notify)(struct file *filp, unsigned long arg);
 	int (*flock) (struct file *, int, struct file_lock *);
-	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned
-int);
-	ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned
-int);
+	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
+	ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
 	int (*revoke)(struct file *);
+	unsigned long (*alter_readahead)(struct file *, pgoff_t, unsigned long, pgoff_t, unsigned long);
 };

 Again, all methods are called without any locks being held, unless
@@ -859,6 +858,9 @@ otherwise noted.
 	to an open file. This method must ensure that all currently
 	blocked writes are flushed and reads will fail.

+  alter_readahead: called when the readahead logic is about to submit a
+	readahead request for I/O
+
 Note that the file operations are implemented by the specific
 filesystem in which the inode resides. When opening a device node
 (character or block special) most filesystems will call special
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/