On Thursday 06 September 2007 03:41, Bernd Schubert wrote:

> > This comment block should be:
> >
> > /**
> >  * generic_file_buffered_write - handle an iov
> >  * @iocb:   file operations
> >  * @iov:    vector of data to write
> >  * @nr_segs:        number of iov segments
> >  * @pos:    position in the file
> >  * @ppos:   position in the file after this function
> >  * @count:  number of bytes to write
> >  * @written:        offset in iov->base (data to skip on write)
> >  *
> >  * This function will do 3 main tasks for each iov:
> >  * - prepare a write
> >  * - copy the data from iov into a new page
> >  * - commit this page
>
> Thanks, done.
>
> I also removed the FIXMEs and created a second patch.
>
> Signed-off-by: Bernd Schubert <[EMAIL PROTECTED]>
> Signed-off-by: Goswin von Brederlow <[EMAIL PROTECTED]>

Minor nit: when resubmitting a patch, you should include everything
(ie. the full changelog of problem statement and fix description) in a
single mail. It's just a bit easier...

So I believe the problem is that for a multi-segment iovec, we currently
prepare_write/commit_write once for each segment, right? We do this
because there is a nasty deadlock in the VM (copy_from_user being
called with a page locked), and copying multiple segs dramatically
increases the chances that one of these copies will cause a page fault
and thus potentially deadlock.

The fix you have I don't think can work because a filesystem must be
notified of the modification _before_ it has happened. (If I understand
correctly, you are skipping the prepare_write potentially until after
some data is copied?).

Anyway, there are fixes for this deadlock in Andrew's -mm tree, but
also a workaround for the NFSD problem in git commit 29dbb3fc. Did
you try a later kernel to see if it is fixed there?

Thanks,
Nick

>
>  mm/filemap.c |  142 +++++++++++++++++++++++++++++++++----------------
>  1 file changed, 96 insertions(+), 46 deletions(-)
>
> Index: linux-2.6.20.3/mm/filemap.c
> ===================================================================
> --- linux-2.6.20.3.orig/mm/filemap.c  2007-09-05 14:04:18.000000000 +0200
> +++ linux-2.6.20.3/mm/filemap.c       2007-09-05 18:50:26.000000000 +0200
> @@ -2057,6 +2057,21 @@
>  }
>  EXPORT_SYMBOL(generic_file_direct_write);
>
> +/**
> + * generic_file_buffered_write - handle iov'ectors
> + * @iob:     file operations
> + * @iov:     vector of data to write
> + * @nr_segs: number of iov segments
> + * @pos:     position in the file
> + * @ppos:    position in the file after this function
> + * @count:   number of bytes to write
> + * written:  offset in iov->base (data to skip on write)
> + *
> + * This function will do 3 main tasks for each iov:
> + * - prepare a write
> + * - copy the data from iov into a new page
> + * - commit this page
> + */
>  ssize_t
>  generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
>               unsigned long nr_segs, loff_t pos, loff_t *ppos,
> @@ -2074,6 +2089,11 @@
>       const struct iovec *cur_iov = iov; /* current iovec */
>       size_t          iov_base = 0;      /* offset in the current iovec */
>       char __user     *buf;
> +     unsigned long   data_start = (pos & (PAGE_CACHE_SIZE -1)); /* Within 
> page
> */ +  loff_t          wpos = pos; /* the position in the file we will return 
> */ +
> +     /* position in file as index of pages */
> +     unsigned long   index = pos >> PAGE_CACHE_SHIFT;
>
>       pagevec_init(&lru_pvec, 0);
>
> @@ -2087,9 +2107,15 @@
>               buf = cur_iov->iov_base + iov_base;
>       }
>
> +     page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
> +     if (!page) {
> +             status = -ENOMEM;
> +             goto out;
> +     }
> +
>       do {
> -             unsigned long index;
>               unsigned long offset;
> +             unsigned long data_end; /* end of data within the page */
>               size_t copied;
>
>               offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
> @@ -2106,6 +2132,8 @@
>                */
>               bytes = min(bytes, cur_iov->iov_len - iov_base);
>
> +             data_end = offset + bytes;
> +
>               /*
>                * Bring in the user page that we will copy from _first_.
>                * Otherwise there's a nasty deadlock on copying from the
> @@ -2114,34 +2142,30 @@
>                */
>               fault_in_pages_readable(buf, bytes);
>
> -             page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
> -             if (!page) {
> -                     status = -ENOMEM;
> -                     break;
> -             }
> -
>               if (unlikely(bytes == 0)) {
>                       status = 0;
>                       copied = 0;
>                       goto zero_length_segment;
>               }
>
> -             status = a_ops->prepare_write(file, page, offset, offset+bytes);
> -             if (unlikely(status)) {
> -                     loff_t isize = i_size_read(inode);
> -
> -                     if (status != AOP_TRUNCATED_PAGE)
> -                             unlock_page(page);
> -                     page_cache_release(page);
> -                     if (status == AOP_TRUNCATED_PAGE)
> -                             continue;
> +             if (data_end == PAGE_CACHE_SIZE || count == bytes) {
>                       /*
> -                      * prepare_write() may have instantiated a few blocks
> -                      * outside i_size.  Trim these off again.
> +                      * Only prepare a write if its an entire page or if
> +                      * we don't have more data
>                        */
> -                     if (pos + bytes > isize)
> -                             vmtruncate(inode, isize);
> -                     break;
> +                     status = a_ops->prepare_write(file, page, data_start, 
> data_end);
> +                     if (unlikely(status)) {
> +                             loff_t isize = i_size_read(inode);
> +
> +                             if (status == AOP_TRUNCATED_PAGE)
> +                                     continue;
> +                             /*
> +                             * prepare_write() may have instantiated a few 
> blocks
> +                             * outside i_size.  Trim these off again.
> +                             */
> +                             if (pos + bytes > isize)
> +                                     vmtruncate(inode, isize);
> +                     }
>               }
>               if (likely(nr_segs == 1))
>                       copied = filemap_copy_from_user(page, offset,
> @@ -2149,60 +2173,86 @@
>               else
>                       copied = filemap_copy_from_user_iovec(page, offset,
>                                               cur_iov, iov_base, bytes);
> -             flush_dcache_page(page);
> -             status = a_ops->commit_write(file, page, offset, offset+bytes);
> -             if (status == AOP_TRUNCATED_PAGE) {
> +
> +             if (data_end == PAGE_CACHE_SIZE || count == bytes) {
> +                     /*
> +                      * Same as above, always try fill pages up to
> +                      * PAGE_CACHE_SIZE if possible (sufficient data)
> +                      */
> +                     flush_dcache_page(page);
> +                     status = a_ops->commit_write(file, page,
> +                                                  data_start, data_end);
> +                     if (status == AOP_TRUNCATED_PAGE) {
> +                             continue;
> +                     }
> +                     unlock_page(page);
> +                     mark_page_accessed(page);
>                       page_cache_release(page);
> -                     continue;
> +                     balance_dirty_pages_ratelimited(mapping);
> +                     cond_resched();
> +                     if (bytes < count) { /* still more iov data to write */
> +                             page = __grab_cache_page(mapping, index + 1,
> +                                                      &cached_page, 
> &lru_pvec);
> +                             if (!page) {
> +                                     status = -ENOMEM;
> +                                     goto out;
> +                             }
> +                     } else {
> +                             page = NULL;
> +                     }
> +                     written += data_end - data_start;
> +                     wpos    += data_end - data_start;
> +                     data_start = 0; /* correct would be data_end % PAGE_SIZE
> +                                      * but if this would be != 0, we don't
> +                                      * have more data
> +                                      */
>               }
>  zero_length_segment:
>               if (likely(copied >= 0)) {
> -                     if (!status)
> -                             status = copied;
> -
> -                     if (status >= 0) {
> -                             written += status;
> -                             count -= status;
> -                             pos += status;
> -                             buf += status;
> +                     if (!status) {
> +                             count -= copied;
> +                             pos += copied;
> +                             buf += copied;
>                               if (unlikely(nr_segs > 1)) {
>                                       filemap_set_next_iovec(&cur_iov,
> -                                                     &iov_base, status);
> +                                                     &iov_base, copied);
>                                       if (count)
>                                               buf = cur_iov->iov_base +
>                                                       iov_base;
>                               } else {
> -                                     iov_base += status;
> +                                     iov_base += copied;
>                               }
>                       }
>               }
>               if (unlikely(copied != bytes))
> -                     if (status >= 0)
> +                     if (!status)
>                               status = -EFAULT;
> -             unlock_page(page);
> -             mark_page_accessed(page);
> -             page_cache_release(page);
> -             if (status < 0)
> +             if (status)
>                       break;
> -             balance_dirty_pages_ratelimited(mapping);
> -             cond_resched();
>       } while (count);
> -     *ppos = pos;
> +
> +out:
> +     *ppos = wpos;
>
>       if (cached_page)
>               page_cache_release(cached_page);
>
> +     if (page) {
> +             unlock_page(page);
> +             page_cache_release(page);
> +     }
> +
>       /*
>        * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
>        */
> -     if (likely(status >= 0)) {
> +     if (likely(!status)) {
>               if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
>                       if (!a_ops->writepage || !is_sync_kiocb(iocb))
>                               status = generic_osync_inode(inode, mapping,
>                                               OSYNC_METADATA|OSYNC_DATA);
>               }
> -     }
> -
> +     }
> +
>       /*
>        * If we get here for O_DIRECT writes then we must have fallen through
>        * to buffered writes (block instantiation inside i_size).  So we sync
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to