On Saturday, February 18, 2017 12:02:14 AM CET Al Viro wrote: > On Fri, Feb 17, 2017 at 05:03:15PM +0000, Al Viro wrote: > > On Fri, Feb 17, 2017 at 10:54:20AM -0500, David Miller wrote: > > > From: Al Viro <v...@zeniv.linux.org.uk> > > > Date: Tue, 14 Feb 2017 01:33:06 +0000 > > > > > > > OK... Remaining interesting question is whether it adds a noticable > > > > overhead. Could somebody try it on assorted benchmarks and see if > > > > it slows the things down? The patch in question follows: > > > > > > That's about a 40 byte copy onto the stack for each invocation of this > > > thing. You can benchmark all you want, but it's clear that this is > > > non-trivial amount of work and will take some operations from fitting > > > in the cache to not doing so for sure. > > > > In principle, that could be reduced a bit (32 bytes - ->type is never > > changed, so we don't need to restore it), but that's not much of > > improvement... > > Actually, I've a better solution. Namely, analogue of iov_iter_advance() > for going backwards. The restriction is that you should never unroll > further than where you've initially started *or* have the iovec, etc. > array modified under you. For iovec/kvec/bio_vec it's trivial, for pipe - > a bit more convoluted, but still doable. Then net/core/datagram.c stuff > could simply use iov_iter_unroll() in case of error - all we need to keep > track of is how much had we copied and that's easy to do. > > The patch below is completely untested, but if it works it should avoid > buggering the fast paths at all, still giving the same semantics re > reverting ->msg_iter both on EINVAL and EFAULT. Comments? I've tested it. It also fixes the corruption issue I can reproduce with my setup: # tc qdisc add dev eth0 root netem corrupt 0.1% (and the debug code)
So, for what's worth: Tested-by: Christian Lamparter <chunk...@googlemail.com> > diff --git a/include/linux/uio.h b/include/linux/uio.h > index 804e34c6f981..237965348bc2 100644 > --- a/include/linux/uio.h > +++ b/include/linux/uio.h > @@ -39,7 +39,10 @@ struct iov_iter { > }; > union { > unsigned long nr_segs; > - int idx; > + struct { > + int idx; > + int start_idx; > + }; > }; > }; > > @@ -81,6 +84,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long > nr_segs, size_t to); > size_t iov_iter_copy_from_user_atomic(struct page *page, > struct iov_iter *i, unsigned long offset, size_t bytes); > void iov_iter_advance(struct iov_iter *i, size_t bytes); > +void iov_iter_unroll(struct iov_iter *i, size_t bytes); > int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); > size_t iov_iter_single_seg_count(const struct iov_iter *i); > size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, > diff --git a/lib/iov_iter.c b/lib/iov_iter.c > index e68604ae3ced..afdaca37f5c9 100644 > --- a/lib/iov_iter.c > +++ b/lib/iov_iter.c > @@ -786,6 +786,68 @@ void iov_iter_advance(struct iov_iter *i, size_t size) > } > EXPORT_SYMBOL(iov_iter_advance); > > +void iov_iter_unroll(struct iov_iter *i, size_t unroll) > +{ > + if (!unroll) > + return; > + i->count += unroll; > + if (unlikely(i->type & ITER_PIPE)) { > + struct pipe_inode_info *pipe = i->pipe; > + int idx = i->idx; > + size_t off = i->iov_offset; > + while (1) { > + size_t n = off - pipe->bufs[idx].offset; > + if (unroll < n) { > + off -= (n - unroll); > + break; > + } > + unroll -= n; > + if (!unroll && idx == i->start_idx) { > + off = 0; > + break; > + } > + if (!idx--) > + idx = pipe->buffers - 1; > + off = pipe->bufs[idx].offset + pipe->bufs[idx].len; > + } > + i->iov_offset = off; > + i->idx = idx; > + pipe_truncate(i); > + return; > + } > + if (unroll <= i->iov_offset) { > + i->iov_offset -= unroll; > + return; > + } > + unroll -= i->iov_offset; > + if (i->type & ITER_BVEC) { > + const 
struct bio_vec *bvec = i->bvec; > + while (1) { > + size_t n = (--bvec)->bv_len; > + i->nr_segs++; > + if (unroll <= n) { > + i->bvec = bvec; > + i->iov_offset = n - unroll; > + return; > + } > + unroll -= n; > + } > + } else { /* same logics for iovec and kvec */ > + const struct iovec *iov = i->iov; > + while (1) { > + size_t n = (--iov)->iov_len; > + i->nr_segs++; > + if (unroll <= n) { > + i->iov = iov; > + i->iov_offset = n - unroll; > + return; > + } > + unroll -= n; > + } > + } > +} > +EXPORT_SYMBOL(iov_iter_unroll); > + > /* > * Return the count of just the current iov_iter segment. > */ > @@ -839,6 +901,7 @@ void iov_iter_pipe(struct iov_iter *i, int direction, > i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); > i->iov_offset = 0; > i->count = count; > + i->start_idx = i->idx; > } > EXPORT_SYMBOL(iov_iter_pipe); > > diff --git a/net/core/datagram.c b/net/core/datagram.c > index 662bea587165..63c7353a6ba2 100644 > --- a/net/core/datagram.c > +++ b/net/core/datagram.c > @@ -394,7 +394,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int > offset, > struct iov_iter *to, int len) > { > int start = skb_headlen(skb); > - int i, copy = start - offset; > + int i, copy = start - offset, start_off = offset, n; > struct sk_buff *frag_iter; > > trace_skb_copy_datagram_iovec(skb, len); > @@ -403,11 +403,12 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, > int offset, > if (copy > 0) { > if (copy > len) > copy = len; > - if (copy_to_iter(skb->data + offset, copy, to) != copy) > + n = copy_to_iter(skb->data + offset, copy, to); > + offset += n; > + if (n != copy) > goto short_copy; > if ((len -= copy) == 0) > return 0; > - offset += copy; > } > > /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ > @@ -421,13 +422,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, > int offset, > if ((copy = end - offset) > 0) { > if (copy > len) > copy = len; > - if (copy_page_to_iter(skb_frag_page(frag), > + n = copy_page_to_iter(skb_frag_page(frag), > frag->page_offset + offset - > - start, copy, to) != copy) > + start, copy, to); > + offset += n; > + if (n != copy) > goto short_copy; > if (!(len -= copy)) > return 0; > - offset += copy; > } > start = end; > } > @@ -459,6 +461,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int > offset, > */ > > fault: > + iov_iter_unroll(to, offset - start_off); > return -EFAULT; > > short_copy: > @@ -609,7 +612,7 @@ static int skb_copy_and_csum_datagram(const struct > sk_buff *skb, int offset, > __wsum *csump) > { > int start = skb_headlen(skb); > - int i, copy = start - offset; > + int i, copy = start - offset, start_off = offset; > struct sk_buff *frag_iter; > int pos = 0; > int n; > @@ -619,11 +622,11 @@ static int skb_copy_and_csum_datagram(const struct > sk_buff *skb, int offset, > if (copy > len) > copy = len; > n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); > + offset += n; > if (n != copy) > goto fault; > if ((len -= copy) == 0) > return 0; > - offset += copy; > pos = copy; > } > > @@ -645,12 +648,12 @@ static int skb_copy_and_csum_datagram(const struct > sk_buff *skb, int offset, > offset - start, copy, > &csum2, to); > kunmap(page); > + offset += n; > if (n != copy) > goto fault; > *csump = csum_block_add(*csump, csum2, pos); > if (!(len -= copy)) > return 0; > - offset += copy; > pos += copy; > } > start = end; > @@ -683,6 +686,7 @@ static int skb_copy_and_csum_datagram(const struct > sk_buff *skb, int offset, > return 0; > > fault: > + iov_iter_unroll(to, offset - start_off); > return -EFAULT; > } > > @@ -767,6 +771,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, > } > return 0; > csum_error: > + iov_iter_unroll(&msg->msg_iter, chunk); > return -EINVAL; > fault: > return 
-EFAULT; >