I probably should have Cc'ed the netdev list for this patch.

Ed Cashin <ecas...@coraid.com> writes:

> This patch adds the ability to work with large packets composed of a
> number of segments, using the scatter gather feature of the block
> layer (biovecs) and the network layer (skb frag array).  The
> motivation is the performance gained by using a packet data payload
> greater than a page size and by using the network card's scatter
> gather feature.
>
> Users of the out-of-tree aoe driver already had these changes, but
> since early 2011, they have complained of increased memory utilization
> and higher CPU utilization during heavy writes.[1] The commit below
> appears related, as it disables scatter gather on non-IP protocols
> inside the harmonize_features function, even when the NIC supports sg.
>
>   commit f01a5236bd4b140198fbcc550f085e8361fd73fa
>   Author: Jesse Gross <je...@nicira.com>
>   Date:   Sun Jan 9 06:23:31 2011 +0000
>
>       net offloading: Generalize netif_get_vlan_features().
>
> With that regression in place, transmits always linearize sg AoE
> packets, but in-kernel users did not have this patch.  Before 2.6.38,
> though, these changes were working to allow sg to increase
> performance.
>
> 1. http://www.spinics.net/lists/linux-mm/msg15184.html
>
> Signed-off-by: Ed Cashin <ecas...@coraid.com>
> ---
>  drivers/block/aoe/aoe.h    |    2 +
>  drivers/block/aoe/aoeblk.c |    3 +
>  drivers/block/aoe/aoecmd.c |  138 ++++++++++++++++++++++++++++++-------------
>  drivers/block/aoe/aoedev.c |    1 +
>  drivers/block/aoe/aoenet.c |   13 +++-
>  5 files changed, 111 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
> index db195ab..8ca8c8a 100644
> --- a/drivers/block/aoe/aoe.h
> +++ b/drivers/block/aoe/aoe.h
> @@ -119,6 +119,8 @@ struct frame {
>       ulong bcnt;
>       sector_t lba;
>       struct sk_buff *skb;
> +     struct bio_vec *bv;
> +     ulong bv_off;
>  };
>  
>  struct aoeif {
> diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
> index 321de7b..1471f81 100644
> --- a/drivers/block/aoe/aoeblk.c
> +++ b/drivers/block/aoe/aoeblk.c
> @@ -279,6 +279,9 @@ aoeblk_gdalloc(void *vp)
>       if (bdi_init(&d->blkq->backing_dev_info))
>               goto err_blkq;
>       spin_lock_irqsave(&d->lock, flags);
> +     blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
> +     d->blkq->backing_dev_info.ra_pages = BLK_DEF_MAX_SECTORS * 1024;
> +     d->blkq->backing_dev_info.ra_pages /= PAGE_CACHE_SIZE;
>       gd->major = AOE_MAJOR;
>       gd->first_minor = d->sysminor * AOE_PARTITIONS;
>       gd->fops = &aoe_bdops;
> diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
> index de0435e..f10ab49 100644
> --- a/drivers/block/aoe/aoecmd.c
> +++ b/drivers/block/aoe/aoecmd.c
> @@ -164,7 +164,8 @@ freeframe(struct aoedev *d)
>                                               rf = f;
>                                       continue;
>                               }
> -gotone:                              skb_shinfo(skb)->nr_frags = 
> skb->data_len = 0;
> +gotone:                              skb->truesize -= skb->data_len;
> +                             skb_shinfo(skb)->nr_frags = skb->data_len = 0;
>                               skb_trim(skb, 0);
>                               d->tgt = t;
>                               ifrotate(*t);
> @@ -200,6 +201,24 @@ gotone:                          
> skb_shinfo(skb)->nr_frags = skb->data_len = 0;
>       return NULL;
>  }
>  
> +static void
> +skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
> +{
> +     int frag = 0;
> +     ulong fcnt;
> +loop:
> +     fcnt = bv->bv_len - (off - bv->bv_offset);
> +     if (fcnt > cnt)
> +             fcnt = cnt;
> +     skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
> +     cnt -= fcnt;
> +     if (cnt <= 0)
> +             return;
> +     bv++;
> +     off = bv->bv_offset;
> +     goto loop;
> +}
> +
>  static int
>  aoecmd_ata_rw(struct aoedev *d)
>  {
> @@ -210,7 +229,7 @@ aoecmd_ata_rw(struct aoedev *d)
>       struct bio_vec *bv;
>       struct aoetgt *t;
>       struct sk_buff *skb;
> -     ulong bcnt;
> +     ulong bcnt, fbcnt;
>       char writebit, extbit;
>  
>       writebit = 0x10;
> @@ -225,8 +244,28 @@ aoecmd_ata_rw(struct aoedev *d)
>       bcnt = t->ifp->maxbcnt;
>       if (bcnt == 0)
>               bcnt = DEFAULTBCNT;
> -     if (bcnt > buf->bv_resid)
> -             bcnt = buf->bv_resid;
> +     if (bcnt > buf->resid)
> +             bcnt = buf->resid;
> +     fbcnt = bcnt;
> +     f->bv = buf->bv;
> +     f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
> +     do {
> +             if (fbcnt < buf->bv_resid) {
> +                     buf->bv_resid -= fbcnt;
> +                     buf->resid -= fbcnt;
> +                     break;
> +             }
> +             fbcnt -= buf->bv_resid;
> +             buf->resid -= buf->bv_resid;
> +             if (buf->resid == 0) {
> +                     d->inprocess = NULL;
> +                     break;
> +             }
> +             buf->bv++;
> +             buf->bv_resid = buf->bv->bv_len;
> +             WARN_ON(buf->bv_resid == 0);
> +     } while (fbcnt);
> +
>       /* initialize the headers & frame */
>       skb = f->skb;
>       h = (struct aoe_hdr *) skb_mac_header(skb);
> @@ -237,7 +276,6 @@ aoecmd_ata_rw(struct aoedev *d)
>       t->nout++;
>       f->waited = 0;
>       f->buf = buf;
> -     f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
>       f->bcnt = bcnt;
>       f->lba = buf->sector;
>  
> @@ -252,10 +290,11 @@ aoecmd_ata_rw(struct aoedev *d)
>               ah->lba3 |= 0xe0;       /* LBA bit + obsolete 0xa0 */
>       }
>       if (bio_data_dir(buf->bio) == WRITE) {
> -             skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
> +             skb_fillup(skb, f->bv, f->bv_off, bcnt);
>               ah->aflags |= AOEAFL_WRITE;
>               skb->len += bcnt;
>               skb->data_len = bcnt;
> +             skb->truesize += bcnt;
>               t->wpkts++;
>       } else {
>               t->rpkts++;
> @@ -266,18 +305,7 @@ aoecmd_ata_rw(struct aoedev *d)
>  
>       /* mark all tracking fields and load out */
>       buf->nframesout += 1;
> -     buf->bv_off += bcnt;
> -     buf->bv_resid -= bcnt;
> -     buf->resid -= bcnt;
>       buf->sector += bcnt >> 9;
> -     if (buf->resid == 0) {
> -             d->inprocess = NULL;
> -     } else if (buf->bv_resid == 0) {
> -             buf->bv = ++bv;
> -             buf->bv_resid = bv->bv_len;
> -             WARN_ON(buf->bv_resid == 0);
> -             buf->bv_off = bv->bv_offset;
> -     }
>  
>       skb->dev = t->ifp->nd;
>       skb = skb_clone(skb, GFP_ATOMIC);
> @@ -364,14 +392,12 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame 
> *f)
>               put_lba(ah, f->lba);
>  
>               n = f->bcnt;
> -             if (n > DEFAULTBCNT)
> -                     n = DEFAULTBCNT;
>               ah->scnt = n >> 9;
>               if (ah->aflags & AOEAFL_WRITE) {
> -                     skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
> -                             offset_in_page(f->bufaddr), n);
> +                     skb_fillup(skb, f->bv, f->bv_off, n);
>                       skb->len = sizeof *h + sizeof *ah + n;
>                       skb->data_len = n;
> +                     skb->truesize += n;
>               }
>       }
>       skb->dev = t->ifp->nd;
> @@ -530,20 +556,6 @@ rexmit_timer(ulong vp)
>                               ejectif(t, ifp);
>                               ifp = NULL;
>                       }
> -
> -                     if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
> -                     && ifp && ++ifp->lostjumbo > (t->nframes << 1)
> -                     && ifp->maxbcnt != DEFAULTBCNT) {
> -                             printk(KERN_INFO
> -                                     "aoe: e%ld.%d: "
> -                                     "too many lost jumbo on "
> -                                     "%s:%pm - "
> -                                     "falling back to %d frames.\n",
> -                                     d->aoemajor, d->aoeminor,
> -                                     ifp->nd->name, t->addr,
> -                                     DEFAULTBCNT);
> -                             ifp->maxbcnt = 0;
> -                     }
>                       resend(d, t, f);
>               }
>  
> @@ -736,6 +748,45 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong 
> duration, sector_t sector
>       part_stat_unlock();
>  }
>  
> +static void
> +bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, ulong cnt)
> +{
> +     ulong fcnt;
> +     char *p;
> +     int soff = 0;
> +loop:
> +     fcnt = bv->bv_len - (off - bv->bv_offset);
> +     if (fcnt > cnt)
> +             fcnt = cnt;
> +     p = page_address(bv->bv_page) + off;
> +     skb_copy_bits(skb, soff, p, fcnt);
> +     soff += fcnt;
> +     cnt -= fcnt;
> +     if (cnt <= 0)
> +             return;
> +     bv++;
> +     off = bv->bv_offset;
> +     goto loop;
> +}
> +
> +static void
> +fadvance(struct frame *f, ulong cnt)
> +{
> +     ulong fcnt;
> +
> +     f->lba += cnt >> 9;
> +loop:
> +     fcnt = f->bv->bv_len - (f->bv_off - f->bv->bv_offset);
> +     if (fcnt > cnt) {
> +             f->bv_off += cnt;
> +             return;
> +     }
> +     cnt -= fcnt;
> +     f->bv++;
> +     f->bv_off = f->bv->bv_offset;
> +     goto loop;
> +}
> +
>  void
>  aoecmd_ata_rsp(struct sk_buff *skb)
>  {
> @@ -753,6 +804,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>       u16 aoemajor;
>  
>       hin = (struct aoe_hdr *) skb_mac_header(skb);
> +     skb_pull(skb, sizeof(*hin));
>       aoemajor = get_unaligned_be16(&hin->major);
>       d = aoedev_by_aoeaddr(aoemajor, hin->minor);
>       if (d == NULL) {
> @@ -790,7 +842,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>  
>       calc_rttavg(d, tsince(f->tag));
>  
> -     ahin = (struct aoe_atahdr *) (hin+1);
> +     ahin = (struct aoe_atahdr *) skb->data;
> +     skb_pull(skb, sizeof(*ahin));
>       hout = (struct aoe_hdr *) skb_mac_header(f->skb);
>       ahout = (struct aoe_atahdr *) (hout+1);
>       buf = f->buf;
> @@ -809,7 +862,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>               switch (ahout->cmdstat) {
>               case ATA_CMD_PIO_READ:
>               case ATA_CMD_PIO_READ_EXT:
> -                     if (skb->len - sizeof *hin - sizeof *ahin < n) {
> +                     if (skb->len < n) {
>                               printk(KERN_ERR
>                                       "aoe: %s.  skb->len=%d need=%ld\n",
>                                       "runt data size in read", skb->len, n);
> @@ -817,7 +870,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>                               spin_unlock_irqrestore(&d->lock, flags);
>                               return;
>                       }
> -                     memcpy(f->bufaddr, ahin+1, n);
> +                     bvcpy(f->bv, f->bv_off, skb, n);
>               case ATA_CMD_PIO_WRITE:
>               case ATA_CMD_PIO_WRITE_EXT:
>                       ifp = getif(t, skb->dev);
> @@ -827,21 +880,22 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>                                       ifp->lostjumbo = 0;
>                       }
>                       if (f->bcnt -= n) {
> -                             f->lba += n >> 9;
> -                             f->bufaddr += n;
> +                             fadvance(f, n);
>                               resend(d, t, f);
>                               goto xmit;
>                       }
>                       break;
>               case ATA_CMD_ID_ATA:
> -                     if (skb->len - sizeof *hin - sizeof *ahin < 512) {
> +                     if (skb->len < 512) {
>                               printk(KERN_INFO
>                                       "aoe: runt data size in ataid.  
> skb->len=%d\n",
>                                       skb->len);
>                               spin_unlock_irqrestore(&d->lock, flags);
>                               return;
>                       }
> -                     ataid_complete(d, t, (char *) (ahin+1));
> +                     if (skb_linearize(skb))
> +                             break;
> +                     ataid_complete(d, t, skb->data);
>                       break;
>               default:
>                       printk(KERN_INFO
> diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
> index 6b5110a..b2d1fd3 100644
> --- a/drivers/block/aoe/aoedev.c
> +++ b/drivers/block/aoe/aoedev.c
> @@ -182,6 +182,7 @@ skbfree(struct sk_buff *skb)
>                       "cannot free skb -- memory leaked.");
>               return;
>       }
> +     skb->truesize -= skb->data_len;
>       skb_shinfo(skb)->nr_frags = skb->data_len = 0;
>       skb_trim(skb, 0);
>       dev_kfree_skb(skb);
> diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
> index 4d3bc0d..0787807 100644
> --- a/drivers/block/aoe/aoenet.c
> +++ b/drivers/block/aoe/aoenet.c
> @@ -102,7 +102,9 @@ static int
>  aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type 
> *pt, struct net_device *orig_dev)
>  {
>       struct aoe_hdr *h;
> +     struct aoe_atahdr *ah;
>       u32 n;
> +     int sn;
>  
>       if (dev_net(ifp) != &init_net)
>               goto exit;
> @@ -110,13 +112,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, 
> struct packet_type *pt,
>       skb = skb_share_check(skb, GFP_ATOMIC);
>       if (skb == NULL)
>               return 0;
> -     if (skb_linearize(skb))
> -             goto exit;
>       if (!is_aoe_netif(ifp))
>               goto exit;
>       skb_push(skb, ETH_HLEN);        /* (1) */
> -
> -     h = (struct aoe_hdr *) skb_mac_header(skb);
> +     sn = sizeof(*h) + sizeof(*ah);
> +     if (skb->len >= sn) {
> +             sn -= skb_headlen(skb);
> +             if (sn > 0 && !__pskb_pull_tail(skb, sn))
> +                     goto exit;
> +     }
> +     h = (struct aoe_hdr *) skb->data;
>       n = get_unaligned_be32(&h->tag);
>       if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
>               goto exit;

-- 
  Ed

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to