Re: [PATCH RFC 4/5] tun: vringfd xmit support.

2008-04-06 Thread Herbert Xu
Rusty Russell <[EMAIL PROTECTED]> wrote:
>
> +/* We are done with this skb: put it in the used pile. */
> +static void skb_finished(struct skb_shared_info *sinfo)
> +{
> +   struct skb_shinfo_tun *sht = (void *)(sinfo + 1);
> +
> +   /* FIXME: Race prevention */
> +   vring_used_buffer_atomic(sht->tun->outring, sht->id, sht->len);
> +   vring_wake(sht->tun->outring);
> +
> +   /* Release device. */
> +   dev_put(sht->tun->dev);
> +}

On second thought, this is not going to work.  The network stack
can clone individual pages out of this skb and put it into a new
skb.  Therefore whatever scheme we come up with will either need
to be page-based, or add a flag to tell the network stack that it
can't clone those pages.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[0/6] [NET]: virtio SG/TSO patches

2008-04-17 Thread Herbert Xu
Hi:

Here are the patches I used for testing KVM with virtio-net using
TSO.  There are three patches for the tun device which are basically
Rusty's patches with the mmap turned into copying (for correctness).
Two patches are for the virtio-net frontend, one required to support
receiving SG/TSO, and the other useful for testing SG per se.  The
other patch is to the KVM backend to make all this work.  It isn't
the prettiest or the smartest solution but it was the easiest :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[1/6] [TUN]: Add GSO support

2008-04-17 Thread Herbert Xu
  gso.csum_offset = gso.csum_start = 0;
+   }
+
+   if ((len -= sizeof(gso)) < 0)
+   return -EINVAL;
+
+   if (memcpy_toiovec(iv, (void *)&gso, sizeof(gso)))
+   return -EFAULT;
+   total += sizeof(gso);
+   }
 
len = min_t(int, skb->len, len);
 
@@ -512,6 +708,17 @@ static int tun_set_iff(struct file *file, struct ifreq 
*ifr)
 
tun_net_init(dev);
 
+   /* Virtio header means we can handle csum & gso. */
+   if ((ifr->ifr_flags & (IFF_VIRTIO_HDR|IFF_RECV_CSUM)) ==
+   (IFF_VIRTIO_HDR|IFF_RECV_CSUM)) {
+   dev->features = NETIF_F_SG | NETIF_F_HW_CSUM |
+   NETIF_F_HIGHDMA | NETIF_F_FRAGLIST;
+
+   if (ifr->ifr_flags & IFF_RECV_GSO)
+   dev->features |= NETIF_F_TSO | NETIF_F_UFO |
+NETIF_F_TSO_ECN | NETIF_F_TSO6;
+   }
+
if (strchr(dev->name, '%')) {
err = dev_alloc_name(dev, dev->name);
if (err < 0)
@@ -537,6 +744,21 @@ static int tun_set_iff(struct file *file, struct ifreq 
*ifr)
else
tun->flags &= ~TUN_ONE_QUEUE;
 
+   if (ifr->ifr_flags & IFF_VIRTIO_HDR)
+   tun->flags |= TUN_VIRTIO_HDR;
+   else
+   tun->flags &= ~TUN_VIRTIO_HDR;
+
+   if (ifr->ifr_flags & IFF_RECV_CSUM)
+   tun->flags |= TUN_RECV_CSUM;
+   else
+   tun->flags &= ~TUN_RECV_CSUM;
+
+   if (ifr->ifr_flags & IFF_RECV_GSO)
+   tun->flags |= TUN_RECV_GSO;
+   else
+   tun->flags &= ~TUN_RECV_GSO;
+
file->private_data = tun;
tun->attached = 1;
 
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 72f1c5f..3dbef10 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -72,6 +72,9 @@ struct tun_struct {
 #define TUN_NO_PI  0x0040
 #define TUN_ONE_QUEUE  0x0080
 #define TUN_PERSIST0x0100  
+#define TUN_VIRTIO_HDR 0x0200
+#define TUN_RECV_CSUM  0x0400
+#define TUN_RECV_GSO   0x0400
 
 /* Ioctl defines */
 #define TUNSETNOCSUM  _IOW('T', 200, int) 
@@ -87,6 +90,9 @@ struct tun_struct {
 #define IFF_TAP0x0002
 #define IFF_NO_PI  0x1000
 #define IFF_ONE_QUEUE  0x2000
+#define IFF_VIRTIO_HDR 0x4000
+#define IFF_RECV_CSUM  0x8000
+#define IFF_RECV_GSO   0x0800
 
 struct tun_pi {
unsigned short flags;

-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[2/6] [TUN]: Add GSO detection

2008-04-17 Thread Herbert Xu
This is Rusty's second patch to tun to allow user-space access
to the GSO feature in a backwards compatible way.

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 34a03ec..4c15dc4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -800,6 +800,15 @@ static int tun_chr_ioctl(struct inode *inode, struct file 
*file,
return 0;
}
 
+   if (cmd == TUNGETFEATURES) {
+   /* Currently this just means: "what IFF flags are valid?".
+* This is needed because we never checked for invalid flags on
+* TUNSETIFF.  This was introduced with IFF_GSO_HDR, so if a
+* kernel doesn't have this ioctl, it doesn't have GSO header
+* support. */
+   return put_user(IFF_ALL_FLAGS, (unsigned int __user*)argp);
+   }
+
if (!tun)
return -EBADFD;
 
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 3dbef10..fe6855d 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -84,6 +84,7 @@ struct tun_struct {
 #define TUNSETOWNER   _IOW('T', 204, int)
 #define TUNSETLINK_IOW('T', 205, int)
 #define TUNSETGROUP   _IOW('T', 206, int)
+#define TUNGETFEATURES _IOR('T', 207, unsigned int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN0x0001
@@ -93,6 +94,8 @@ struct tun_struct {
 #define IFF_VIRTIO_HDR 0x4000
 #define IFF_RECV_CSUM  0x8000
 #define IFF_RECV_GSO   0x0800
+#define IFF_ALL_FLAGS (IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | \
+  IFF_VIRTIO_HDR | IFF_RECV_CSUM | IFF_RECV_GSO)
 
 struct tun_pi {
unsigned short flags;
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[3/6] [TUN]: Fix GSO mapping

2008-04-17 Thread Herbert Xu
This patch avoids the correctness issue on the user-space mapping
by just copying the memory.

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 4c15dc4..d75cfd2 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -62,6 +62,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -257,65 +258,55 @@ static struct sk_buff *copy_user_skb(size_t align, struct 
iovec *iv, size_t len)
 }
 
 /* This will fail if they give us a crazy iovec, but that's their own fault. */
-static int get_user_skb_frags(const struct iovec *iv, size_t count,
- struct skb_frag_struct *f)
+static int get_user_skb_frags(struct iovec *iv, size_t count,
+ struct skb_shared_info *sinfo)
 {
-   unsigned int i, j, num_pg = 0;
+   struct skb_frag_struct *f = sinfo->frags;
+   unsigned int i;
int err;
-   struct page *pages[MAX_SKB_FRAGS];
 
-   down_read(¤t->mm->mmap_sem);
+   f->page = NULL;
+
for (i = 0; i < count; i++) {
-   int n, npages;
-   unsigned long base, len;
-   base = (unsigned long)iv[i].iov_base;
-   len = (unsigned long)iv[i].iov_len;
+   unsigned int len = iv[i].iov_len;
 
-   if (len == 0)
-   continue;
+   while (len) {
+   void *virt;
+   unsigned int copy;
 
-   /* How many pages will this take? */
-   npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
-   if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
-   err = -ENOSPC;
-   goto fail;
-   }
-   n = get_user_pages(current, current->mm, base, npages,
-  0, 0, pages, NULL);
-   if (unlikely(n < 0)) {
-   err = n;
-   goto fail;
-   }
+   if (!f->page) {
+   f->page = alloc_page(GFP_KERNEL |
+__GFP_HIGHMEM);
+   if (!f->page)
+   return -ENOMEM;
 
-   /* Transfer pages to the frag array */
-   for (j = 0; j < n; j++) {
-   f[num_pg].page = pages[j];
-   if (j == 0) {
-   f[num_pg].page_offset = offset_in_page(base);
-   f[num_pg].size = min(len, PAGE_SIZE -
-f[num_pg].page_offset);
-   } else {
-   f[num_pg].page_offset = 0;
-   f[num_pg].size = min(len, PAGE_SIZE);
+   f->page_offset = 0;
+   f->size = 0;
+   sinfo->nr_frags++;
}
-   len -= f[num_pg].size;
-   base += f[num_pg].size;
-   num_pg++;
-   }
 
-   if (unlikely(n != npages)) {
-   err = -EFAULT;
-   goto fail;
+   copy = PAGE_SIZE - f->size;
+   if (copy > len)
+   copy = len;
+
+   virt = kmap_atomic(f->page, KM_USER0);
+   err = memcpy_fromiovec(virt + f->size, iv, copy);
+   kunmap_atomic(virt, KM_USER0);
+
+   if (err)
+   return err;
+
+   f->size += copy;
+   if (f->size == PAGE_SIZE) {
+   if (sinfo->nr_frags >= MAX_SKB_FRAGS)
+   return -EMSGSIZE;
+   (++f)->page = NULL;
+   }
+   len -= copy;
}
}
-   up_read(¤t->mm->mmap_sem);
-   return num_pg;
 
-fail:
-   for (i = 0; i < num_pg; i++)
-   put_page(f[i].page);
-   up_read(¤t->mm->mmap_sem);
-   return err;
+   return 0;
 }
 
 
@@ -360,13 +351,13 @@ static struct sk_buff *map_user_skb(const struct 
virtio_net_hdr *gso,
goto fail;
}
 
-   err = get_user_skb_frags(iv, count, sinfo->frags);
+   err = get_user_skb_frags(iv, count, sinfo);
if (err < 0)
goto fail;
 
-   sinfo->nr_frags = err;
skb->len += len;
skb->data_len += len;
+   skb->truesize += len;

return skb;
 
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apan

[4/6] [KVM] virtio-net: Add SG/GSO support

2008-04-17 Thread Herbert Xu
-36,6 +36,7 @@ void do_info_network(void);
 
 /* virtio hack for zero copy receive */
 int hack_around_tap(void *opaque);
+int tap_has_gso(void *opaque);
 
 /* NIC info */
 
diff --git a/qemu/vl.c b/qemu/vl.c
index 21c9b53..3f10d0a 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -3962,6 +3962,7 @@ typedef struct TAPState {
 int fd;
 char down_script[1024];
 int no_poll;
+int gso;
 } TAPState;
 
 static int tap_read_poll(void *opaque)
@@ -4019,6 +4020,14 @@ int hack_around_tap(void *opaque)
 return -1;
 }
 
+int tap_has_gso(void *opaque)
+{
+VLANClientState *vc = opaque;
+TAPState *ts = vc->opaque;
+
+return ts ? ts->gso : 0;
+}
+
 /* fd support */
 
 static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
@@ -4038,7 +4047,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
 }
 
 #if defined (_BSD) || defined (__FreeBSD_kernel__)
-static int tap_open(char *ifname, int ifname_size)
+static int tap_open(char *ifname, int ifname_size, int *gso)
 {
 int fd;
 char *dev;
@@ -4180,7 +4189,7 @@ int tap_alloc(char *dev)
 return tap_fd;
 }
 
-static int tap_open(char *ifname, int ifname_size)
+static int tap_open(char *ifname, int ifname_size, int *gso)
 {
 char  dev[10]="";
 int fd;
@@ -4193,18 +4202,30 @@ static int tap_open(char *ifname, int ifname_size)
 return fd;
 }
 #else
-static int tap_open(char *ifname, int ifname_size)
+static int tap_open(char *ifname, int ifname_size, int *gso)
 {
 struct ifreq ifr;
 int fd, ret;
+unsigned int features;
 
 TFR(fd = open("/dev/net/tun", O_RDWR));
 if (fd < 0) {
 fprintf(stderr, "warning: could not open /dev/net/tun: no virtual 
network emulation\n");
 return -1;
 }
+
+if (ioctl(fd, TUNGETFEATURES, &features))
+   features = IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE;
+
 memset(&ifr, 0, sizeof(ifr));
 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+
+if (features & IFF_VIRTIO_HDR && features & IFF_RECV_CSUM &&
+   features & IFF_RECV_GSO) {
+   *gso = 1;
+   ifr.ifr_flags |= IFF_VIRTIO_HDR | IFF_RECV_CSUM | IFF_RECV_GSO;
+}
+
 if (ifname[0] != '\0')
 pstrcpy(ifr.ifr_name, IFNAMSIZ, ifname);
 else
@@ -4262,13 +4283,15 @@ static int net_tap_init(VLANState *vlan, const char 
*ifname1,
 {
 TAPState *s;
 int fd;
+int gso;
 char ifname[128];
 
 if (ifname1 != NULL)
 pstrcpy(ifname, sizeof(ifname), ifname1);
 else
 ifname[0] = '\0';
-TFR(fd = tap_open(ifname, sizeof(ifname)));
+gso = 0;
+TFR(fd = tap_open(ifname, sizeof(ifname), &gso));
 if (fd < 0)
 return -1;
 
@@ -4281,6 +4304,8 @@ static int net_tap_init(VLANState *vlan, const char 
*ifname1,
 s = net_tap_fd_init(vlan, fd);
 if (!s)
 return -1;
+
+s->gso = gso;
 snprintf(s->vc->info_str, sizeof(s->vc->info_str),
  "tap: ifname=%s setup_script=%s", ifname, setup_script);
 if (down_script && strcmp(down_script, "no"))
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[5/6] [VIRTIO] net: Add ethtool ops for SG/GSO

2008-04-17 Thread Herbert Xu
This patch adds some basic ethtool operations to virtio_net so
I could test SG without GSO (which was really useful because TSO
turned out to be buggy :)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b58472c..0b508bb 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -19,6 +19,7 @@
 //#define DEBUG
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -333,6 +334,33 @@ static int virtnet_close(struct net_device *dev)
return 0;
 }
 
+static int virtnet_set_tx_csum(struct net_device *dev, u32 data)
+{
+   struct virtnet_info *vi = netdev_priv(dev);
+   struct virtio_device *vdev = vi->vdev;
+
+   if (data && !vdev->config->feature(vdev, VIRTIO_NET_F_CSUM))
+   return -ENOSYS;
+
+   return ethtool_op_set_tx_hw_csum(dev, data);
+}
+
+static struct ethtool_ops virtnet_ethtool_ops =
+{
+   .set_tx_csum = virtnet_set_tx_csum,
+   .set_sg = ethtool_op_set_sg,
+};
+
+static int virtnet_change_mtu(struct net_device *dev, int mtu)
+{
+   int max = 65535 - ETH_HLEN;
+
+   if (mtu > max)
+   return -EINVAL;
+   dev->mtu = mtu;
+   return 0;
+}
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
int err;
@@ -348,10 +376,12 @@ static int virtnet_probe(struct virtio_device *vdev)
dev->open = virtnet_open;
dev->stop = virtnet_close;
dev->hard_start_xmit = start_xmit;
+   dev->change_mtu = virtnet_change_mtu;
dev->features = NETIF_F_HIGHDMA;
 #ifdef CONFIG_NET_POLL_CONTROLLER
dev->poll_controller = virtnet_netpoll;
 #endif
+   SET_ETHTOOL_OPS(dev, &virtnet_ethtool_ops);
SET_NETDEV_DEV(dev, &vdev->dev);
 
/* Do we support "hardware" checksums? */
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[6/6] [VIRTIO] net: Allow receiving SG packets

2008-04-17 Thread Herbert Xu
Finally this patch lets virtio_net receive GSO packets in addition
to sending them.  This can definitely be optimised for the non-GSO
case.  For comparison the Xen approach stores one page in each skb
and uses subsequent skb's pages to construct an SG skb instead of
preallocating the maximum amount of pages per skb.

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b58472c..bc49057 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -73,6 +73,7 @@ static void receive_skb(struct net_device *dev, struct 
sk_buff *skb,
unsigned len)
 {
struct virtio_net_hdr *hdr = skb_vnet_hdr(skb);
+   int err;
 
if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
pr_debug("%s: short packet %i\n", dev->name, len);
@@ -80,9 +81,15 @@ static void receive_skb(struct net_device *dev, struct 
sk_buff *skb,
goto drop;
}
len -= sizeof(struct virtio_net_hdr);
-   BUG_ON(len > MAX_PACKET_LEN);
 
-   skb_trim(skb, len);
+   err = pskb_trim(skb, len);
+   if (err) {
+   pr_debug("%s: pskb_trim failed %i %d\n", dev->name, len, err);
+   dev->stats.rx_dropped++;
+   goto drop;
+   }
+   skb->truesize += skb->data_len;
+
skb->protocol = eth_type_trans(skb, dev);
pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
 ntohs(skb->protocol), skb->len, skb->pkt_type);
@@ -142,10 +149,11 @@ drop:
 static void try_fill_recv(struct virtnet_info *vi)
 {
struct sk_buff *skb;
-   struct scatterlist sg[1+MAX_SKB_FRAGS];
+   struct scatterlist sg[2+MAX_SKB_FRAGS];
int num, err;
+   int i;
 
-   sg_init_table(sg, 1+MAX_SKB_FRAGS);
+   sg_init_table(sg, 2+MAX_SKB_FRAGS);
for (;;) {
skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN);
if (unlikely(!skb))
@@ -153,6 +161,21 @@ static void try_fill_recv(struct virtnet_info *vi)
 
skb_put(skb, MAX_PACKET_LEN);
vnet_hdr_to_sg(sg, skb);
+
+   for (i = 0; i < MAX_SKB_FRAGS; i++) {
+   skb_shinfo(skb)->frags[i].page = alloc_page(GFP_ATOMIC);
+   if (!skb_shinfo(skb)->frags[i].page)
+   break;
+
+   skb_shinfo(skb)->frags[i].page_offset = 0;
+   skb_shinfo(skb)->frags[i].size = PAGE_SIZE;
+
+   skb->data_len += PAGE_SIZE;
+   skb->len += PAGE_SIZE;
+
+   skb_shinfo(skb)->nr_frags++;
+   }
+
num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
        skb_queue_head(&vi->recv, skb);
 
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [6/6] [VIRTIO] net: Allow receiving SG packets

2008-04-18 Thread Herbert Xu
On Sat, Apr 19, 2008 at 12:08:04AM +1000, Rusty Russell wrote:
> On Friday 18 April 2008 13:24:27 Herbert Xu wrote:
> > Finally this patch lets virtio_net receive GSO packets in addition
> > to sending them.  This can definitely be optimised for the non-GSO
> > case.  For comparison the Xen approach stores one page in each skb
> > and uses subsequent skb's pages to construct an SG skb instead of
> > preallocating the maximum amount of pages per skb.
> 
> Well, this ends up freeing the unused pages immediately which is actually 
> quite nice.  We can cache them ourselves if we want, but we'd have to see if 
> the page allocation/free overhead is measurable...

I was thinking of the case of a large number of these interfaces
but yeah for one interface it's no biggie :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [5/6] [VIRTIO] net: Add ethtool ops for SG/GSO

2008-04-21 Thread Herbert Xu
On Tue, Apr 22, 2008 at 05:01:46AM +1000, Rusty Russell wrote:
> On Friday 18 April 2008 13:21:42 Herbert Xu wrote:
> > +static int virtnet_change_mtu(struct net_device *dev, int mtu)
> > +{
> > +   int max = 65535 - ETH_HLEN;
> > +
> > +   if (mtu > max)
> > +   return -EINVAL;
> > +   dev->mtu = mtu;
> > +   return 0;
> > +}
> 
> Hi Herbert!
> 
> I removed this part; useful for testing, but we need a feature bit for 
> MTU 
> size in general.  And to change it on the fly either requires a reset & 
> re-init, or some protocol (and feature bit!) for renegotiating MTU.

BTW Rusty this was just a work-in-progress.  When I submit them I
will add sign-offs.

However, the MTU part should be fine as long as the other end supports
SG.  The operative word in MTU is T :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [6/6] [VIRTIO] net: Allow receiving SG packets

2008-04-21 Thread Herbert Xu
On Mon, Apr 21, 2008 at 01:04:18PM -0700, David Miller wrote:
> From: Rusty Russell <[EMAIL PROTECTED]>
> Date: Tue, 22 Apr 2008 05:06:16 +1000
> 
> > I'm not sure what the right number is here.  Say worst case is header which 
> > goes over a page boundary then MAX_SKB_FRAGS in the skb, but for some 
> > reason 
> > that already has a +2:

The +2 (i.e., extra +1) is for the virtio GSO header.
 
> skb->data is linear, therefore it's not possible to need
> more than one scatterlist entry for it.

Theoretically yes :) But for virtualisation the underlying transport
may present meta-physically contiguous memory that is physically
discrete.  So we may actually need to have multiple SG entries for
skb->data.  However, no current code path should generate packets
with both long skb->data areas *and* skb page frags so we could
just drop them if they show up.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/4] virtio: wean net driver off NETDEV_TX_BUSY

2008-04-28 Thread Herbert Xu
On Tue, Apr 29, 2008 at 01:16:05AM +1000, Rusty Russell wrote:
>
> + /* If we has a buffer left over from last time, send it now. */
> + if (vi->last_xmit_skb) {
> + if (xmit_skb(vi, vi->last_xmit_skb) != 0) {
> + /* Drop this skb: we only queue one. */
> + kfree_skb(skb);
> + goto stop_queue;

We should increment the drop counter here.

Otherwise these patches all look good to me.

Acked-by: Herbert Xu <[EMAIL PROTECTED]>

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/2] lguest: virtio-rng support

2008-05-16 Thread Herbert Xu
On Sat, May 17, 2008 at 04:28:03PM +1000, Rusty Russell wrote:
>
> But you did not address the DoS question: can we ignore it?  Or are we 
> trading 
> off a DoS in the host against a potential security weakness in the guest?

Why not do both? Use the host's urandom to make the guest at least
unpredictable on start-up without increasing the guest's entropy,
and use the host's random to actually increase the guest's entropy.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 3/3] virtio_net: Fix skb->csum_start computation

2008-05-28 Thread Herbert Xu
On Tue, May 27, 2008 at 12:20:47PM +0100, Mark McLoughlin wrote:
> hdr->csum_start is the offset from the start of the ethernet
> header to the transport layer checksum field. skb->csum_start
> is the offset from skb->head.
> 
> skb_partial_csum_set() assumes that skb->data points to the
> ethernet header - i.e. it computes skb->csum_start by adding
> the headroom to hdr->csum_start.
> 
> Since eth_type_trans() skb_pull()s the ethernet header,
> skb_partial_csum_set() should be called before
> eth_type_trans().
> 
> Signed-off-by: Mark McLoughlin <[EMAIL PROTECTED]>

Good catch! Clearly shows I never ran this across a real Ethernet
device :)

Acked-by: Herbert Xu <[EMAIL PROTECTED]>

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [3/6] [TUN]: Fix GSO mapping

2008-05-29 Thread Herbert Xu
On Thu, May 29, 2008 at 11:32:51AM +0100, Mark McLoughlin wrote:
> 
> Subject: [PATCH 1/1] tun: Do not use kmap_atomic() since memcpy_fromiovec() 
> can sleep
> 
> Signed-off-by: Mark McLoughlin <[EMAIL PROTECTED]>

Good catch!

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 3/5] virtio net: Add ethtool ops for SG/GSO

2008-07-14 Thread Herbert Xu
On Mon, Jul 14, 2008 at 10:40:49PM -0500, Rusty Russell wrote:
> From: Herbert Xu <[EMAIL PROTECTED]>
> 
> This patch adds some basic ethtool operations to virtio_net so
> I could test SG without GSO (which was really useful because TSO
> turned out to be buggy :)
> 
> Signed-off-by: Rusty Russell <[EMAIL PROTECTED]> (remove MTU setting)

Signed-off-by: Herbert Xu <[EMAIL PROTECTED]>

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 4/5] virtio net: Allow receiving SG packets

2008-07-14 Thread Herbert Xu
On Mon, Jul 14, 2008 at 10:41:38PM -0500, Rusty Russell wrote:
>
> + /* If we can receive ANY GSO packets, we must allocate large ones. */
> + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4)
> + || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)
> + || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN))
> + dev->features |= NETIF_F_LRO;

We don't want to do this because LRO implies joining packets
together in an irreversible way which is not what this is about.

Case in point we're now disabling LRO for devices that are
forwarding packets which is totally unnecessary for virtio.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 4/5] virtio net: Allow receiving SG packets

2008-07-15 Thread Herbert Xu
On Tue, Jul 15, 2008 at 06:25:04PM +1000, Rusty Russell wrote:
>
> Oops.  I grepped for LRO when I did this and found nothing.
> 
> How's this one?

Looks good.  Thanks!
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 3/4] tun: Allow GSO using virtio_net_hdr

2008-07-24 Thread Herbert Xu
On Thu, Jun 26, 2008 at 12:30:37AM +1000, Rusty Russell wrote:
> Add a IFF_VNET_HDR flag.  This uses the same ABI as virtio_net (ie. prepending
> struct virtio_net_hdr to packets) to indicate GSO and checksum information.
> 
> Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>

I just noticed that we still allocate a linear skb even when GSO
is enabled.  Please fix this by allocating page frags where
necessary.  Otherwise GSO is only going to work before memory
fragmentation sets in.

IIRC I'd sent out a patch to the virt mailing list with code
that did this.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/2] virtio_net: Improve the recv buffer allocation scheme

2008-10-09 Thread Herbert Xu
On Thu, Oct 09, 2008 at 11:55:59AM +1100, Rusty Russell wrote:
>
> There are three approaches we should investigate before adding YA feature.  
> Obviously, we can simply increase the number of ring entries.

That's not going to work so well as you need to increase the ring
size by MAX_SKB_FRAGS times to achieve the same level of effect.

Basically the current scheme is either going to suck at non-TSO
traffic or it's going to chew too much resources.

> Secondly, we can put the virtio_net_hdr at the head of the skb data (this is 
> also worth considering for xmit I think if we have headroom) and drop 
> MAX_SKB_FRAGS which contains a gratuitous +2.

That's fine but having skb->data in the ring still means two
different kinds of memory in there and it sucks when you only
have 1500-byte packets.

> Thirdly, we can try to coalesce contiguous buffers.  The page caching scheme 
> we have might help here, I don't know.  Maybe we should be explicitly trying 
> to allocate higher orders.

That's not really the key problem here.  The problem here is
that the scheme we're currently using in virtio-net is simply
broken when it comes to 1500-byte sized packets.  Most of the
entries on the ring buffer go to waste.

We need a scheme that handles both 1500-byte packets as well
as 64K-byte size ones, and without holding down 16M of memory
per guest.

> > The size of the logical buffer is
> > returned to the guest rather than the size of the individual smaller
> > buffers.
> 
> That's a virtio transport breakage: can you use the standard virtio 
> mechanism, 
> just put the extended length or number of extra buffers inside the 
> virtio_net_hdr?

Sure that sounds reasonable.

> > Make use of this support by supplying single page receive buffers to
> > the host. On receive, we extract the virtio_net_hdr, copy 128 bytes of
> > the payload to the skb's linear data buffer and adjust the fragment
> > offset to point to the remaining data. This ensures proper alignment
> > and allows us to not use any paged data for small packets. If the
> > payload occupies multiple pages, we simply append those pages as
> > fragments and free the associated skbs.
> 
> > +   char *p = page_address(skb_shinfo(skb)->frags[0].page);
> ...
> > +   memcpy(hdr, p, sizeof(*hdr));
> > +   p += sizeof(*hdr);
> 
> I think you need kmap_atomic() here to access the page.  And yes, that will 
> effect performance :(

No we don't.  kmap would only be necessary for highmem which we
did not request.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.

2008-12-15 Thread Herbert Xu
Anthony Liguori  wrote:
> 
> If we used TCP, we don't have a useful TCP/IP stack in QEMU, so we'd 
> have to inject that traffic into the host Linux instance, and then 
> receive the traffic in QEMU.  Besides being indirect, it has some nasty 
> security implications that I outlined in my response to Jeremy's last note.

When combined with namespaces I don't see why using the kernel TCP
stack would create any security problems that wouldn't otherwise
exist.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Tue, Apr 14, 2009 at 01:54:30PM +0200, Patrick McHardy wrote:
>
> I see. How about this patch instead? It moves the sk_sleep assignment
> to tun_attach, after the permission checks took place.

Thanks for pointing out this gaping hole in my patch.  I think
it may be better to remove read_wait altogether and just use
socket.wait in tun_struct.

Let me whip up a patch.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 04:36:10PM +0800, Herbert Xu wrote:
> 
> Let me whip up a patch.

tun: Fix sk_sleep races when attaching/detaching

As the sk_sleep wait queue actually lives in tfile, which may be
detached from the tun device, bad things will happen when we use
sk_sleep after detaching.

Since the tun device is the persistent data structure here (when
requested by the user), it makes much more sense to have the wait
queue live there.  There is no reason to have it in tfile at all
since the only time we can wait is if we have a tun attached.
In fact we already have a wait queue in tun_struct, so we might
as well use it.

Reported-by: Christian Borntraeger 
Reported-by: Eric W. Biederman 
Reported-by: Patrick McHardy 
Signed-off-by: Herbert Xu 

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a1b0697..412b578 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -93,7 +93,6 @@ struct tun_file {
atomic_t count;
struct tun_struct *tun;
struct net *net;
-   wait_queue_head_t   read_wait;
 };
 
 struct tun_sock;
@@ -333,7 +332,7 @@ static void tun_net_uninit(struct net_device *dev)
/* Inform the methods they need to stop using the dev.
 */
if (tfile) {
-   wake_up_all(&tfile->read_wait);
+   wake_up_all(&tun->socket.wait);
if (atomic_dec_and_test(&tfile->count))
__tun_detach(tun);
}
@@ -393,7 +392,7 @@ static int tun_net_xmit(struct sk_buff *skb, struct 
net_device *dev)
/* Notify and wake up reader process */
if (tun->flags & TUN_FASYNC)
kill_fasync(&tun->fasync, SIGIO, POLL_IN);
-   wake_up_interruptible(&tun->tfile->read_wait);
+   wake_up_interruptible(&tun->socket.wait);
return 0;
 
 drop:
@@ -490,7 +489,7 @@ static unsigned int tun_chr_poll(struct file *file, 
poll_table * wait)
 
DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);
 
-   poll_wait(file, &tfile->read_wait, wait);
+   poll_wait(file, &tun->socket.wait, wait);
 
if (!skb_queue_empty(&tun->readq))
mask |= POLLIN | POLLRDNORM;
@@ -762,7 +761,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const 
struct iovec *iv,
goto out;
}
 
-   add_wait_queue(&tfile->read_wait, &wait);
+   add_wait_queue(&tun->socket.wait, &wait);
while (len) {
current->state = TASK_INTERRUPTIBLE;
 
@@ -793,7 +792,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const 
struct iovec *iv,
}
 
current->state = TASK_RUNNING;
-   remove_wait_queue(&tfile->read_wait, &wait);
+   remove_wait_queue(&tun->socket.wait, &wait);
 
 out:
tun_put(tun);
@@ -861,7 +860,6 @@ static int tun_set_iff(struct net *net, struct file *file, 
struct ifreq *ifr)
struct sock *sk;
struct tun_struct *tun;
struct net_device *dev;
-   struct tun_file *tfile = file->private_data;
int err;
 
dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -921,11 +919,11 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
 
/* This ref count is for tun->sk. */
dev_hold(dev);
+   init_waitqueue_head(&tun->socket.wait);
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
sk->sk_destruct = tun_sock_destruct;
sk->sk_sndbuf = INT_MAX;
-   sk->sk_sleep = &tfile->read_wait;
 
tun->sk = sk;
container_of(sk, struct tun_sock, sk)->tun = tun;
@@ -1265,7 +1263,6 @@ static int tun_chr_open(struct inode *inode, struct file 
* file)
atomic_set(&tfile->count, 0);
tfile->tun = NULL;
tfile->net = get_net(current->nsproxy->net_ns);
-   init_waitqueue_head(&tfile->read_wait);
file->private_data = tfile;
return 0;
 }

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 06:23:29AM -0700, Eric W. Biederman wrote:
> 
> There is a GIGANTIC reason to have the wait queue on tfile.
> 
> If you open a file, and do ip link del tapN you can still
> be blocked waiting in poll.
> 
> The problem is specifically free_poll_entry, where we call
> remove_wait_queue and fput without calling any file methods.
> So all of this happens without struct tun_file's count being
> elevated.  Which means tun_net_uninit can detach before we get
> off of the stupid poll wait queue.

What about taking a netdev refcount before calling poll_wait?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 09:46:10PM +0800, Herbert Xu wrote:
>
> Does anything actually rely on this behaviour?

I doubt it :)

> If not we should just change it to not do that.

It appears that this was introduced in

commit c70f182940f988448f3c12a209d18b1edc276e33
Author: Eric W. Biederman 
Date:   Tue Jan 20 11:07:17 2009 +

tun: Fix races between tun_net_close and free_netdev.

Presumably in order to fix the problem of trying to unregister
the same device twice.

I what we should do is to mark the device as dead instead of
detaching if a third party deletes it.  That's all you need
to know to stop close(2) from trying the unregister a device
that's already been unregistered.

What else am I missing?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 06:35:58AM -0700, Eric W. Biederman wrote:
> 
> Because as far as I can tell we would just leak that refcount.
> 
> The poll code does not appear to call back into any of the file
> methods when it frees itself from the wait queue.

OK my suggestion was stupid.

But I still don't see how this race is possible at all.

So process A has a tun fd open and is spinning in poll(2).  Now
process B comes along and deletes that tun device.  Process A's
fd should have a netdev reference that keeps the device and
associated structures alive.

Oh I see what's going on.  We're automatically detaching the
device in uninit.  This is just wrong.  Just because process B
deleted the netdev, process A should not be involutarily detached.

Does anything actually rely on this behaviour?

If not we should just change it to not do that.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 07:06:10AM -0700, Eric W. Biederman wrote:
>
> There is the boring rmmod case that has always existed.
> 
> There is more interesting case of moving your tap device
> into another network namespace.
> 
> In which case there is the possibility of the network namespace
> exiting and destroying all of the virtual network devices before
> we close the file handle.

In that case what's the problem with holding a refcount to the
unregistered device until the process owning the fd closes it?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 07:10:32AM -0700, Eric W. Biederman wrote:
>
> Hopefully my earlier explanation helps.  I will get back to
> you as soon as I can.  But I am off on vacation for the
> rest of the week.

I'm sorry but I don't think we can wait for that.  We might just
have to fix it whatever way we can.  You can unfix it when you
come back :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 07:18:44AM -0700, Eric W. Biederman wrote:
> 
> My gut feel is that the socket needs to live in tun_file.  Instead
> of in tun_struct.  Making that change looked just tricky enough
> I couldn't sort through it when I glanced at the tun code, after I noticed
> you had added a socket.

Referring to tun_file to get sk_sleep is just too error-prone.
Unlike the transmit direction, the receive direction does not
present itself to the easy xmit lock solution that's currently
used to make tun_detach atomic.

The receive callback that currently uses sk_sleep can happen
anywhere.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 07:18:44AM -0700, Eric W. Biederman wrote:
>
> So holding the reference only blocks us indefinitely in
> netdev_wait_allrefs, blocking the network namespace exit, and holding
> net_mutex indefinitely.

OK that's a killer because process A in my previous scenario can
kill the device itself and cause a dead-lock.

So how about this? We replace the dev destructor with our own that
doesn't immediately call free_netdev.  We only call free_netdev once
all tun fd's attached to the device have been closed.

This can be achieved by simply adding a counter to tun_struct.
We'd also change the async detach path to set a marker instead
of detaching.  That marker can then be checked in places like
tun_get.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Lguest] [PATCH 4/5] lguest: use KVM hypercalls

2009-04-15 Thread Herbert Xu
On Wed, Apr 15, 2009 at 07:56:42AM -0700, Eric W. Biederman wrote:
>
> I still have the feeling that putting the socket in tun_file instead
> of in tun_struct might be conceptually cleaner, but one big blob that
> is allocated and destroyed together is certainly easier and a lot
> less racy to deal with.

As I said the difficulty with putting the socket in tun_file
is how do you get it on the RX callback path without introducing
new races.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[2/2] tun: Fix sk_sleep races when attaching/detaching

2009-04-16 Thread Herbert Xu
On Thu, Apr 16, 2009 at 07:08:18PM +0800, Herbert Xu wrote:
>
> tun: Only free a netdev when all tun descriptors are closed

With that patch we can now safely move read_wait.

tun: Fix sk_sleep races when attaching/detaching

As the sk_sleep wait queue actually lives in tfile, which may be
detached from the tun device, bad things will happen when we use
sk_sleep after detaching.

Since the tun device is the persistent data structure here (when
requested by the user), it makes much more sense to have the wait
queue live there.  There is no reason to have it in tfile at all
since the only time we can wait is if we have a tun attached.
In fact we already have a wait queue in tun_struct, so we might
as well use it.

Reported-by: Eric W. Biederman 
Tested-by: Christian Borntraeger 
Tested-by: Patrick McHardy 
Signed-off-by: Herbert Xu 

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 540f829..7cfe3d1 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -93,7 +93,6 @@ struct tun_file {
atomic_t count;
struct tun_struct *tun;
struct net *net;
-   wait_queue_head_t   read_wait;
 };
 
 struct tun_sock;
@@ -331,7 +330,7 @@ static void tun_net_uninit(struct net_device *dev)
/* Inform the methods they need to stop using the dev.
 */
if (tfile) {
-   wake_up_all(&tfile->read_wait);
+   wake_up_all(&tun->socket.wait);
if (atomic_dec_and_test(&tfile->count))
__tun_detach(tun);
}
@@ -398,7 +397,7 @@ static int tun_net_xmit(struct sk_buff *skb, struct 
net_device *dev)
/* Notify and wake up reader process */
if (tun->flags & TUN_FASYNC)
kill_fasync(&tun->fasync, SIGIO, POLL_IN);
-   wake_up_interruptible(&tun->tfile->read_wait);
+   wake_up_interruptible(&tun->socket.wait);
return 0;
 
 drop:
@@ -495,7 +494,7 @@ static unsigned int tun_chr_poll(struct file *file, 
poll_table * wait)
 
DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);
 
-   poll_wait(file, &tfile->read_wait, wait);
+   poll_wait(file, &tun->socket.wait, wait);
 
if (!skb_queue_empty(&tun->readq))
mask |= POLLIN | POLLRDNORM;
@@ -767,7 +766,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const 
struct iovec *iv,
goto out;
}
 
-   add_wait_queue(&tfile->read_wait, &wait);
+   add_wait_queue(&tun->socket.wait, &wait);
while (len) {
current->state = TASK_INTERRUPTIBLE;
 
@@ -798,7 +797,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const 
struct iovec *iv,
}
 
current->state = TASK_RUNNING;
-   remove_wait_queue(&tfile->read_wait, &wait);
+   remove_wait_queue(&tun->socket.wait, &wait);
 
 out:
tun_put(tun);
@@ -866,7 +865,6 @@ static int tun_set_iff(struct net *net, struct file *file, 
struct ifreq *ifr)
struct sock *sk;
struct tun_struct *tun;
struct net_device *dev;
-   struct tun_file *tfile = file->private_data;
int err;
 
dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -924,11 +922,11 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
if (!sk)
goto err_free_dev;
 
+   init_waitqueue_head(&tun->socket.wait);
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
sk->sk_destruct = tun_sock_destruct;
sk->sk_sndbuf = INT_MAX;
-   sk->sk_sleep = &tfile->read_wait;
 
tun->sk = sk;
container_of(sk, struct tun_sock, sk)->tun = tun;
@@ -1268,7 +1266,6 @@ static int tun_chr_open(struct inode *inode, struct file 
* file)
atomic_set(&tfile->count, 0);
tfile->tun = NULL;
tfile->net = get_net(current->nsproxy->net_ns);
-   init_waitqueue_head(&tfile->read_wait);
file->private_data = tfile;
return 0;
 }

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[1/2] tun: Only free a netdev when all tun descriptors are closed

2009-04-16 Thread Herbert Xu
On Wed, Apr 15, 2009 at 10:38:34PM +0800, Herbert Xu wrote:
> 
> So how about this? We replace the dev destructor with our own that
> doesn't immediately call free_netdev.  We only call free_netdev once
> all tun fd's attached to the device have been closed.

Here's the patch.  I'd appreciate if everyone can review it
and see if they can recreate the original race by

1) Starting a process using tun and polls on it;
2) Doing ip tun del tunXXX while the process is polling.

tun: Only free a netdev when all tun descriptors are closed

The commit c70f182940f988448f3c12a209d18b1edc276e33 ("tun: Fix
races between tun_net_close and free_netdev") fixed a race where
an asynchronous deletion of a tun device can hose a poll(2) on
a tun fd attached to that device.

However, this came at the cost of moving the tun wait queue into
the tun file data structure.  The problem with this is that it
imposes restrictions on when and where the tun device can access
the wait queue since the tun file may change at any time due to
detaching and reattaching.

In particular, now that we need to use the wait queue on the
receive path it becomes difficult to properly synchronise this
with the detachment of the tun device.

This patch solves the original race in a different way.  Since
the race is only because the underlying memory gets freed, we
can prevent it simply by ensuring that we don't do that until
all tun descriptors ever attached to the device (even if they
have since be detached because they may still be sitting in poll)
have been closed.

This is done by using reference counting the attached tun file
descriptors.  The refcount in tun->sk has been reappropriated
for this purpose since it was already being used for that, albeit
from the opposite angle.

Note that we no longer zero tfile->tun since tun_get will return
NULL anyway after the refcount on tfile hits zero.  Instead it
represents whether this device has ever been attached to a device.

Signed-off-by: Herbert Xu 

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a1b0697..540f829 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -156,6 +156,7 @@ static int tun_attach(struct tun_struct *tun, struct file 
*file)
tfile->tun = tun;
tun->tfile = tfile;
dev_hold(tun->dev);
+   sock_hold(tun->sk);
atomic_inc(&tfile->count);
 
 out:
@@ -165,11 +166,8 @@ out:
 
 static void __tun_detach(struct tun_struct *tun)
 {
-   struct tun_file *tfile = tun->tfile;
-
/* Detach from net device */
netif_tx_lock_bh(tun->dev);
-   tfile->tun = NULL;
tun->tfile = NULL;
netif_tx_unlock_bh(tun->dev);
 
@@ -339,6 +337,13 @@ static void tun_net_uninit(struct net_device *dev)
}
 }
 
+static void tun_free_netdev(struct net_device *dev)
+{
+   struct tun_struct *tun = netdev_priv(dev);
+
+   sock_put(tun->sk);
+}
+
 /* Net device open. */
 static int tun_net_open(struct net_device *dev)
 {
@@ -810,7 +815,7 @@ static void tun_setup(struct net_device *dev)
tun->group = -1;
 
dev->ethtool_ops = &tun_ethtool_ops;
-   dev->destructor = free_netdev;
+   dev->destructor = tun_free_netdev;
 }
 
 /* Trivial set of netlink ops to allow deleting tun or tap
@@ -847,7 +852,7 @@ static void tun_sock_write_space(struct sock *sk)
 
 static void tun_sock_destruct(struct sock *sk)
 {
-   dev_put(container_of(sk, struct tun_sock, sk)->tun->dev);
+   free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
 }
 
 static struct proto tun_proto = {
@@ -919,8 +924,6 @@ static int tun_set_iff(struct net *net, struct file *file, 
struct ifreq *ifr)
if (!sk)
goto err_free_dev;
 
-   /* This ref count is for tun->sk. */
-   dev_hold(dev);
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
sk->sk_destruct = tun_sock_destruct;
@@ -1275,20 +1278,18 @@ static int tun_chr_close(struct inode *inode, struct 
file *file)
struct tun_file *tfile = file->private_data;
struct tun_struct *tun = __tun_get(tfile);
 
-
if (tun) {
-   DBG(KERN_INFO "%s: tun_chr_close\n", tun->dev->name);
-
-   rtnl_lock();
-   __tun_detach(tun);
-
/* If desireable, unregister the netdevice. */
-   if (!(tun->flags & TUN_PERSIST)) {
-   sock_put(tun->sk);
-   unregister_netdevice(tun->dev);
-   }
+   if (!(tun->flags & TUN_PERSIST))
+   unregister_netdev(tun->dev);
+   else
+   tun_put(tun);
+   } else
+   tun = tfile->tun;
 
-   rtnl_unlock();
+   if (tun) {
+ 

Re: [1/2] tun: Only free a netdev when all tun descriptors are closed

2009-04-16 Thread Herbert Xu
On Thu, Apr 16, 2009 at 10:57:45PM +0300, Michael S. Tsirkin wrote:
>
> This last bit seems to make a simple test using a non-persistent tap device
> deadlock for me: we don't drop a reference acquired with __tun_get sock
> unregister_netdevice blocks printing unregister_netdevice: waiting for tap0 to
> become free. Usage count = 1.

Ah yes, I'd overlooked the fact that the original code didn't
require the tfile refcount to hit zero.  Now we do.  Here's an
updated version of the first patch.  The second patch should still
work as is.

tun: Only free a netdev when all tun descriptors are closed

The commit c70f182940f988448f3c12a209d18b1edc276e33 ("tun: Fix
races between tun_net_close and free_netdev") fixed a race where
an asynchronous deletion of a tun device can hose a poll(2) on
a tun fd attached to that device.

However, this came at the cost of moving the tun wait queue into
the tun file data structure.  The problem with this is that it
imposes restrictions on when and where the tun device can access
the wait queue since the tun file may change at any time due to
detaching and reattaching.

In particular, now that we need to use the wait queue on the
receive path it becomes difficult to properly synchronise this
with the detachment of the tun device.

This patch solves the original race in a different way.  Since
the race is only because the underlying memory gets freed, we
can prevent it simply by ensuring that we don't do that until
all tun descriptors ever attached to the device (even if they
have since be detached because they may still be sitting in poll)
have been closed.

This is done by using reference counting the attached tun file
descriptors.  The refcount in tun->sk has been reappropriated
for this purpose since it was already being used for that, albeit
from the opposite angle.

Note that we no longer zero tfile->tun since tun_get will return
NULL anyway after the refcount on tfile hits zero.  Instead it
represents whether this device has ever been attached to a device.

Signed-off-by: Herbert Xu 

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a1b0697..32ef0b2 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -156,6 +156,7 @@ static int tun_attach(struct tun_struct *tun, struct file 
*file)
tfile->tun = tun;
tun->tfile = tfile;
dev_hold(tun->dev);
+   sock_hold(tun->sk);
atomic_inc(&tfile->count);
 
 out:
@@ -165,11 +166,8 @@ out:
 
 static void __tun_detach(struct tun_struct *tun)
 {
-   struct tun_file *tfile = tun->tfile;
-
/* Detach from net device */
netif_tx_lock_bh(tun->dev);
-   tfile->tun = NULL;
tun->tfile = NULL;
netif_tx_unlock_bh(tun->dev);
 
@@ -339,6 +337,13 @@ static void tun_net_uninit(struct net_device *dev)
}
 }
 
+static void tun_free_netdev(struct net_device *dev)
+{
+   struct tun_struct *tun = netdev_priv(dev);
+
+   sock_put(tun->sk);
+}
+
 /* Net device open. */
 static int tun_net_open(struct net_device *dev)
 {
@@ -810,7 +815,7 @@ static void tun_setup(struct net_device *dev)
tun->group = -1;
 
dev->ethtool_ops = &tun_ethtool_ops;
-   dev->destructor = free_netdev;
+   dev->destructor = tun_free_netdev;
 }
 
 /* Trivial set of netlink ops to allow deleting tun or tap
@@ -847,7 +852,7 @@ static void tun_sock_write_space(struct sock *sk)
 
 static void tun_sock_destruct(struct sock *sk)
 {
-   dev_put(container_of(sk, struct tun_sock, sk)->tun->dev);
+   free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
 }
 
 static struct proto tun_proto = {
@@ -919,8 +924,6 @@ static int tun_set_iff(struct net *net, struct file *file, 
struct ifreq *ifr)
if (!sk)
goto err_free_dev;
 
-   /* This ref count is for tun->sk. */
-   dev_hold(dev);
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
sk->sk_destruct = tun_sock_destruct;
@@ -1275,20 +1278,17 @@ static int tun_chr_close(struct inode *inode, struct 
file *file)
struct tun_file *tfile = file->private_data;
struct tun_struct *tun = __tun_get(tfile);
 
-
if (tun) {
-   DBG(KERN_INFO "%s: tun_chr_close\n", tun->dev->name);
-
-   rtnl_lock();
-   __tun_detach(tun);
-
/* If desireable, unregister the netdevice. */
-   if (!(tun->flags & TUN_PERSIST)) {
-   sock_put(tun->sk);
-   unregister_netdevice(tun->dev);
-   }
+   if (!(tun->flags & TUN_PERSIST))
+   unregister_netdev(tun->dev);
+   tun_put(tun);
+   } else
+   tun = tfile->tun;
 
-   rtnl_

Re: [1/2] tun: Only free a netdev when all tun descriptors are closed

2009-04-18 Thread Herbert Xu
On Sat, Apr 18, 2009 at 10:09:39PM +0300, Michael S. Tsirkin wrote:
> 
> Does this work with TUN_PERSIST off?
> I haven't tested this, but won't unregister_netdev block forever
> waiting for device reference to become 0? Maybe you want
> 
> + tun_put(tun);
> + if (!(tun->flags & TUN_PERSIST))
> + unregister_netdev(tun->dev);
> 
> or is there a race here?

Good point.  I should've just left the old code alone as it wasn't
broken.  So here is yet another update.

I also fixed a bug on the allocation error handling path.

tun: Only free a netdev when all tun descriptors are closed

The commit c70f182940f988448f3c12a209d18b1edc276e33 ("tun: Fix
races between tun_net_close and free_netdev") fixed a race where
an asynchronous deletion of a tun device can hose a poll(2) on
a tun fd attached to that device.

However, this came at the cost of moving the tun wait queue into
the tun file data structure.  The problem with this is that it
imposes restrictions on when and where the tun device can access
the wait queue since the tun file may change at any time due to
detaching and reattaching.

In particular, now that we need to use the wait queue on the
receive path it becomes difficult to properly synchronise this
with the detachment of the tun device.

This patch solves the original race in a different way.  Since
the race is only because the underlying memory gets freed, we
can prevent it simply by ensuring that we don't do that until
all tun descriptors ever attached to the device (even if they
have since be detached because they may still be sitting in poll)
have been closed.

This is done by using reference counting the attached tun file
descriptors.  The refcount in tun->sk has been reappropriated
for this purpose since it was already being used for that, albeit
from the opposite angle.

Note that we no longer zero tfile->tun since tun_get will return
NULL anyway after the refcount on tfile hits zero.  Instead it
represents whether this device has ever been attached to a device.

Signed-off-by: Herbert Xu 

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 16716ae..95ae40a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -156,6 +156,7 @@ static int tun_attach(struct tun_struct *tun, struct file 
*file)
tfile->tun = tun;
tun->tfile = tfile;
dev_hold(tun->dev);
+   sock_hold(tun->sk);
atomic_inc(&tfile->count);
 
 out:
@@ -165,11 +166,8 @@ out:
 
 static void __tun_detach(struct tun_struct *tun)
 {
-   struct tun_file *tfile = tun->tfile;
-
/* Detach from net device */
netif_tx_lock_bh(tun->dev);
-   tfile->tun = NULL;
tun->tfile = NULL;
netif_tx_unlock_bh(tun->dev);
 
@@ -339,6 +337,13 @@ static void tun_net_uninit(struct net_device *dev)
}
 }
 
+static void tun_free_netdev(struct net_device *dev)
+{
+   struct tun_struct *tun = netdev_priv(dev);
+
+   sock_put(tun->sk);
+}
+
 /* Net device open. */
 static int tun_net_open(struct net_device *dev)
 {
@@ -811,7 +816,7 @@ static void tun_setup(struct net_device *dev)
tun->group = -1;
 
dev->ethtool_ops = &tun_ethtool_ops;
-   dev->destructor = free_netdev;
+   dev->destructor = tun_free_netdev;
 }
 
 /* Trivial set of netlink ops to allow deleting tun or tap
@@ -848,7 +853,7 @@ static void tun_sock_write_space(struct sock *sk)
 
 static void tun_sock_destruct(struct sock *sk)
 {
-   dev_put(container_of(sk, struct tun_sock, sk)->tun->dev);
+   free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
 }
 
 static struct proto tun_proto = {
@@ -920,11 +925,8 @@ static int tun_set_iff(struct net *net, struct file *file, 
struct ifreq *ifr)
if (!sk)
goto err_free_dev;
 
-   /* This ref count is for tun->sk. */
-   dev_hold(dev);
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
-   sk->sk_destruct = tun_sock_destruct;
sk->sk_sndbuf = INT_MAX;
sk->sk_sleep = &tfile->read_wait;
 
@@ -942,11 +944,13 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
err = -EINVAL;
err = register_netdevice(tun->dev);
if (err < 0)
-   goto err_free_dev;
+   goto err_free_sk;
+
+   sk->sk_destruct = tun_sock_destruct;
 
err = tun_attach(tun, file);
if (err < 0)
-   goto err_free_dev;
+   goto failed;
}
 
DBG(KERN_INFO "%s: tun_set_iff\n", tun->dev->name);
@@ -1284,14 +1288,16 @@ static int tun_chr_close(struct inode *inode, stru

Re: [2/2] tun: Fix sk_sleep races when attaching/detaching

2009-04-20 Thread Herbert Xu
On Thu, Apr 16, 2009 at 07:09:52PM +0800, Herbert Xu wrote:
> 
> tun: Fix sk_sleep races when attaching/detaching

That patch doesn't apply anymore because of contextual changes
caused by the first patch.  Here's an update.

tun: Fix sk_sleep races when attaching/detaching

As the sk_sleep wait queue actually lives in tfile, which may be
detached from the tun device, bad things will happen when we use
sk_sleep after detaching.

Since the tun device is the persistent data structure here (when
requested by the user), it makes much more sense to have the wait
queue live there.  There is no reason to have it in tfile at all
since the only time we can wait is if we have a tun attached.
In fact we already have a wait queue in tun_struct, so we might
as well use it.

Reported-by: Eric W. Biederman 
Tested-by: Christian Borntraeger 
Tested-by: Patrick McHardy 
Signed-off-by: Herbert Xu 

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 95ae40a..735bf41 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -93,7 +93,6 @@ struct tun_file {
atomic_t count;
struct tun_struct *tun;
struct net *net;
-   wait_queue_head_t   read_wait;
 };
 
 struct tun_sock;
@@ -331,7 +330,7 @@ static void tun_net_uninit(struct net_device *dev)
/* Inform the methods they need to stop using the dev.
 */
if (tfile) {
-   wake_up_all(&tfile->read_wait);
+   wake_up_all(&tun->socket.wait);
if (atomic_dec_and_test(&tfile->count))
__tun_detach(tun);
}
@@ -398,7 +397,7 @@ static int tun_net_xmit(struct sk_buff *skb, struct 
net_device *dev)
/* Notify and wake up reader process */
if (tun->flags & TUN_FASYNC)
kill_fasync(&tun->fasync, SIGIO, POLL_IN);
-   wake_up_interruptible(&tun->tfile->read_wait);
+   wake_up_interruptible(&tun->socket.wait);
return 0;
 
 drop:
@@ -495,7 +494,7 @@ static unsigned int tun_chr_poll(struct file *file, 
poll_table * wait)
 
DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);
 
-   poll_wait(file, &tfile->read_wait, wait);
+   poll_wait(file, &tun->socket.wait, wait);
 
if (!skb_queue_empty(&tun->readq))
mask |= POLLIN | POLLRDNORM;
@@ -768,7 +767,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const 
struct iovec *iv,
goto out;
}
 
-   add_wait_queue(&tfile->read_wait, &wait);
+   add_wait_queue(&tun->socket.wait, &wait);
while (len) {
current->state = TASK_INTERRUPTIBLE;
 
@@ -799,7 +798,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const 
struct iovec *iv,
}
 
current->state = TASK_RUNNING;
-   remove_wait_queue(&tfile->read_wait, &wait);
+   remove_wait_queue(&tun->socket.wait, &wait);
 
 out:
tun_put(tun);
@@ -867,7 +866,6 @@ static int tun_set_iff(struct net *net, struct file *file, 
struct ifreq *ifr)
struct sock *sk;
struct tun_struct *tun;
struct net_device *dev;
-   struct tun_file *tfile = file->private_data;
int err;
 
dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -925,10 +923,10 @@ static int tun_set_iff(struct net *net, struct file 
*file, struct ifreq *ifr)
if (!sk)
goto err_free_dev;
 
+   init_waitqueue_head(&tun->socket.wait);
sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space;
sk->sk_sndbuf = INT_MAX;
-   sk->sk_sleep = &tfile->read_wait;
 
tun->sk = sk;
container_of(sk, struct tun_sock, sk)->tun = tun;
@@ -1270,7 +1268,6 @@ static int tun_chr_open(struct inode *inode, struct file 
* file)
atomic_set(&tfile->count, 0);
tfile->tun = NULL;
tfile->net = get_net(current->nsproxy->net_ns);
-   init_waitqueue_head(&tfile->read_wait);
file->private_data = tfile;
return 0;
 }

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [2/2] tun: Fix sk_sleep races when attaching/detaching

2009-04-20 Thread Herbert Xu
On Mon, Apr 20, 2009 at 02:26:35AM -0700, David Miller wrote:
> 
> Do you think these two patches are ready to go into net-2.6
> now?

I think so.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [1/2] tun: Only free a netdev when all tun descriptors are closed

2009-04-24 Thread Herbert Xu
On Fri, Apr 24, 2009 at 10:55:49AM +0200, Christian Borntraeger wrote:
> Am Thursday 16 April 2009 13:08:18 schrieb Herbert Xu:
>
> > Here's the patch.  I'd appreciate if everyone can review it
> > and see if they can recreate the original race by
> > 
> > 1) Starting a process using tun and polls on it;
> > 2) Doing ip tun del tunXXX while the process is polling.
> > 
> > tun: Only free a netdev when all tun descriptors are closed
> > 
> > The commit c70f182940f988448f3c12a209d18b1edc276e33 ("tun: Fix
> > races between tun_net_close and free_netdev") fixed a race where
> > an asynchronous deletion of a tun device can hose a poll(2) on
> > a tun fd attached to that device.
> 
> Sorry for the late reply, but this patch creates another regression:
> Now TUNSETIFF returns EBUSY on a persistent tap device:
> 
> open("/dev/net/tun", O_RDWR) = 11
> ioctl(11, 0x400454ca, 0x38e2270) = -1 EBUSY (Device or resource busy)

The patch you qouted has been superceded by many versions :)

Can you please test the latest that's in davem's tree?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-02 Thread Herbert Xu
On Fri, May 29, 2009 at 11:46:04PM +0930, Rusty Russell wrote:
> 
> This effectively reverts 99ffc696d10b28580fe93441d627cf290ac4484c
> "virtio: wean net driver off NETDEV_TX_BUSY".
> 
> The complexity of queuing an skb (setting a tasklet to re-xmit) is
> questionable, especially once we get rid of the other reason for the
> tasklet in the next patch.
> 
> If the skb won't fit in the tx queue, just return NETDEV_TX_BUSY.  It
> might be frowned upon, but it's common and not going away any time
> soon.

This looks like a step backwards to me.  If we have to do this
to fix a bug, sure.  But just doing it for the sake of it smells
wrong.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-02 Thread Herbert Xu
On Tue, Jun 02, 2009 at 11:25:57PM +0930, Rusty Russell wrote:
>
> Or, we could just "return NETDEV_TX_BUSY;".  I like that :)

No you should fix it so that you check the queue status after
transmitting a packet so we never get into this state in the
first place.  NETDEV_TX_BUSY is just passing the problem to
someone else, which is not nice at all.

For example, anyone running tcpdump will now see the packet
twice.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-07 Thread Herbert Xu
On Wed, Jun 03, 2009 at 12:47:04PM +0930, Rusty Russell wrote:
> 
> We could figure out if we can take the worst-case packet, and underutilize
> our queue.  And fix the other *67* drivers.

Most of those are for debugging purposes, i.e., they'll never
happen unless the driver is buggy.

> Of course that doesn't even work, because we return NETDEV_TX_BUSY from dev.c!

If and when your driver becomes part of the core and it has to
feed into other drivers, then you can use this argument :)

> "Hi, core netdevs here.  Don't use NETDEV_TX_BUSY.   Yeah, we can't figure out
> how to avoid it either.  But y'know, just hack something together".

No you've misunderstood my complaint.  I'm not trying to get you
to replace NETDEV_TX_BUSY by the equally abhorrent queue in the
driver, I'm saying that you should stop the queue before you get
a packet that overflows by looking at the amount of free queue
space after transmitting each packet.

For most drivers this is easy to do.  What's so different about
virtio-net that makes this impossible?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-13 Thread Herbert Xu
On Sat, Jun 13, 2009 at 10:00:37PM +0930, Rusty Russell wrote:
> 
> But re your comment that the 67 drivers using TX_BUSY are doing it because of 
> driver bugs, that's hard to believe.  It either hardly ever happens (in which 
> case just drop the packet), or it happens (in which case we should handle it 
> correctly).

Most of them just do this:

start_xmit:

if (unlikely(queue is full)) {
/* This should never happen. */
return TX_BUSY;
}

transmit

if (queue is full)
stop queue

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-18 Thread Herbert Xu
On Thu, Jun 18, 2009 at 04:47:50PM +0930, Rusty Russell wrote:
> 
> One question: can netif_queue_stopped() ever be true inside start_xmit?  Some 
> drivers test that, like sun3lance.c.

The driver should never test that because even if it is true
due to driver shutdown the xmit function should ignore it as
the upper layer will wait for it anyway.

> Summary: we still have about 54 in-tree drivers which actually use 
> NETDEV_TX_BUSY for normal paths.  Can I fix it now?

You can fix it but I don't quite understand your results below :)

> sungem.c: Y, N

This driver does the bug check in addition to a race check that
should simply drop the packet instead of queueing.  In fact chances
are the race check is unnecessary anyway.

> fs_enet: N

This is either just a bug check or the driver is broken in that
it should stop the queue when the said condition can be true.

> mace.c: N

Just a bug check.

> sh_eth.c: Y

This driver should check the queue after transmitting, just like
virtio-net :)

So from a totally non-representative sample of 4, my conclusion
is that none of them need TX_BUSY.  Do you have an example that
really needs it?

Anyway, I don't think we should reshape our APIs based on how
broken the existing users are.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-18 Thread Herbert Xu
On Fri, Jun 19, 2009 at 01:07:19PM +0930, Rusty Russell wrote:
>
> You didn't comment on my patch which tried to fix NETDEV_TX_BUSY tho?

I think fixing it would only encourage more drivers to use and
abuse TX_BUSY.  The fundamental problem with TX_BUSY is that
you're doing the check before transmitting a packet instead of
after transmitting it.

Let me explain why this is wrong, beyond the fact that tcpdump
may see the packet twice which you've tried to fix.  The problem
is that requeueing is fundamentally hard.  We use to have this
horrible logic in the schedulers to handle this.  Thankfully that
seems to have been replaced with a single device-level packet
holder shared with GSO.

However, that is still wrong for many packet schedulers.  For
example, if the requeued packet is of a lower priority, and a
higher priority packet comes along, we want the higher priority
packet to preempt the requeued packet.  Right now it just doesn't
happen.

This is not as trivial as it seems because on a busy host this can
happen many times a second.  With TX_BUSY the QoS guarantees are
simply not workable.

BTW you pointed out that GSO also uses TX_BUSY, but that is
different because the packet schedulers treat a GSO packet as
a single entity so there is no issue of preemption.  Also tcpdump
will never see it twice by design.

> e1000/e1000_main.c: fifo bug workaround?

The workaround should work just as well as a stop-queue check
after packet transmission.

> ehea/ehea_main.c: ?

Ahh! The bastard LLTX drivers are still around.  LLTX was the
worst abuse associated with TX_BUSY.  Thankfully not many of them
are left.

The fix is to not use LLTX and use the xmit_lock like normal
drivers.

> starfire.c: "we may not have enough slots even when it seems we do."?

Just replace skb_num_frags with SKB_MAX_FRAGS and move the check
after the transmit.

> tg3.c: tg3_gso_bug

A better solution would in fact be to disable hardware TSO when
we encounter such a packet (and drop the first one).

Because once you get one you're likely to get a lot more.  The
difference between hardware TSO and GSO on a card like tg3 is
negligible anyway.

Alternatively just disable TSO completely on those chips.

Ccing the tg3 maintainer.
 
> We provided an API, people used it.  Constantly trying to disclaim our 
> responsibility for the resulting mess makes me fucking ANGRY.

Where have I disclaimed responsibility? If we were doing that
then we wouldn't be having this discussion.

> We either remove the API, or fix it.  I think fixing it is better, because my 
> driver will be simpler and it's obvious noone wants to rewrite 50 drivers and 
> break several of them.

My preference is obviously in the long term removal of TX_BUSY.
Due to resource constraints that cannot be done immediately.  So
at least we should try to stop its proliferation.

BTW removing TX_BUSY does not mean that your driver has to stay
complicated.  As I have said repeatedly your driver should be
checking the stop-queue condition after transmission, not before.

In fact queueing it in the driver is just as bad as return TX_BUSY!

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-19 Thread Herbert Xu
On Fri, Jun 19, 2009 at 11:20:44PM +0930, Rusty Russell wrote:
>
> Your use of the word guarantee here indicates an idealized concept of QoS 
> which cannot exist on any NIC which has a queue.  We should try to approach 
> the ideal, but understand we cannot reach it.

I'm just pointing out that it's better to not have to do this.

Since TX_BUSY and requeueing the packet is unnecessary in the
first place because we can avoid it by managing the stop-queue
action properly, there is no reason to do this because of its
downsides.

> On the other hand, we're underutilizing the queue to avoid it.  I find that a 
> little embarrassing.

Here's why I think this is not an issue.  If your NIC is high
bandwidth then your ring is going to have to be huge so the
amount that is underutilised (a 64K packet) is tiny.  If your
NIC is low bandwidth then this is where you often need QoS and
in that case you do *NOT* want to fully utilise the HW queue.

> I feel we're being horribly deceptive by giving them a nice API, and upon 
> review, telling them "don't use that".  And it's been ongoing for far too 
> long.

If you look at our API documentation it actually says:

Return codes:
o NETDEV_TX_OK everything ok.
o NETDEV_TX_BUSY Cannot transmit packet, try later
  Usually a bug, means queue start/stop flow control is broken in
  the driver. Note: the driver must NOT put the skb in its DMA ring.
o NETDEV_TX_LOCKED Locking failed, please retry quickly.
  Only valid when NETIF_F_LLTX is set.

So I don't feel too bad :)

> > In fact queueing it in the driver is just as bad as return TX_BUSY!
> 
> Agreed (modulo the tcpdump issue).  And worse, because it's ugly and complex!

The right solution is to stop the queue properly.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-22 Thread Herbert Xu
On Mon, Jun 22, 2009 at 11:16:03AM +0530, Krishna Kumar2 wrote:
>
> I was curious about "queueing it in the driver" part: why is this bad? Do
> you
> anticipate any performance problems, or does it break QoS, or something
> else I
> have missed?

Queueing it in the driver is bad because it is no different than
queueing it at the upper layer, which is what will happen when
you return TX_BUSY.

Because we've ripped out the qdisc requeueing logic (which is
horribly complex and only existed because of TX_BUSY), it means
that higher priority packets cannot preempt a packet that is queued
in this way.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] virtio_net: return NETDEV_TX_BUSY instead of queueing an extra skb.

2009-06-22 Thread Herbert Xu
On Mon, Jun 22, 2009 at 11:25:29AM -0700, Matt Carlson wrote:
>
> As was said elsewhere, from the driver writer's perspective every packet
> that has already been submitted (queued) to the hardware cannot be
> preempted.  Slightly extending that logic doesn't seem like that much of
> a problem, especially if it saves the troublesome requeuing logic higher up.

I think this is pretty much the same as returning TX_BUSY :)

Do you have access to one of these buggy cards? Can you run some
tests to see what the difference between TSO and GSO is on them?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/4] net: skb_orphan on dev_hard_start_xmit

2009-07-03 Thread Herbert Xu
David Miller  wrote:
>
>> I think I'll do this in the driver for now, and let's revisit doing it 
>> generically later?
> 
> That might be the best course of action for the time being.
> This whole area is a rat's nest.

Calling skb_orphan like this should be forbidden.  Apart from the
problems already raised, it is a sign that the driver is trying to
paper over a more serious issue of not cleaning up skb's timely.

Yes skb_orphan will work for the cases where calling the skb
destructor allows forward progress, but for the cases where you
really need to the skb to be freed (e.g., iSCSI or Xen), this
simply doesn't work.

So anytime someone tries to propose such a solution it is a sign
that they have bigger problems.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/4] net: skb_orphan on dev_hard_start_xmit

2009-07-03 Thread Herbert Xu
On Fri, Jul 03, 2009 at 08:02:54PM -0700, David Miller wrote:
>
> In particular the case of handling a device without usable TX
> completion event indications is still quite troublesome.

Which particular devices do you have in mind?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/4] net: skb_orphan on dev_hard_start_xmit

2009-07-04 Thread Herbert Xu
On Fri, Jul 03, 2009 at 08:13:47PM -0700, David Miller wrote:
>
> NIU
> 
> I basically can't defer interrupts because the chip supports
> per-TX-desc interrupt indications but it lacks an "all TX queue sent"
> event.  So if, say, tell it to interrupt every 1/4 of the TX queue
> then up to 1/4 of the queue can have packets "stuck" in there
> if TX activity all of a sudden ceases.

Here's an idea: We let the sender decide whether we need to enable
notification.  This decision would be carried as a flag in the skb.
For example, UDP would set this flag when its socket buffer is close
to capacity.  Routing would set this flag per NAPI run, etc.

Of course you'd ignore this flag completely if the qdisc queue is
non-empty.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/4] net: skb_orphan on dev_hard_start_xmit

2009-07-04 Thread Herbert Xu
On Sat, Jul 04, 2009 at 03:42:45PM +0800, Herbert Xu wrote:
> 
> Here's an idea: We let the sender decide whether we need to enable
> notification.  This decision would be carried as a flag in the skb.
> For example, UDP would set this flag when its socket buffer is close
> to capacity.  Routing would set this flag per NAPI run, etc.

Actually it doesn't even matter for routing because only those
that are charged by the skb's or the pages care and they're the
only ones that would need to set this.

One potential problem is if the socket is constantly running
close to capacity, but that should only happen if the device
TX queue is also close to capacity which means that the qdisc
queue should be non-empty.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/4] net: skb_orphan on dev_hard_start_xmit

2009-07-04 Thread Herbert Xu
On Sat, Jul 04, 2009 at 05:09:10PM +0800, Herbert Xu wrote:
>
> One potential problem is if the socket is constantly running
> close to capacity, but that should only happen if the device
> TX queue is also close to capacity which means that the qdisc
> queue should be non-empty.

Here's a another crazy idea:

Let's use dummy TX descriptors to generate an interrupt, either
with or without transmitting an actual packet on the wire depending
on the NIC.

xmit(skb)

if (TX queue contains no interrupting descriptor &&
qdisc is empty)
mark TX descriptor as interrupting

clean()

do work
if (TX queue contains no interrupting descriptor &&
TX queue non-empty &&
qdisc is empty)
send dummy TX descriptor

This is based on the assumption that in the time it takes for
the NIC to process one packet and interrupt us, we can generate
more packets.  Obviously if we can't then even if the NIC had
a queue-empty interrupt capability it wouldn't help.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/4] net: skb_orphan on dev_hard_start_xmit

2009-07-04 Thread Herbert Xu
On Sun, Jul 05, 2009 at 11:26:58AM +0800, Herbert Xu wrote:
> 
> Here's a another crazy idea:
> 
> Let's use dummy TX descriptors to generate an interrupt, either
> with or without transmitting an actual packet on the wire depending
> on the NIC.

Here's an even crazier idea that doesn't use dummy descriptors.

xmit(skb)

if (TX queue contains no interrupting descriptor &&
qdisc is empty)
mark TX descriptor as interrupting

if (TX queue now contains an interrupting descriptor &&
qdisc len < 2)
stop queue

if (TX ring full)
stop queue

clean()

do work
wake queue as per usual

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/4] net: skb_orphan on dev_hard_start_xmit

2009-08-18 Thread Herbert Xu
On Mon, Aug 17, 2009 at 06:47:12PM -0700, David Miller wrote:
> 
> I'm pretty sure that for normal TCP and UDP workloads, this is just
> going to set the interrupt bit on the first packet that gets into the
> queue, and then not in the rest.
> 
> TCP just loops over packets in the send queue, and at initial state
> the qdisc will be empty.

The scheme I actually tried on the e1000e is not quite the same
as what I had here.  I'm basically just adding descriptors per
usual.

However, I don't give them to the hardware immediately.  Instead
they're held back so that on average we have about three descriptors
in the queue spaced equally that will cause interrupts.  It seems
to work fairly well for netperf-generated TCP/UDP traffic in terms
of generating the right amount of interrupts (once every qlen/3
packets).

I haven't posted anything yet because I ran into a weird problem
with the e1000e.  It was generating a lot more interrupts than
what I'm telling it to do.  Even if I hard-code the interrupts
at 0, qlen/3, 2qlen/2 I still get way more than qlen/3 interrupts
for qlen packets.

This may be related to the fact that e1000e doesn't really have
a way of turning the interrupts off for a given descriptor.  Well
actually it does but it also turns off status reporting which means
that we will never know whether that entry has finished processing.
So I've had to rely on using its TX interrupt delay mechanism as an
approximation of turning interrupt notification off.  Evidently that
is not working in the way I intended it to.

I'm in the process of repeating the same experiment with cxgb3
which hopefully should let me turn interrupts off on descriptors
while still reporting completion status.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 22/25] xen: xen-netfront: use skb.cb for storing private data

2007-04-23 Thread Herbert Xu
On Mon, Apr 23, 2007 at 02:57:00PM -0700, Jeremy Fitzhardinge wrote:
> Netfront's use of nh.raw and h.raw for storing page+offset is a bit
> hinky, and it breaks with upcoming network stack updates which reduce
> these fields to sub-pointer sizes.  Fortunately, skb offers the "cb"
> field specifically for stashing this kind of info, so use it.
> 
> Signed-off-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
> Cc: Herbert Xu <[EMAIL PROTECTED]>
> Cc: Chris Wright <[EMAIL PROTECTED]>
> Cc: Christian Limpach <[EMAIL PROTECTED]>

Thanks Jeremy.  The patch looks good.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 23/25] xen: Lockdep fixes for xen-netfront

2007-04-23 Thread Herbert Xu
Jeremy Fitzhardinge <[EMAIL PROTECTED]> wrote:
>
> @@ -1212,10 +1212,10 @@ static int netif_poll(struct net_device 
>int pages_flipped = 0;
>int err;
> 
> -   spin_lock(&np->rx_lock);
> +   spin_lock_bh(&np->rx_lock);
> 
>if (unlikely(!netfront_carrier_ok(np))) {
> -   spin_unlock(&np->rx_lock);
> +   spin_unlock_bh(&np->rx_lock);

You don't need to disable BH in netif_poll since it's always called
with BH disabled.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 22/25] xen: xen-netfront: use skb.cb for storing private data

2007-04-23 Thread Herbert Xu
On Mon, Apr 23, 2007 at 09:34:55PM -0700, Jeremy Fitzhardinge wrote:
> 
> Could you give netfront an overall review as well?  I know you're
> already pretty familiar with it, but if you could cast a fresh eye over
> it, that would be helpful.

Sure thing.  I'll look over it soon.

Actually there is one thing I'd like to see changed first up: I noticed
that you've stripped out the checksum hack which is in the main Xen tree.
We actually have the code in net-2.6.22 (which is also in mm) that lets
you use CHECKSUM_PARTIAL on received packets without having to do that
hack.

Here's the patch that I've been testing so far.  It's against the Xen
source, but should be easy to adapt to your version as well.

I just thought about this again, and in fact we need this change for
correctness as well as performance.  Because not setting ip_summed
to CHECKSUM_PARTIAL in netfront is not going to stop netback from
sending CHECKSUM_PARTIAL packets to us.  If these packets are then
routed/bridged back to netback, they'll have the wrong checksum.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff -ur linux-2.6.20.noarch/drivers/xen/core/skbuff.c 
linux-2.6.20.i686/drivers/xen/core/skbuff.c
--- linux-2.6.20.noarch/drivers/xen/core/skbuff.c   2007-04-03 
15:26:15.0 +1000
+++ linux-2.6.20.i686/drivers/xen/core/skbuff.c 2007-03-30 21:06:20.0 
+1000
@@ -9,6 +9,10 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -78,6 +82,37 @@
return skb;
 }
 
+int skb_checksum_setup(struct sk_buff *skb)
+{
+   if (skb->protocol != htons(ETH_P_IP))
+   goto out;
+   skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
+   if (skb->h.raw >= skb->tail)
+   goto out;
+   switch (skb->nh.iph->protocol) {
+   case IPPROTO_TCP:
+   skb->csum_offset = offsetof(struct tcphdr, check);
+   break;
+   case IPPROTO_UDP:
+   skb->csum_offset = offsetof(struct udphdr, check);
+   break;
+   default:
+   if (net_ratelimit())
+   printk(KERN_ERR "Attempting to checksum a non-"
+  "TCP/UDP packet, dropping a protocol"
+  " %d packet", skb->nh.iph->protocol);
+   goto out;
+   }
+   if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
+   goto out;
+   skb->ip_summed = CHECKSUM_PARTIAL;
+
+   return 0;
+out:
+   return -EPROTO;
+}
+EXPORT_SYMBOL(skb_checksum_setup);
+
 static void skbuff_ctor(void *buf, struct kmem_cache *cachep, unsigned long 
unused)
 {
int order = 0;
diff -ur linux-2.6.20.noarch/drivers/xen/netback/loopback.c 
linux-2.6.20.i686/drivers/xen/netback/loopback.c
--- linux-2.6.20.noarch/drivers/xen/netback/loopback.c  2007-04-03 
15:26:15.0 +1000
+++ linux-2.6.20.i686/drivers/xen/netback/loopback.c2007-03-30 
21:01:02.0 +1000
@@ -149,16 +149,6 @@
np->stats.rx_bytes += skb->len;
np->stats.rx_packets++;
 
-   if (skb->ip_summed == CHECKSUM_PARTIAL) {
-   /* Defer checksum calculation. */
-   skb->proto_csum_blank = 1;
-   /* Must be a local packet: assert its integrity. */
-   skb->proto_data_valid = 1;
-   }
-
-   skb->ip_summed = skb->proto_data_valid ?
-   CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
-
skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
skb->protocol = eth_type_trans(skb, dev);
skb->dev  = dev;
diff -ur linux-2.6.20.noarch/drivers/xen/netback/netback.c 
linux-2.6.20.i686/drivers/xen/netback/netback.c
--- linux-2.6.20.noarch/drivers/xen/netback/netback.c   2007-04-03 
15:26:15.0 +1000
+++ linux-2.6.20.i686/drivers/xen/netback/netback.c 2007-03-31 
21:07:48.0 +1000
@@ -293,7 +293,6 @@
/* Copy only the header fields we use in this driver. */
nskb->dev = skb->dev;
nskb->ip_summed = skb->ip_summed;
-   nskb->proto_data_valid = skb->proto_data_valid;
dev_kfree_skb(skb);
skb = nskb;
}
@@ -666,9 +665,11 @@
id = meta[npo.meta_cons].id;
flags = nr_frags ? NETRXF_more_data : 0;
 
-   if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+   if (skb->ip_summed == CHECKSUM_PARTIAL)
+   /* local packet? */
flags |= NETRXF_csum_blank | NETRXF_data_validated;
-   else 

Re: [PATCH 22/25] xen: xen-netfront: use skb.cb for storing private data

2007-04-27 Thread Herbert Xu
On Fri, Apr 27, 2007 at 03:19:50PM -0700, Jeremy Fitzhardinge wrote:
> 
> OK, I've been sitting on this in the hope that I'll suddenly see the
> light and work out what you're talking about - but apparently that's not
> going to happen.  So, some questions:
> 
>1. Does this patch change the dom0 <-> domU interface?  or does it
>   fix something that's currently broken?

No it doesn't change the interface at all.

>2. Can just apply the netfront part to the pv_ops kernel, or does it
>   require the corresponding dom0 patch to be applied as well?

They can be applied separately so you don't need the dom0 part for your
tree.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 22/25] xen: xen-netfront: use skb.cb for storing private data

2007-04-27 Thread Herbert Xu
On Fri, Apr 27, 2007 at 04:27:21PM -0700, Jeremy Fitzhardinge wrote:
> Herbert Xu wrote:
> > They can be applied separately so you don't need the dom0 part for your
> > tree.
> 
> Great, thanks.

BTW, the version I posted to you is missing the following line.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
--- linux-2.6.20.i386/drivers/xen/core/skbuff.c 2007-04-28 15:30:16.0 
+1000
+++ build-2.6.20.i386/drivers/xen/core/skbuff.c 2007-04-28 15:30:52.0 
+1000
@@ -89,6 +89,7 @@
skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
if (skb->h.raw >= skb->tail)
goto out;
+   skb->csum_start = skb->h.raw - skb->head;
switch (skb->nh.iph->protocol) {
case IPPROTO_TCP:
skb->csum_offset = offsetof(struct tcphdr, check);
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 22/25] xen: xen-netfront: use skb.cb for storing private data

2007-04-29 Thread Herbert Xu
On Sun, Apr 29, 2007 at 12:43:33AM -0700, Jeremy Fitzhardinge wrote:
> Herbert Xu wrote:
> > BTW, the version I posted to you is missing the following line.
> >   
> > --- linux-2.6.20.i386/drivers/xen/core/skbuff.c 2007-04-28 
> > 15:30:16.0 +1000
> > +++ build-2.6.20.i386/drivers/xen/core/skbuff.c 2007-04-28 
> > 15:30:52.0 +1000
> > @@ -89,6 +89,7 @@
> > skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
> > if (skb->h.raw >= skb->tail)
> > goto out;
> > +   skb->csum_start = skb->h.raw - skb->head;
> > switch (skb->nh.iph->protocol) {
> > case IPPROTO_TCP:
> > skb->csum_offset = offsetof(struct tcphdr, check);
> >   
> 
> drivers/xen/core/skbuff.c?  What's that?

It's part of the skb_checksum_setup function which we still need
for this because the current netback protocol doesn't pass the
csum_start and csum_offset values along.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [patch 31/32] xen: --- drivers/net/xen-netfront.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)

2007-05-02 Thread Herbert Xu
Jeremy Fitzhardinge <[EMAIL PROTECTED]> wrote:
> ===
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -1213,10 +1213,10 @@ static int netif_poll(struct net_device 

Any reason why xen-netfront isn't just in a single patch? It makes
it a bit hard to review having it scattered around like this.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [patch 25/29] xen: Add the Xen virtual network device driver.

2007-05-05 Thread Herbert Xu
On Sat, May 05, 2007 at 03:05:07AM -0700, Jeremy Fitzhardinge wrote:
>
> Sorry, I forgot about it.  I was waiting to hear back from network
> people about what this is actually for, and whether we really need it. 

We should just change this to use netif_device_attach and
netif_device_detach.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[1/2] [NET] link_watch: Move link watch list into net_device

2007-05-08 Thread Herbert Xu
On Mon, May 07, 2007 at 02:10:27PM -0700, Jeremy Fitzhardinge wrote:
>
> > We should just change this to use netif_device_attach and
> > netif_device_detach.
> 
> Like this?

Sorry, I had forgotten that I've already concluded previously that
this doesn't work because we don't want to prevent the interface
from being brought up (and other reasons).  My memory is failing me :)

So I think the best option now is to get rid of the delay on carrier
on events for everyone.

Here is the first of 2 patches.

[NET] link_watch: Move link watch list into net_device

These days the link watch mechanism is an integral part of the
network subsystem as it manages the carrier status.  So it now
makes sense to allocate some memory for it in net_device rather
than allocating it on demand.

In fact, this is necessary because we can't tolerate a memory
allocation failure since that means we'd have to potentially
throw a link up event away.

It also simplifies the code greatly.

In doing so I discovered a subtle race condition in the use
of singleevent.  This race condition still exists (and is
somewhat magnified) without singleevent but it's now plugged
thanks to an smp_mb__before_clear_bit.

Signed-off-by: Herbert Xu <[EMAIL PROTECTED]>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
a6c194d06da9aed2a8f5a4ea07e3cbf9266db4ef
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3044622..f671cd2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -467,6 +467,8 @@ struct net_device
/* device index hash chain */
struct hlist_node   index_hlist;
 
+   struct net_device   *link_watch_next;
+
/* register/unregister state machine */
enum { NETREG_UNINITIALIZED=0,
   NETREG_REGISTERED,   /* completed register_netdevice */
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index e3c26a9..71a35da 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -19,7 +19,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -28,7 +27,6 @@
 
 enum lw_bits {
LW_RUNNING = 0,
-   LW_SE_USED
 };
 
 static unsigned long linkwatch_flags;
@@ -37,17 +35,9 @@ static unsigned long linkwatch_nextevent;
 static void linkwatch_event(struct work_struct *dummy);
 static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
 
-static LIST_HEAD(lweventlist);
+static struct net_device *lweventlist;
 static DEFINE_SPINLOCK(lweventlist_lock);
 
-struct lw_event {
-   struct list_head list;
-   struct net_device *dev;
-};
-
-/* Avoid kmalloc() for most systems */
-static struct lw_event singleevent;
-
 static unsigned char default_operstate(const struct net_device *dev)
 {
if (!netif_carrier_ok(dev))
@@ -90,21 +80,23 @@ static void rfc2863_policy(struct net_device *dev)
 /* Must be called with the rtnl semaphore held */
 void linkwatch_run_queue(void)
 {
-   struct list_head head, *n, *next;
+   struct net_device *next;
 
spin_lock_irq(&lweventlist_lock);
-   list_replace_init(&lweventlist, &head);
+   next = lweventlist;
+   lweventlist = NULL;
spin_unlock_irq(&lweventlist_lock);
 
-   list_for_each_safe(n, next, &head) {
-   struct lw_event *event = list_entry(n, struct lw_event, list);
-   struct net_device *dev = event->dev;
+   while (next) {
+   struct net_device *dev = next;
 
-   if (event == &singleevent) {
-   clear_bit(LW_SE_USED, &linkwatch_flags);
-   } else {
-   kfree(event);
-   }
+   next = dev->link_watch_next;
+
+   /*
+* Make sure the above read is complete since it can be
+* rewritten as soon as we clear the bit below.
+*/
+   smp_mb__before_clear_bit();
 
/* We are about to handle this device,
 * so new events can be accepted
@@ -147,24 +139,12 @@ void linkwatch_fire_event(struct net_device *dev)
 {
if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
unsigned long flags;
-   struct lw_event *event;
-
-   if (test_and_set_bit(LW_SE_USED, &linkwatch_flags)) {
-   event = kmalloc(sizeof(struct lw_event), GFP_ATOMIC);
-
-   if (unlikely(event == NULL)) {
-   clear_bit(__LINK_STATE_LINKWATCH_PENDING, 
&dev->state);
-   return;
-   }
-   } else {
-   event = &singleevent;
-   }
 
   

[2/2] [NET] link_watch: Remove delay for up even when we're down

2007-05-08 Thread Herbert Xu
[NET]: Remove link_watch delay for up even when we're down

Currently all link carrier events are delayed by up to a second
before they're processed to prevent link storms.  This causes
unnecessary packet loss during that interval.

In fact, we can achieve the same effect in preventing storms by
only delaying down events and unnecssary up events.  The latter
is defined as up events when we're already up.

Signed-off-by: Herbert Xu <[EMAIL PROTECTED]>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 71a35da..b5f4579 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -77,11 +77,52 @@ static void rfc2863_policy(struct net_device *dev)
 }
 
 
-/* Must be called with the rtnl semaphore held */
-void linkwatch_run_queue(void)
+static int linkwatch_urgent_event(struct net_device *dev)
+{
+   return netif_running(dev) && netif_carrier_ok(dev) &&
+  dev->qdisc != dev->qdisc_sleeping;
+}
+
+
+static void linkwatch_add_event(struct net_device *dev)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(&lweventlist_lock, flags);
+   dev->link_watch_next = lweventlist;
+   lweventlist = dev;
+   spin_unlock_irqrestore(&lweventlist_lock, flags);
+}
+
+
+static void linkwatch_schedule_work(unsigned long delay)
+{
+   if (test_and_set_bit(LW_RUNNING, &linkwatch_flags))
+   return;
+
+   /* If we wrap around we'll delay it by at most HZ. */
+   if (delay > HZ)
+   delay = 0;
+
+   schedule_delayed_work(&linkwatch_work, delay);
+}
+
+
+static void __linkwatch_run_queue(int urgent_only)
 {
struct net_device *next;
 
+   /*
+* Limit the number of linkwatch events to one
+* per second so that a runaway driver does not
+* cause a storm of messages on the netlink
+* socket.  This limit does not apply to up events
+* while the device qdisc is down.
+*/
+   if (!urgent_only)
+   linkwatch_nextevent = jiffies + HZ;
+   clear_bit(LW_RUNNING, &linkwatch_flags);
+
spin_lock_irq(&lweventlist_lock);
next = lweventlist;
lweventlist = NULL;
@@ -92,6 +133,11 @@ void linkwatch_run_queue(void)
 
next = dev->link_watch_next;
 
+   if (urgent_only && !linkwatch_urgent_event(dev)) {
+   linkwatch_add_event(dev);
+   continue;
+   }
+
/*
 * Make sure the above read is complete since it can be
 * rewritten as soon as we clear the bit below.
@@ -116,21 +162,23 @@ void linkwatch_run_queue(void)
 
dev_put(dev);
}
+
+   if (lweventlist)
+   linkwatch_schedule_work(linkwatch_nextevent - jiffies);
 }
 
 
-static void linkwatch_event(struct work_struct *dummy)
+/* Must be called with the rtnl semaphore held */
+void linkwatch_run_queue(void)
 {
-   /* Limit the number of linkwatch events to one
-* per second so that a runaway driver does not
-* cause a storm of messages on the netlink
-* socket
-*/
-   linkwatch_nextevent = jiffies + HZ;
-   clear_bit(LW_RUNNING, &linkwatch_flags);
+   __linkwatch_run_queue(0);
+}
+
 
+static void linkwatch_event(struct work_struct *dummy)
+{
rtnl_lock();
-   linkwatch_run_queue();
+   __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
rtnl_unlock();
 }
 
@@ -138,23 +186,19 @@ static void linkwatch_event(struct work_struct *dummy)
 void linkwatch_fire_event(struct net_device *dev)
 {
if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
-   unsigned long flags;
+   unsigned long delay;
 
dev_hold(dev);
 
-   spin_lock_irqsave(&lweventlist_lock, flags);
-   dev->link_watch_next = lweventlist;
-   lweventlist = dev;
-   spin_unlock_irqrestore(&lweventlist_lock, flags);
+   linkwatch_add_event(dev);
 
-   if (!test_and_set_bit(LW_RUNNING, &linkwatch_flags)) {
-   unsigned long delay = linkwatch_nextevent - jiffies;
+   delay = linkwatch_nextevent - jiffies;
 
-   /* If we wrap around we'll delay it by at most HZ. */
-   if (delay > HZ)
-   delay = 0;
-   schedule_delayed_work(&linkwatch_work, delay);
-   }
+   /* Minimise down-time: drop delay for up event. */
+   if (linkwatch_urgent_event(dev))
+   delay = 0;
+
+ 

Re: [1/2] [NET] link_watch: Move link watch list into net_device

2007-05-09 Thread Herbert Xu
On Tue, May 08, 2007 at 01:19:33PM -0700, Jeremy Fitzhardinge wrote:
> 
> Subject: xen: go back to using normal network stack carriers
> 
> This effectively reverts xen-unstable change
> 14280:42b29f084c31. Herbert has changed the behaviour of the core
> networking to not delay an initial down->up transition, and so the
> timing concern has been solved.
> 
> Signed-off-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
> Cc: Herbert Xu <[EMAIL PROTECTED]>
> Cc: Keir Fraser <[EMAIL PROTECTED]>

Looks good to me.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [patch 7/9] lguest: the net driver

2007-05-09 Thread Herbert Xu
[EMAIL PROTECTED] wrote:
>
> +   if (desc->features & LGUEST_NET_F_NOCSUM)
> +   dev->features |= NETIF_F_NO_CSUM;

Any reason why you're using NO_CSUM here instead of HW_CSUM?
Practically there is no difference but NO_CSUM could be treated
differently in future and I'm not sure whether such changes would
be desirable in this driver (i.e., not even generating a partial
checksum).

Also, there doesn't seem to be any passing of metadata to the
backend which means that neither NO_CSUM/HW_CSUM can work if
somebody needs to look at the checksum field.  You could use
IP_CSUM if you do the same hack on the backend that Xen does.
Otherwise you'll have to make do with no checksum offload at all.

I think you'd also need a change_mtu function if the SG feature
is going to be of some use since the default Ethernet one limits
the MTU to 1500.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [patch 7/9] lguest: the net driver

2007-05-09 Thread Herbert Xu
Hi Rusty:

On Wed, May 09, 2007 at 09:55:25PM +1000, Rusty Russell wrote:
> 
>   NO_CSUM because it really doesn't need a checksum.  The
> LGUEST_NET_F_NOCSUM is only set for local inter-guest networking.  If
> some guest were to route the packets outside the machine, this would be
> an issue, though ("don't do that").

While I can see that this is good in keeping things simple, I think
it's something that you want to be able to support since the user
may wish to setup a guest as a firewall appliance which would involve
passing packets from another guest to the outside world.
 
> Indeed, that would be a new feature, and is certainly a consideration
> for more efficient inter-guest networking.  However, I consider that
> somewhat cheating: it's nice to know that 1500 doesn't suck too hard.
> 
> Remember, "Features kill puppies!" 8)

Heh :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [Xen-devel] Re: [kvm-devel] [PATCH RFC 1/3] virtio infrastructure

2007-06-04 Thread Herbert Xu
On Mon, Jun 04, 2007 at 12:55:25PM +0300, Avi Kivity wrote:
> 
> Networking hardware generally services descriptors in a FIFO manner.  
> virtio may not (for example, it may offload copies of larger packets to 
> a dma engine such as I/OAT, resulting in a delay, but copy smaller 
> packets immediately).  that means that there will be some mismatch 
> between virtio drivers and real hardware drivers.

You're free to do that in the process but before your packets leave
the backend you've got to make sure that they haven't been reordered.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [kvm-devel] [PATCH RFC 1/3] virtio infrastructure

2007-06-04 Thread Herbert Xu
On Mon, Jun 04, 2007 at 02:25:32PM +0300, Avi Kivity wrote:
> 
> OT: Does that hold for bonded interfaces too?

Yes.  By default traffic to the same destination MAC always stick to
one interface.  You could select a layer3+4 hashing policy but even
that guarantees a single flow will stick to one physical interface
unless it contains IP fragments which should never happen for TCP.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [kvm-devel] [PATCH 3/6] virtio net driver

2007-09-21 Thread Herbert Xu
On Fri, Sep 21, 2007 at 02:36:43PM +0200, Christian Borntraeger wrote:
>
> @@ -335,7 +344,7 @@ static void *virtnet_probe(struct device
>   dev->poll = virtnet_poll;
>   dev->hard_start_xmit = start_xmit;
>   dev->weight = 16;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_LLTX;
>   SET_NETDEV_DEV(dev, device);
>  
>   /* Do we support "hardware" checksums? */

Please don't use LLTX in new drivers.  We're trying to get rid
of it since it's

1) unnecessary;
2) causes problems with AF_PACKET seeing things twice.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: LFENCE instruction (was: [rfc][patch 3/3] x86: optimise barriers)

2007-10-16 Thread Herbert Xu
Nick Piggin <[EMAIL PROTECTED]> wrote:
>
> Also, for non-wb memory. I don't think the Intel document referenced
> says anything about this, but the AMD document says that loads can pass
> loads (page 8, rule b).
> 
> This is why our rmb() is still an lfence.

BTW, Xen (in particular, the code in drivers/xen) uses mb/rmb/wmb
instead of smp_mb/smp_rmb/smp_wmb when it accesses memory that's
shared with other Xen domains or the hypervisor.

The reason this is necessary is because even if a Xen domain is
UP the hypervisor might be SMP.

It would be nice if we can have these adopt the new SMP barriers
on x86 instead of the IO ones as they currently do.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/3] virtio: Net header needs gso_hdr_len

2008-01-15 Thread Herbert Xu
Rusty Russell <[EMAIL PROTECTED]> wrote:
> It's far easier to deal with GSO if we don't have to parse the packet
> to figure out the header length.  Add the field to the virtio_net_hdr
> struct (and fix the spaces that somehow crept in there).
> 
> Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>
> ---
> drivers/net/virtio_net.c   |4 +++-
> include/linux/virtio_net.h |   11 ++-
> 2 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff -r 24ef33a4ab14 drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c  Tue Jan 15 16:59:58 2008 +1100
> +++ b/drivers/net/virtio_net.c  Tue Jan 15 21:21:40 2008 +1100
> @@ -126,6 +126,7 @@ static void receive_skb(struct net_devic
>/* Header must be checked, and gso_segs computed. */
>skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
>skb_shinfo(skb)->gso_segs = 0;
> +   skb_set_transport_header(skb, hdr->gso_hdr_len);

Why do we need this? When receiving GSO packets from an untrusted
source the network stack will fill in the transport header offset
after verifying that the headers are sane.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/3] virtio: Net header needs gso_hdr_len

2008-01-22 Thread Herbert Xu
On Wed, Jan 16, 2008 at 03:19:03PM +1100, Rusty Russell wrote:
> > > It's far easier to deal with GSO if we don't have to parse the packet
> > > to figure out the header length.  Add the field to the virtio_net_hdr
> > > struct (and fix the spaces that somehow crept in there).
>
> > Why do we need this? When receiving GSO packets from an untrusted
> > source the network stack will fill in the transport header offset
> > after verifying that the headers are sane.
> 
> Thanks for clarifying; it simplifies things.

Actually now that I've tried your test program I can see that this
field exists not because of GSO, but because of SG.  It tells you
how many bytes you want to put in the skb head as opposed to the
frag array.

So this field is fine with me as long as it is named as such to
avoid confusion since it really has nothing to do with GSO as you
also need it for SG with large MTUs.

I think this is more flexible than the Xen approach where this is
essentially hard-coded to 64 bytes.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/3] virtio: Net header needs gso_hdr_len

2008-01-22 Thread Herbert Xu
On Wed, Jan 23, 2008 at 09:06:14AM +1100, Rusty Russell wrote:
>
> > So this field is fine with me as long as it is named as such to
> > avoid confusion since it really has nothing to do with GSO as you
> > also need it for SG with large MTUs.
> 
> Hmm, how about just "hdr_len" rather than "gso_hdr_len"?

Sounds fine to me.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH] crypto: virtio-crypto: call finalize with bh disabled

2023-11-06 Thread Herbert Xu
On Thu, Nov 02, 2023 at 01:01:09PM +, Gonglei (Arei) wrote:
>
> > So I think the core of this question is: Can we call 
> > crypto_finalize_request() in
> > the upper half of the interrupt?

The finalize path originates from the network stack.  So the conditions
are the same as that of the network stack receive side.  That means
hard IRQ paths are unacceptable but softirq is OK.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 12/38] crypto: virtio: Replace deprecated CPU-hotplug functions.

2021-08-12 Thread Herbert Xu
On Tue, Aug 03, 2021 at 04:15:55PM +0200, Sebastian Andrzej Siewior wrote:
> The functions get_online_cpus() and put_online_cpus() have been
> deprecated during the CPU hotplug rework. They map directly to
> cpus_read_lock() and cpus_read_unlock().
> 
> Replace deprecated CPU-hotplug functions with the official version.
> The behavior remains unchanged.
> 
> Cc: Gonglei 
> Cc: "Michael S. Tsirkin" 
> Cc: Jason Wang 
> Cc: Herbert Xu 
> Cc: "David S. Miller" 
> Cc: virtualization@lists.linux-foundation.org
> Cc: linux-cry...@vger.kernel.org
> Signed-off-by: Sebastian Andrzej Siewior 
> ---
>  drivers/crypto/virtio/virtio_crypto_core.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH] crypto: virtio - Use helper to set reqsize

2022-11-22 Thread Herbert Xu
The value of reqsize must only be changed through the helper.

Signed-off-by: Herbert Xu 

diff --git a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c 
b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
index 168195672e2e..b2979be613b8 100644
--- a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
+++ b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
@@ -479,6 +479,9 @@ static int virtio_crypto_rsa_init_tfm(struct 
crypto_akcipher *tfm)
ctx->enginectx.op.prepare_request = NULL;
ctx->enginectx.op.unprepare_request = NULL;
 
+   akcipher_set_reqsize(tfm,
+sizeof(struct virtio_crypto_akcipher_request));
+
return 0;
 }
 
@@ -505,7 +508,6 @@ static struct virtio_crypto_akcipher_algo 
virtio_crypto_akcipher_algs[] = {
.max_size = virtio_crypto_rsa_max_size,
.init = virtio_crypto_rsa_init_tfm,
.exit = virtio_crypto_rsa_exit_tfm,
-   .reqsize = sizeof(struct 
virtio_crypto_akcipher_request),
.base = {
.cra_name = "rsa",
.cra_driver_name = "virtio-crypto-rsa",
@@ -528,7 +530,6 @@ static struct virtio_crypto_akcipher_algo 
virtio_crypto_akcipher_algs[] = {
.max_size = virtio_crypto_rsa_max_size,
.init = virtio_crypto_rsa_init_tfm,
.exit = virtio_crypto_rsa_exit_tfm,
-   .reqsize = sizeof(struct 
virtio_crypto_akcipher_request),
.base = {
.cra_name = "pkcs1pad(rsa,sha1)",
.cra_driver_name = "virtio-pkcs1-rsa-with-sha1",
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] virtio-crypto: Do not use GFP_ATOMIC when not needed

2023-02-10 Thread Herbert Xu
On Sat, Feb 04, 2023 at 09:54:08PM +0100, Christophe JAILLET wrote:
> There is no need to use GFP_ATOMIC here. GFP_KERNEL is already used for
> another memory allocation just the line after.
> 
> Signed-off-by: Christophe JAILLET 
> ---
> This patch is speculative ! ! !
> 
> Maybe it is the other memory allocation that should use GFP_ATOMIC.
> 
> Review with care !

Looks correct to me.  We don't support calling akcipher in atomic
contexts.

> ---
>  drivers/crypto/virtio/virtio_crypto_akcipher_algs.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 8/9] x86/crypto: eliminate anonymous module_init & module_exit

2022-04-08 Thread Herbert Xu
On Wed, Mar 16, 2022 at 12:20:09PM -0700, Randy Dunlap wrote:
> Eliminate anonymous module_init() and module_exit(), which can lead to
> confusion or ambiguity when reading System.map, crashes/oops/bugs,
> or an initcall_debug log.
> 
> Give each of these init and exit functions unique driver-specific
> names to eliminate the anonymous names.
> 
> Example 1: (System.map)
>  832fc78c t init
>  832fc79e t init
>  832fc8f8 t init
> 
> Example 2: (initcall_debug log)
>  calling  init+0x0/0x12 @ 1
>  initcall init+0x0/0x12 returned 0 after 15 usecs
>  calling  init+0x0/0x60 @ 1
>  initcall init+0x0/0x60 returned 0 after 2 usecs
>  calling  init+0x0/0x9a @ 1
>  initcall init+0x0/0x9a returned 0 after 74 usecs
> 
> Fixes: 64b94ceae8c1 ("crypto: blowfish - add x86_64 assembly implementation")
> Fixes: 676a38046f4f ("crypto: camellia-x86_64 - module init/exit functions 
> should be static")
> Fixes: 0b95ec56ae19 ("crypto: camellia - add assembler implementation for 
> x86_64")
> Fixes: 56d76c96a9f3 ("crypto: serpent - add AVX2/x86_64 assembler 
> implementation of serpent cipher")
> Fixes: b9f535ffe38f ("[CRYPTO] twofish: i586 assembly version")
> Fixes: ff0a70fe0536 ("crypto: twofish-x86_64-3way - module init/exit 
> functions should be static")
> Fixes: 8280daad436e ("crypto: twofish - add 3-way parallel x86_64 assembler 
> implemention")
> Signed-off-by: Randy Dunlap 
> Cc: Jussi Kivilinna 
> Cc: Joachim Fritschi 
> Cc: Herbert Xu 
> Cc: "David S. Miller" 
> Cc: linux-cry...@vger.kernel.org
> Cc: x...@kernel.org
> ---
>  arch/x86/crypto/blowfish_glue.c |8 
>  arch/x86/crypto/camellia_glue.c |8 
>  arch/x86/crypto/serpent_avx2_glue.c |8 ----
>  arch/x86/crypto/twofish_glue.c  |8 
>  arch/x86/crypto/twofish_glue_3way.c |8 
>  5 files changed, 20 insertions(+), 20 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 2/2] virtio: rng: ensure reads happen after successful probe

2014-07-10 Thread Herbert Xu
On Wed, Jul 09, 2014 at 12:18:36PM -0400, Jason Cooper wrote:
> On Sat, Jul 05, 2014 at 11:04:53AM +0530, Amit Shah wrote:
> > The hwrng core asks for random data in the hwrng_register() call itself
> > from commit d9e7972619.  This doesn't play well with virtio -- the
> > DRIVER_OK bit is only set by virtio core on a successful probe, and
> > we're not yet out of our probe routine when this call is made.  This
> > causes the host to not acknowledge any requests we put in the virtqueue,
> > and the insmod or kernel boot process just waits for data to arrive from
> > the host, which never happens.
> > 
> > CC: Kees Cook 
> > CC: Jason Cooper 
> > CC: Herbert Xu 
> > CC:  # For v3.15+
> > Signed-off-by: Amit Shah 
> > ---
> >  drivers/char/hw_random/core.c   |  6 ++
> >  drivers/char/hw_random/virtio-rng.c | 10 ++
> >  2 files changed, 16 insertions(+)
> 
> Yeah, I don't think there's any viable way to get random data out of
> virtio-rng at probe time...  :-(
> 
> Reviewed-by: Jason Cooper 

OK, if there are no more objections I will take these two patches.

Thanks!
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3 0/2] hwrng, virtio-rng: init-time fixes

2014-07-14 Thread Herbert Xu
On Thu, Jul 10, 2014 at 03:42:33PM +0530, Amit Shah wrote:
> v3:
>  - Kees Cook pointed out a weird side-effect: devices which have
>->init() registered get their randomness added to the system each
>time they're switched in, but devices that don't have the init
>callback don't contribute to system randomness more than once.  The
>weirdness is resolved here by using the randomness each time
>hwrng_init() is attempted, irrespective of the existence of the
>device's ->init() callback.

All applied to crypto.  Thanks!
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3 0/2] hwrng, virtio-rng: init-time fixes

2014-07-14 Thread Herbert Xu
On Tue, Jul 15, 2014 at 10:10:28AM +0530, Amit Shah wrote:
> On (Mon) 14 Jul 2014 [20:50:06], Herbert Xu wrote:
> > On Thu, Jul 10, 2014 at 03:42:33PM +0530, Amit Shah wrote:
> > > v3:
> > >  - Kees Cook pointed out a weird side-effect: devices which have
> > >->init() registered get their randomness added to the system each
> > >time they're switched in, but devices that don't have the init
> > >callback don't contribute to system randomness more than once.  The
> > >weirdness is resolved here by using the randomness each time
> > >hwrng_init() is attempted, irrespective of the existence of the
> > >device's ->init() callback.
> > 
> > All applied to crypto.  Thanks!
> 
> Thanks, Herbert.  I didn't mention it, but pls queue it up for 3.16.

Yes that's why it's in crypto as opposed to cryptodev.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH 1/3] hw_random: allow RNG devices to give early randomness after a delay

2014-07-18 Thread Herbert Xu
On Fri, Jul 18, 2014 at 02:26:26PM +0530, Amit Shah wrote:
>
> > Sounds like a good idea to me.  Though, changes in core.c that
> > increase the time in hwrng_register() or hwrng_init() may not get
> > noticed by rng drivers and they may suddenly start failing for no
> > apparent reason.  Seems like a far stretch, though.  Does anyone else
> > have an opinion on this?
> 
> Herbert, do you have any preference?

So it's only virtio-rng that's a problem, right? How about if we
abuse the scan hook in virtio and move the hwrng_register there?

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 3/4] virtio: rng: delay hwrng_register() till driver is ready

2014-07-21 Thread Herbert Xu
On Mon, Jul 21, 2014 at 08:11:16AM -0400, Jason Cooper wrote:
>
> > @@ -136,15 +137,6 @@ static int probe_common(struct virtio_device *vdev)
> > return err;
> > }
> >  
> > -   err = hwrng_register(&vi->hwrng);
> > -   if (err) {
> > -   vdev->config->del_vqs(vdev);
> > -   vi->vq = NULL;
> > -   kfree(vi);
> > -   ida_simple_remove(&rng_index_ida, index);
> > -   return err;
> > -   }
> > -
> 
> This needs to stay.  register, and failure to do so, should occur in the
> probe routine.

Why? Probing involves finding out whether the hardware is present.
While hwrng_register is about adding an entry in the hwrng system
which may only be one aspect of the given hardware.

For example, the same hardware may support other features that are
used outside of the hwrng system, e.g., crypto.

So there's nothing special about probe that requires us to have the
hwrng_register call there.  In practice, the only difference between
having it in probe vs. scan is that you can return errors.  The only
error that can be returned is ENOMEM which isn't of much interest to
the caller of probe anyway.

On the other hand, if you are calling hwrng_register you better be
damn sure that your hardware is ready to answer requests from the
hwrng system.  Please don't add silly flags to work around this.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 0/3] fix stuck in accessing hwrng attributes

2014-09-17 Thread Herbert Xu
On Tue, Sep 16, 2014 at 12:02:26AM +0800, Amos Kong wrote:
> If we read hwrng by long-running dd process, it takes too much cpu
> time and almost hold the mutex lock. When we check hwrng attributes
> from sysfs by cat, it gets stuck in waiting the lock releaseing.
> The problem can only be reproduced with non-smp guest with slow backend.
> 
> This patchset resolves the issue by changing rng_dev_read() to always
> schedule 10 jiffies after release mutex lock, then cat process can
> have chance to get the lock and execute protected code without stuck.

Sorry I'm not going to accept your fix which simply papers over
the problem.

Please bite the bullet and convert this over to RCU.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 4/6] hw_random: fix unregister race.

2014-10-21 Thread Herbert Xu
On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote:
> From: Rusty Russell 
> 
> The previous patch added one potential problem: we can still be
> reading from a hwrng when it's unregistered.  Add a wait for zero
> in the hwrng_unregister path.
> 
> Signed-off-by: Rusty Russell 

You totally corrupted Rusty's patch.  If you're going to repost
his series you better make sure that you've got the right patches.

Just as well though as it made me think a little more about this
patch :)

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 3/5] hw_random: fix unregister race.

2014-10-21 Thread Herbert Xu
On Thu, Sep 18, 2014 at 12:18:24PM +0930, Rusty Russell wrote:
> The previous patch added one potential problem: we can still be
> reading from a hwrng when it's unregistered.  Add a wait for zero
> in the hwrng_unregister path.
> 
> Signed-off-by: Rusty Russell 
> ---
>  drivers/char/hw_random/core.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
> index dc9092a1075d..b4a21e9521cf 100644
> --- a/drivers/char/hw_random/core.c
> +++ b/drivers/char/hw_random/core.c
> @@ -60,6 +60,7 @@ static DEFINE_MUTEX(rng_mutex);
>  static DEFINE_MUTEX(reading_mutex);
>  static int data_avail;
>  static u8 *rng_buffer, *rng_fillbuf;
> +static DECLARE_WAIT_QUEUE_HEAD(rng_done);
>  static unsigned short current_quality;
>  static unsigned short default_quality; /* = 0; default to "off" */
>  
> @@ -98,6 +99,7 @@ static inline void cleanup_rng(struct kref *kref)
>  
>   if (rng->cleanup)
>   rng->cleanup(rng);
> + wake_up_all(&rng_done);
>  }
>  
>  static void set_current_rng(struct hwrng *rng)
> @@ -529,6 +531,9 @@ void hwrng_unregister(struct hwrng *rng)
>   }
>  
>   mutex_unlock(&rng_mutex);
> +
> + /* Just in case rng is reading right now, wait. */
> + wait_event(rng_done, atomic_read(&rng->ref.refcount) == 0);

While it's obviously better than what we have now, I don't believe
this is 100% safe as the cleanup function might still be running
even after the ref count hits zero.  Once we return from this function
the module may be unloaded so we need to ensure that nothing is
running at this point.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 4/6] hw_random: fix unregister race.

2014-10-31 Thread Herbert Xu
On Fri, Oct 31, 2014 at 10:28:00AM +1030, Rusty Russell wrote:
> Herbert Xu  writes:
> > On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote:
> >> From: Rusty Russell 
> >> 
> >> The previous patch added one potential problem: we can still be
> >> reading from a hwrng when it's unregistered.  Add a wait for zero
> >> in the hwrng_unregister path.
> >> 
> >> Signed-off-by: Rusty Russell 
> >
> > You totally corrupted Rusty's patch.  If you're going to repost
> > his series you better make sure that you've got the right patches.
> >
> > Just as well though as it made me think a little more about this
> > patch :)
> 
> OK Amos, can you please repost the complete series?

Please fix the unregister race I identified first.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 4/6] hw_random: fix unregister race.

2014-11-02 Thread Herbert Xu
On Sun, Nov 02, 2014 at 11:06:13PM +0800, Amos Kong wrote:
> On Fri, Oct 31, 2014 at 03:23:21PM +0800, Herbert Xu wrote:
> > On Fri, Oct 31, 2014 at 10:28:00AM +1030, Rusty Russell wrote:
> > > Herbert Xu  writes:
> > > > On Thu, Sep 18, 2014 at 08:37:45PM +0800, Amos Kong wrote:
> > > >> From: Rusty Russell 
> > > >> 
> > > >> The previous patch added one potential problem: we can still be
> > > >> reading from a hwrng when it's unregistered.  Add a wait for zero
> > > >> in the hwrng_unregister path.
> > > >> 
> > > >> Signed-off-by: Rusty Russell 
> > > >
> > > > You totally corrupted Rusty's patch.  If you're going to repost
> > > > his series you better make sure that you've got the right patches.
> > > >
> > > > Just as well though as it made me think a little more about this
> > > > patch :)
> > > 
> > > OK Amos, can you please repost the complete series?
> > 
> > Please fix the unregister race I identified first.
> 
> Ok, I redownload the patches from 
> https://patchwork.kernel.org/project/LKML/list/?submitter=5
> and rebase fixes of me and rusty on it. I will post a V4 later. Thanks.

Please fix the unregister race I pointed out before doing a v4.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v4 4/6] hw_random: fix unregister race.

2014-11-10 Thread Herbert Xu
On Mon, Nov 03, 2014 at 11:56:24PM +0800, Amos Kong wrote:
>
> @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref)
>  
>   if (rng->cleanup)
>   rng->cleanup(rng);

You need a compiler barrier here to prevent reordering.

> + rng->cleanup_done = true;

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v4 4/6] hw_random: fix unregister race.

2014-11-11 Thread Herbert Xu
On Mon, Nov 10, 2014 at 09:47:27PM +0800, Herbert Xu wrote:
> On Mon, Nov 03, 2014 at 11:56:24PM +0800, Amos Kong wrote:
> >
> > @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref)
> >  
> > if (rng->cleanup)
> > rng->cleanup(rng);
> 
> You need a compiler barrier here to prevent reordering.

Michael Büsch pointed out that we should actually have a memory
barrier here.  I thought we didn't need it because I was only
worried about the code in my original complaint.  However, expecting
driver writers to use correct synchronisation primitives is surely
asking too much.

So let's add an smp_wmb() here to ensure all side-effects of
cleanup is visible.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v5 0/6] fix hw_random stuck

2014-12-05 Thread Herbert Xu
On Sat, Dec 06, 2014 at 12:16:36PM +0800, Amos Kong wrote:
> When I hotunplug a busy virtio-rng device or try to access
> hwrng attributes in non-smp guest, it gets stuck.

Please resend these via the linux-crypto mailing list so they
can be picked up by patchwork.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 8/9] virtio_pci: split out legacy device support

2014-12-16 Thread Herbert Xu
Michael S. Tsirkin  wrote:
> Move everything dealing with legacy devices out to virtio_pci_legacy.c.
> Expose common code APIs in virtio_pci.h
> 
> Signed-off-by: Michael S. Tsirkin 

This breaks virtio_pci because you deleted the licence from it.

-- >8 --
Subject: virtio_pci: Restore module licence and other attributes

When the virtio_pci driver was moved into virtio_pci_legacy.c the
module licence and other attributes went AWOL.  This patch restores
them.

Signed-off-by: Herbert Xu 

diff --git a/drivers/virtio/virtio_pci_legacy.c 
b/drivers/virtio/virtio_pci_legacy.c
index 2588252..6b100e3 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -324,3 +324,8 @@ static struct pci_driver virtio_pci_driver = {
 };
 
 module_pci_driver(virtio_pci_driver);
+
+MODULE_AUTHOR("Anthony Liguori ");
+MODULE_DESCRIPTION("virtio-pci");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v5 REPOST 0/6] fix hw_random stuck

2014-12-22 Thread Herbert Xu
On Mon, Dec 08, 2014 at 04:50:34PM +0800, Amos Kong wrote:
> When I hotunplug a busy virtio-rng device or try to access
> hwrng attributes in non-smp guest, it gets stuck.
> 
> My hotplug tests:
> 
> | test 0:
> |   hotunplug rng device from qemu monitor
> |
> | test 1:
> |   guest) # dd if=/dev/hwrng of=/dev/null &
> |   hotunplug rng device from qemu monitor
> |
> | test 2:
> |   guest) # dd if=/dev/random of=/dev/null &
> |   hotunplug rng device from qemu monitor
> |
> | test 4:
> |   guest) # dd if=/dev/hwrng of=/dev/null &
> |   cat /sys/devices/virtual/misc/hw_random/rng_*
> |
> | test 5:
> |   guest) # dd if=/dev/hwrng of=/dev/null
> |   cancel dd process after 10 seconds
> |   guest) # dd if=/dev/hwrng of=/dev/null &
> |   hotunplug rng device from qemu monitor
> |
> | test 6:
> |   use a fifo as rng backend, execute test 0 ~ 5 with no input of fifo

All applied.  Thanks a lot!
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 06/35] crypto: Use kmemdup rather than duplicating its implementation

2019-07-26 Thread Herbert Xu
Fuqian Huang  wrote:
> kmemdup is introduced to duplicate a region of memory in a neat way.
> Rather than kmalloc/kzalloc + memcpy, which the programmer needs to
> write the size twice (sometimes lead to mistakes), kmemdup improves
> readability, leads to smaller code and also reduce the chances of mistakes.
> Suggestion to use kmemdup rather than using kmalloc/kzalloc + memcpy.
> 
> Signed-off-by: Fuqian Huang 
> ---
> Changes in v2:
>  - Fix a typo in commit message (memset -> memcpy)
> 
> drivers/crypto/caam/caampkc.c  | 11 +++
> drivers/crypto/virtio/virtio_crypto_algs.c |  4 +---
> 2 files changed, 4 insertions(+), 11 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 0/6] crypto: engine - Permit to enqueue all async requests

2018-02-15 Thread Herbert Xu
On Fri, Jan 26, 2018 at 08:15:28PM +0100, Corentin Labbe wrote:
> Hello
> 
> The current crypto_engine support only ahash and ablkcipher request.
> My first patch which try to add skcipher was Nacked, it will add too many 
> functions
> and adding other algs(aead, asymetric_key) will make the situation worst.
> 
> This patchset remove all algs specific stuff and now only process generic 
> crypto_async_request.
> 
> The requests handler function pointer are now moved out of struct engine and
> are now stored directly in a crypto_engine_reqctx.
> 
> The original proposal of Herbert [1] cannot be done completly since the 
> crypto_engine
> could only dequeue crypto_async_request and it is impossible to access any 
> request_ctx
> without knowing the underlying request type.
> 
> So I do something near that was requested: adding crypto_engine_reqctx in TFM 
> context.
> Note that the current implementation expect that crypto_engine_reqctx
> is the first member of the context.
> 
> The first patch is a try to document the crypto engine API.
> The second patch convert the crypto engine with the new way,
> while the following patchs convert the 4 existing users of crypto_engine.
> Note that this split break bisection, so probably the final commit will be 
> all merged.
> 
> Appart from virtio, all 4 latest patch were compile tested only.
> But the crypto engine is tested with my new sun8i-ce driver.
> 
> Regards
> 
> [1] https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1474434.html
> 
> Changes since V1:
> - renamed crypto_engine_reqctx to crypto_engine_ctx
> - indentation fix in function parameter
> - do not export crypto_transfer_request
> - Add aead support
> - crypto_finalize_request is now static
> 
> Changes since RFC:
> - Added a documentation patch
> - Added patch for stm32-cryp
> - Changed parameter of all crypto_engine_op functions from
>   crypto_async_request to void*
> - Reintroduced crypto_transfer_xxx_request_to_engine functions
> 
> Corentin Labbe (6):
>   Documentation: crypto: document crypto engine API
>   crypto: engine - Permit to enqueue all async requests
>   crypto: omap: convert to new crypto engine API
>   crypto: virtio: convert to new crypto engine API
>   crypto: stm32-hash: convert to the new crypto engine API
>   crypto: stm32-cryp: convert to the new crypto engine API

All applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 0/6] crypto: engine - Permit to enqueue all async requests

2018-02-16 Thread Herbert Xu
On Fri, Feb 16, 2018 at 04:36:56PM +0100, Corentin Labbe wrote:
>
> As mentionned in the cover letter, all patchs (except documentation one) 
> should be squashed.
> A kbuild robot reported a build error on cryptodev due to this.

It's too late now.  In future if you want the patches to be squashed
then please send them in one email.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] crypto: virtio - remove dependency on CRYPTO_AUTHENC

2018-03-16 Thread Herbert Xu
On Wed, Mar 07, 2018 at 12:53:15AM +0100, Peter Wu wrote:
> virtio_crypto does not use function crypto_authenc_extractkeys, remove
> this unnecessary dependency. Compiles fine and passes cryptodev-linux
> cipher and speed tests from https://wiki.qemu.org/Features/VirtioCrypto
> 
> Fixes: dbaf0624ffa5 ("crypto: add virtio-crypto driver")
> Signed-off-by: Peter Wu 

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] crypto: virtio: Replace GFP_ATOMIC with GFP_KERNEL in __virtio_crypto_ablkcipher_do_req()

2018-08-03 Thread Herbert Xu
On Mon, Jul 23, 2018 at 04:43:46PM +0800, Jia-Ju Bai wrote:
> __virtio_crypto_ablkcipher_do_req() is never called in atomic context.
> 
> __virtio_crypto_ablkcipher_do_req() is only called by 
> virtio_crypto_ablkcipher_crypt_req(), which is only called by 
> virtcrypto_find_vqs() that is never called in atomic context.
> 
> __virtio_crypto_ablkcipher_do_req() calls kzalloc_node() with GFP_ATOMIC,
> which is not necessary.
> GFP_ATOMIC can be replaced with GFP_KERNEL.
> 
> This is found by a static analysis tool named DCNS written by myself.
> I also manually check the kernel code before reporting it.
> 
> Signed-off-by: Jia-Ju Bai 

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-18 Thread Herbert Xu
Paul E. McKenney  wrote:
>
> You could use SYNC_ACQUIRE() to implement read_barrier_depends() and
> smp_read_barrier_depends(), but SYNC_RMB probably does not suffice.
> The reason for this is that smp_read_barrier_depends() must order the
> pointer load against any subsequent read or write through a dereference
> of that pointer.  For example:
> 
>p = READ_ONCE(gp);
>smp_rmb();
>r1 = p->a; /* ordered by smp_rmb(). */
>p->b = 42; /* NOT ordered by smp_rmb(), BUG!!! */
>r2 = x; /* ordered by smp_rmb(), but doesn't need to be. */
> 
> In contrast:
> 
>p = READ_ONCE(gp);
>smp_read_barrier_depends();
>r1 = p->a; /* ordered by smp_read_barrier_depends(). */
>p->b = 42; /* ordered by smp_read_barrier_depends(). */
>r2 = x; /* not ordered by smp_read_barrier_depends(), which is OK. */
> 
> Again, if your hardware maintains local ordering for address
> and data dependencies, you can have read_barrier_depends() and
> smp_read_barrier_depends() be no-ops like they are for most
> architectures.
> 
> Does that help?

This is crazy! smp_rmb started out being strictly stronger than
smp_read_barrier_depends, when did this stop being the case?
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


  1   2   >