On 06/03/2013 07:09 PM, Michael S. Tsirkin wrote: > On Mon, Jun 03, 2013 at 01:20:58PM +0800, Jason Wang wrote: >> On 06/02/2013 07:22 PM, Michael S. Tsirkin wrote: >>> On Fri, May 31, 2013 at 05:53:24PM +0800, Jason Wang wrote: >>>> This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable >>>> or >>>> enable a queue of macvtap. This is used to be compatible at API layer of >>>> tuntap >>>> to simplify the userspace to manage the queues. >>>> >>>> This is done by split the taps array into three different areas: >>>> >>>> - [0, numvtaps) : enabled taps >>>> - [numvtaps, numvtaps + numdisabled) : disabled taps >>>> - [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots >>>> >>>> When a tap were enabled and disabled, it was moved to another area. >>>> >>>> Signed-off-by: Jason Wang <jasow...@redhat.com> >>> This seems a bit tricky. Can we just move the tap out of the >>> array? >> Certainly yes. >>> the only reason we have the array is for fast >>> lookup on xmit. >>> What's the reason to keep disabled queues there? >> It saves us some space and make code simpler. >>> To be able to track all queues for cleanups, all we need is >>> a linked list of all queues (enabled and disabled). >> Yes, but you need iterate both arrays and linked list which won't be >> simpler than keeping them in place. > No, my idea is to keep all taps in the list. > > If you need all taps, walks the list. > If you need active taps, look them up in the array. > > Reasonable?
Looks so, will change in next version. > >>>> --- >>>> drivers/net/macvtap.c | 167 >>>> ++++++++++++++++++++++++++++++++++++++++---- >>>> include/linux/if_macvlan.h | 7 ++ >>>> 2 files changed, 159 insertions(+), 15 deletions(-) >>>> >>>> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c >>>> index eac49cb..03b781c 100644 >>>> --- a/drivers/net/macvtap.c >>>> +++ b/drivers/net/macvtap.c >>>> @@ -85,32 +85,126 @@ static const struct proto_ops macvtap_socket_ops; >>>> */ >>>> static DEFINE_SPINLOCK(macvtap_lock); >>>> >>>> -static int macvtap_set_queue(struct net_device *dev, struct file *file, >>>> +static void macvtap_swap_slot(struct macvlan_dev *vlan, int a, int b) >>>> +{ >>>> + struct macvtap_queue *q1, *q2; >>>> + >>>> + if (a == b) >>>> + return; >>>> + >>>> + q1 = rcu_dereference_protected(vlan->taps[a], >>>> + lockdep_is_held(&macvtap_lock)); >>>> + q2 = rcu_dereference_protected(vlan->taps[b], >>>> + lockdep_is_held(&macvtap_lock)); >>>> + >>>> + BUG_ON(q1 == NULL || q2 == NULL); >>>> + >>>> + rcu_assign_pointer(vlan->taps[a], q2); >>>> + rcu_assign_pointer(vlan->taps[b], q1); >>>> + >>>> + q1->queue_index = b; >>>> + q2->queue_index = a; >>>> +} >>>> + >>>> +static int macvtap_enable_queue(struct net_device *dev, struct file *file, >>>> struct macvtap_queue *q) >>>> { >>>> struct macvlan_dev *vlan = netdev_priv(dev); >>>> + int err = -EINVAL; >>>> + int total; >>>> + >>>> + spin_lock(&macvtap_lock); >>>> + total = vlan->numvtaps + vlan->numdisabled; >>>> + >>>> + if (q->queue_index < vlan->numvtaps) >>>> + goto out; >>>> + >>>> + err = 0; >>>> + >>>> + BUG_ON(q->queue_index >= total); >>>> + macvtap_swap_slot(vlan, q->queue_index, vlan->numvtaps); >>>> + >>>> + /* Make sure the pointers were seen before indices. */ >>>> + wmb(); >>> Which indices? We only care about numvtaps right? >>> So let's just say so. >> ok >>> Why is this wmb and not smp_wmb()? >> will correct it. >>>> + >>>> + vlan->numdisabled--; >>>> + vlan->numvtaps++; >>>> +out: >>>> + spin_unlock(&macvtap_lock); >>>> + return err; >>>> +} >>>> + >>>> +static int macvtap_set_queue(struct net_device *dev, struct file *file, >>>> + struct macvtap_queue *q) >>>> +{ >>>> + struct macvlan_dev *vlan = netdev_priv(dev); >>>> int err = -EBUSY; >>>> + int total; >>>> >>>> spin_lock(&macvtap_lock); >>>> - if (vlan->numvtaps == MAX_MACVTAP_QUEUES) >>>> + >>>> + total = vlan->numvtaps + vlan->numdisabled; >>>> + if (total == MAX_MACVTAP_QUEUES) >>>> goto out; >>>> >>>> err = 0; >>>> + >>>> rcu_assign_pointer(q->vlan, vlan); >>>> - rcu_assign_pointer(vlan->taps[vlan->numvtaps], q); >>>> + rcu_assign_pointer(vlan->taps[total], q); >>>> sock_hold(&q->sk); >>>> >>>> q->file = file; >>>> - q->queue_index = vlan->numvtaps; >>>> + q->queue_index = total; >>>> file->private_data = q; >>>> + if (vlan->numdisabled) >>>> + macvtap_swap_slot(vlan, vlan->numvtaps, total); >>>> >>>> - vlan->numvtaps++; >>>> + /* Make sure the pointers were seen before indices. */ >>>> + wmb(); >>>> >>>> + vlan->numvtaps++; >>>> out: >>>> spin_unlock(&macvtap_lock); >>>> return err; >>>> } >>>> >>>> +static int macvtap_disable_queue(struct macvtap_queue *q) >>>> +{ >>>> + struct macvlan_dev *vlan; >>>> + int err = -EINVAL; >>>> + >>>> + spin_lock(&macvtap_lock); >>>> + vlan = rcu_dereference_protected(q->vlan, >>>> + lockdep_is_held(&macvtap_lock)); >>>> + >>>> + if (vlan) { >>>> + int total = vlan->numvtaps + vlan->numdisabled; >>>> + int index = q->queue_index; >>>> + >>>> + BUG_ON(q->queue_index >= total); >>>> + if (q->queue_index >= vlan->numvtaps) >>>> + goto out; >>>> + >>>> + err = 0; >>>> + macvtap_swap_slot(vlan, index, total - 1); >>>> + if (vlan->numdisabled) >>>> + /* If there's disabled taps, the above swap will cause >>>> + * a disabled tap to be moved to enabled area. So >>>> + * another swap is needed to keep the right order. >>>> + */ >>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1); >>>> + >>>> + /* make sure the pointers were seen before indices */ >>>> + wmb(); >>> Hmm this looks questionable. The code near rmb first >>> checks numvtaps then dereferences the queue. >>> So it might see a new queue but old value of numvtaps. >> Right, barriers here were just best effort to reduce the possibility of >> wrong queue selection when changing the number of queues. > If this is an optimization, I'd say benchmark it and > see if it helps performance. > > I don't expect it to have any effect really. > In fact, the re-ordering of queues that this patch does > is likely to cause packet reorering and hurt performance > more. Yes, so I will remove the barriers. The re-ordering seems to be the easiest way to do fast lookup of active queues. We could use indirection to avoid the re-ordering of queues, it's hard to eliminate OOO packets. If we don't depends on changing the number of queues frequently, we're ok. > > I'm guessing the only thing we need for correctness > is ACCESS_ONCE on numvtaps? Did't see how it help. > >>>> + >>>> + vlan->numvtaps--; >>>> + vlan->numdisabled++; >>>> + } >>>> + >>>> +out: >>>> + spin_unlock(&macvtap_lock); >>>> + return err; >>>> +} >>>> /* >>>> * The file owning the queue got closed, give up both >>>> * the reference that the files holds as well as the >>>> @@ -121,25 +215,38 @@ out: >>>> */ >>>> static void macvtap_put_queue(struct macvtap_queue *q) >>>> { >>>> - struct macvtap_queue *nq; >>>> struct macvlan_dev *vlan; >>>> >>>> spin_lock(&macvtap_lock); >>>> vlan = rcu_dereference_protected(q->vlan, >>>> lockdep_is_held(&macvtap_lock)); >>>> + >>>> if (vlan) { >>>> + int total = vlan->numvtaps + vlan->numdisabled; >>>> int index = q->queue_index; >>>> - BUG_ON(index >= vlan->numvtaps); >>>> + bool disabled = q->queue_index >= vlan->numvtaps; >>>> + >>>> + BUG_ON(q->queue_index >= total); >>>> + macvtap_swap_slot(vlan, index, total - 1); >>>> + if (!disabled && vlan->numdisabled) >>>> + /* If there's disabled taps, the above swap will cause >>>> + * a disabled tap to be moved to enabled area. So >>>> + * another swap is needed to keep the right order. >>>> + */ >>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1); >>>> + >>>> + RCU_INIT_POINTER(vlan->taps[total - 1], NULL); >>>> + RCU_INIT_POINTER(q->vlan, NULL); >>>> + sock_put(&q->sk); >>>> >>>> - nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1], >>>> - lockdep_is_held(&macvtap_lock)); >>>> - rcu_assign_pointer(vlan->taps[index], nq); >>>> - nq->queue_index = index; >>>> + /* Make sure the pointers were seen before indices */ >>> Here it's one pointer, right? >> Right. >>>> + wmb(); >>> Same issue as above, looks even more worrying >>> as queue is freed here. >> The read side were protected by rcu_read_lock(), so no worries here. > Okay so basically numvtaps is just a hint, > it can be wrong and nothing too bad happens? > > OK, but let's document this. Sure. > >>>> >>>> - RCU_INIT_POINTER(q->vlan, NULL); >>>> + if (disabled) >>>> + vlan->numdisabled--; >>>> + else >>>> + vlan->numvtaps--; >>>> >>>> - sock_put(&q->sk); >>>> - --vlan->numvtaps; >>>> } >>>> >>>> spin_unlock(&macvtap_lock); >>>> @@ -166,6 +273,9 @@ static struct macvtap_queue *macvtap_get_queue(struct >>>> net_device *dev, >>>> if (!numvtaps) >>>> goto out; >>>> >>>> + /* Check taps after numvtaps were exposed. */ >>>> + rmb(); >>>> + >>> Except this doesn't seem to handle case where taps are going away ... >> We're protected by rcu read lock here so even though it choose the queue >> which is going to be destroyed temporarily, the socket won't be freed >> before the packets were queued in the socket. >>>> /* Check if we can use flow to select a queue */ >>>> rxq = skb_get_rxhash(skb); >>>> if (rxq) { >>>> @@ -201,7 +311,7 @@ static void macvtap_del_queues(struct net_device *dev) >>>> >>>> /* macvtap_put_queue can free some slots, so go through all slots */ >>>> spin_lock(&macvtap_lock); >>>> - for (i = 0; i < vlan->numvtaps; i++) { >>>> + for (i = 0; i < vlan->numvtaps + vlan->numdisabled; i++) { >>>> q = rcu_dereference_protected(vlan->taps[i], >>>> lockdep_is_held(&macvtap_lock)); >>>> BUG_ON(q == NULL); >>>> @@ -211,6 +321,7 @@ static void macvtap_del_queues(struct net_device *dev) >>>> } >>>> /* guarantee that any future macvtap_set_queue will fail */ >>>> vlan->numvtaps = MAX_MACVTAP_QUEUES; >>>> + vlan->numdisabled = 0; >>>> spin_unlock(&macvtap_lock); >>>> >>>> synchronize_rcu(); >>>> @@ -927,6 +1038,27 @@ static int macvtap_set_iff(struct file *file, struct >>>> ifreq __user *ifr_u) >>>> return 0; >>>> } >>>> >>>> +static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags) >>>> +{ >>>> + struct macvtap_queue *q = file->private_data; >>>> + struct macvlan_dev *vlan; >>>> + int ret = -EINVAL; >>>> + >>>> + vlan = macvtap_get_vlan(q); >>>> + if (!vlan) >>>> + goto done; >>>> + >>>> + if (flags & IFF_ATTACH_QUEUE) >>>> + ret = macvtap_enable_queue(vlan->dev, file, q); >>>> + else if (flags & IFF_DETACH_QUEUE) >>>> + ret = macvtap_disable_queue(q); >>>> + >>>> + macvtap_put_vlan(vlan); >>>> + >>>> +done: >>>> + return ret; >>>> +} >>>> + >>>> /* >>>> * provide compatibility with generic tun/tap interface >>>> */ >>>> @@ -959,6 +1091,11 @@ static long macvtap_ioctl(struct file *file, >>>> unsigned int cmd, >>>> macvtap_put_vlan(vlan); >>>> return ret; >>>> >>>> + case TUNSETQUEUE: >>>> + if (get_user(u, &ifr->ifr_flags)) >>>> + return -EFAULT; >>>> + return macvtap_ioctl_set_queue(file, u); >>>> + >>>> case TUNGETFEATURES: >>>> if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up)) >>>> return -EFAULT; >>>> diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h >>>> index 62d8bda..d528f38 100644 >>>> --- a/include/linux/if_macvlan.h >>>> +++ b/include/linux/if_macvlan.h >>>> @@ -69,8 +69,15 @@ struct macvlan_dev { >>>> u16 flags; >>>> int (*receive)(struct sk_buff *skb); >>>> int (*forward)(struct net_device *dev, struct sk_buff *skb); >>>> + /* This array tracks all taps (include disabled ones) and will be >>>> + * reshuffled to keep the following order: >>>> + * [0, numvtaps) : enabled taps, >>>> + * [numvtaps, numvtaps + numdisabled) : disabled taps, >>>> + * [numvtaps + numdisabled, MAX_MACVTAP_QUEUES) : unused slots >>>> + */ >>>> struct macvtap_queue *taps[MAX_MACVTAP_QUEUES]; >>>> int numvtaps; >>>> + int numdisabled; >>>> int minor; >>>> }; >>>> >>>> -- >>>> 1.7.1 >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in >>> the body of a message to majord...@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>> Please read the FAQ at http://www.tux.org/lkml/ > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/