[PATCH vhost] virtio_ring: add BUG_ON() when vq is set to premapped mode and vq is not empty

2023-10-19 Thread Xuan Zhuo
Add BUG_ON check to make sure virtqueue_set_dma_premapped() is not
misused. This function must be called immediately after creating the vq,
or after vq reset, and before adding any buffers to it.

Signed-off-by: Xuan Zhuo 
---
 drivers/virtio/virtio_ring.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 51d8f3299c10..3a3232272067 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2793,10 +2793,7 @@ int virtqueue_set_dma_premapped(struct virtqueue *_vq)
 
num = vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num;
 
-   if (num != vq->vq.num_free) {
-   END_USE(vq);
-   return -EINVAL;
-   }
+   BUG_ON(num != vq->vq.num_free);
 
if (!vq->use_dma_api) {
END_USE(vq);
-- 
2.32.0.3.g01195cf9f

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 13/19] virtio_net: xsk: tx: virtnet_free_old_xmit() distinguishes xsk buffer

2023-10-19 Thread Xuan Zhuo
On Thu, 19 Oct 2023 02:38:16 -0400, "Michael S. Tsirkin"  
wrote:
> On Tue, Oct 17, 2023 at 10:02:05AM +0800, Xuan Zhuo wrote:
> > On Mon, 16 Oct 2023 16:44:34 -0700, Jakub Kicinski  wrote:
> > > On Mon, 16 Oct 2023 20:00:27 +0800 Xuan Zhuo wrote:
> > > > @@ -305,9 +311,15 @@ static inline void virtnet_free_old_xmit(struct 
> > > > virtnet_sq *sq, bool in_napi,
> > > >
> > > > stats->bytes += xdp_get_frame_len(frame);
> > > > xdp_return_frame(frame);
> > > > +   } else {
> > > > +   stats->bytes += virtnet_ptr_to_xsk(ptr);
> > > > +   ++xsknum;
> > > > }
> > > > stats->packets++;
> > > > }
> > > > +
> > > > +   if (xsknum)
> > > > +   xsk_tx_completed(sq->xsk.pool, xsknum);
> > > >  }
> > >
> > > sparse complains:
> > >
> > > drivers/net/virtio/virtio_net.h:322:41: warning: incorrect type in 
> > > argument 1 (different address spaces)
> > > drivers/net/virtio/virtio_net.h:322:41:expected struct xsk_buff_pool 
> > > *pool
> > > drivers/net/virtio/virtio_net.h:322:41:got struct xsk_buff_pool
> > > [noderef] __rcu *pool
> > >
> > > please build test with W=1 C=1
> >
> > OK. I will add C=1 to may script.
> >
> > Thanks.
>
> And I hope we all understand, rcu has to be used properly it's not just
> about casting the warning away.


Yes. I see. I will use rcu_dereference() and rcu_read_xxx().

Thanks.

>
> --
> MST
>
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH net-next v1 04/19] virtio_net: move to virtio_net.h

2023-10-19 Thread Xuan Zhuo
On Thu, 19 Oct 2023 14:12:55 +0800, Jason Wang  wrote:
> On Mon, Oct 16, 2023 at 8:00 PM Xuan Zhuo  wrote:
> >
> > Move some structure definitions and inline functions into the
> > virtio_net.h file.
>
> Some of the functions are not inline one before the moving. I'm not
> sure what's the criteria to choose the function to be moved.


That will be used by xsk.c or other functions in headers in the subsequent
commits.

If you are confused, I can try to move the functions only when they are needed.
This commit just moves some important structures.

Thanks.

>
>
> >
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  drivers/net/virtio/main.c   | 252 +--
> >  drivers/net/virtio/virtio_net.h | 256 
> >  2 files changed, 258 insertions(+), 250 deletions(-)
> >  create mode 100644 drivers/net/virtio/virtio_net.h
> >
> > diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> > index 6cf77b6acdab..d8b6c0d86f29 100644
> > --- a/drivers/net/virtio/main.c
> > +++ b/drivers/net/virtio/main.c
> > @@ -6,7 +6,6 @@
> >  //#define DEBUG
> >  #include 
> >  #include 
> > -#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -16,7 +15,6 @@
> >  #include 
> >  #include 
> >  #include 
> > -#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -24,6 +22,8 @@
> >  #include 
> >  #include 
> >
> > +#include "virtio_net.h"
> > +
> >  static int napi_weight = NAPI_POLL_WEIGHT;
> >  module_param(napi_weight, int, 0444);
> >
> > @@ -45,15 +45,6 @@ module_param(napi_tx, bool, 0644);
> >  #define VIRTIO_XDP_TX  BIT(0)
> >  #define VIRTIO_XDP_REDIR   BIT(1)
> >
> > -#define VIRTIO_XDP_FLAGBIT(0)
> > -
> > -/* RX packet size EWMA. The average packet size is used to determine the 
> > packet
> > - * buffer size when refilling RX rings. As the entire RX ring may be 
> > refilled
> > - * at once, the weight is chosen so that the EWMA will be insensitive to 
> > short-
> > - * term, transient changes in packet size.
> > - */
> > -DECLARE_EWMA(pkt_len, 0, 64)
> > -
> >  #define VIRTNET_DRIVER_VERSION "1.0.0"
> >
> >  static const unsigned long guest_offloads[] = {
> > @@ -74,36 +65,6 @@ static const unsigned long guest_offloads[] = {
> > (1ULL << VIRTIO_NET_F_GUEST_USO4) | \
> > (1ULL << VIRTIO_NET_F_GUEST_USO6))
> >
> > -struct virtnet_stat_desc {
> > -   char desc[ETH_GSTRING_LEN];
> > -   size_t offset;
> > -};
> > -
> > -struct virtnet_sq_stats {
> > -   struct u64_stats_sync syncp;
> > -   u64 packets;
> > -   u64 bytes;
> > -   u64 xdp_tx;
> > -   u64 xdp_tx_drops;
> > -   u64 kicks;
> > -   u64 tx_timeouts;
> > -};
> > -
> > -struct virtnet_rq_stats {
> > -   struct u64_stats_sync syncp;
> > -   u64 packets;
> > -   u64 bytes;
> > -   u64 drops;
> > -   u64 xdp_packets;
> > -   u64 xdp_tx;
> > -   u64 xdp_redirects;
> > -   u64 xdp_drops;
> > -   u64 kicks;
> > -};
> > -
> > -#define VIRTNET_SQ_STAT(m) offsetof(struct virtnet_sq_stats, m)
> > -#define VIRTNET_RQ_STAT(m) offsetof(struct virtnet_rq_stats, m)
> > -
> >  static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
> > { "packets",VIRTNET_SQ_STAT(packets) },
> > { "bytes",  VIRTNET_SQ_STAT(bytes) },
> > @@ -127,80 +88,6 @@ static const struct virtnet_stat_desc 
> > virtnet_rq_stats_desc[] = {
> >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> >
> > -struct virtnet_interrupt_coalesce {
> > -   u32 max_packets;
> > -   u32 max_usecs;
> > -};
> > -
> > -/* The dma information of pages allocated at a time. */
> > -struct virtnet_rq_dma {
> > -   dma_addr_t addr;
> > -   u32 ref;
> > -   u16 len;
> > -   u16 need_sync;
> > -};
> > -
> > -/* Internal representation of a send virtqueue */
> > -struct send_queue {
> > -   /* Virtqueue associated with this send _queue */
> > -   struct virtqueue *vq;
> > -
> > -   /* TX: fragments + linear part + virtio header */
> > -   struct scatterlist sg[MAX_SKB_FRAGS + 2];
> > -
> > -   /* Name of the send queue: output.$index */
> > -   char name[16];
> > -
> > -   struct virtnet_sq_stats stats;
> > -
> > -   struct virtnet_interrupt_coalesce intr_coal;
> > -
> > -   struct napi_struct napi;
> > -
> > -   /* Record whether sq is in reset state. */
> > -   bool reset;
> > -};
> > -
> > -/* Internal representation of a receive virtqueue */
> > -struct receive_queue {
> > -   /* Virtqueue associated with this receive_queue */
> > -   struct virtqueue *vq;
> > -
> > -   struct napi_struct napi;
> > -
> > -   struct bpf_prog __rcu *xdp_prog;
> > -
> > -   struct virtnet_rq_stats stats;
> > -
> > -   struct virtnet_interrupt_coalesce intr_coal;
> > -
> > -   /* Chain pages by the private 

Re: [PATCH v5 9/9] drm: ci: Update xfails

2023-10-19 Thread Daniel Stone
Hi Vignesh,

On Thu, 19 Oct 2023 at 09:07, Vignesh Raman  wrote:
> +# Some tests crashes with malloc error and IGT tests floods
> +# the CI log with error messages and we end up with a warning message
> +# Job's log exceeded limit of 4194304 bytes.
> +# Job execution will continue but no more output will be collected.

This is just from GitLab warning that we have a huge log, so not
related to the actual fails here.

> +# Below is the error log:
> +# malloc(): corrupted top size
> +# Received signal SIGABRT.
> +# Stack trace:
> +#  #0 [fatal_sig_handler+0x17b]
> +#  #1 [__sigaction+0x40]
> +#  #2 [pthread_key_delete+0x14c]
> +#  #3 [gsignal+0x12]
> +#  #4 [abort+0xd3]
> +#  #5 [__fsetlocking+0x290]
> +#  #6 [timer_settime+0x37a]
> +#  #7 [__default_morecore+0x1f1b]
> +#  #8 [__libc_calloc+0x161]
> +#  #9 [drmModeGetPlaneResources+0x44]
> +#  #10 [igt_display_require+0x194]
> +#  #11 [__igt_uniquereal_main1356+0x93c]
> +#  #12 [main+0x3f]
> +#  #13 [__libc_init_first+0x8a]
> +#  #14 [__libc_start_main+0x85]
> +#  #15 [_start+0x21]
> +# malloc(): corrupted top size

By the time we get this error, it indicates that there was previously
memory corruption, but it is only being noticed at a later point. The
skip lists here are way too big - stuff like drm_read is common
testing not affected by virtio at all - so we really need to isolate
the test which is actually breaking things.

The way to do this would be to use valgrind to detect where the memory
corruption is. VirtIO can be run locally so this is something you can
do on your machine.

Cheers,
Daniel
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] vhost-vdpa: reset vendor specific mapping to initial state in .release

2023-10-19 Thread Jason Wang
On Thu, Oct 19, 2023 at 2:47 PM Si-Wei Liu  wrote:
>
>
>
> On 10/18/2023 7:53 PM, Jason Wang wrote:
> > On Wed, Oct 18, 2023 at 4:49 PM Si-Wei Liu  wrote:
> >>
> >>
> >> On 10/18/2023 12:00 AM, Jason Wang wrote:
>  Unfortunately, it's a must to stick to ABI. I agree it's a mess but we
>  don't have a better choice. Or we can fail the probe if userspace
>  doesn't ack this feature.
> >>> Antoher idea we can just do the following in vhost_vdpa reset?
> >>>
> >>> config->reset()
> >>> if (IOTLB_PERSIST is not set) {
> >>>   config->reset_map()
> >>> }
> >>>
> >>> Then we don't have the burden to maintain them in the parent?
> >>>
> >>> Thanks
> >> Please see my earlier response in the other email, thanks.
> >>
> >> %<%<
> >>
> >> First, the ideal fix would be to leave this reset_vendor_mappings()
> >> emulation code on the individual driver itself, which already has the
> >> broken behavior.
> > So the point is, not about whether the existing behavior is "broken"
> > or not.
> Hold on, I thought earlier we all agreed upon that the existing behavior
> of vendor driver self-clearing maps during .reset violates the vhost
> iotlb abstraction and also breaks the .set_map/.dma_map API. This is
> 100% buggy driver implementation itself that we should discourage or
> eliminate as much as possible (that's part of the goal for this series),

I'm not saying it's not an issue, what I'm saying is, if the fix
breaks another userspace, it's a new bug in the kernel. See what Linus
said in [1]

"If a change results in user programs breaking, it's a bug in the kernel."

> but here you seem to go existentialism and suggests the very opposite
> that every .set_map/.dma_map driver implementation, regardless being the
> current or the new/upcoming, should unconditionally try to emulate the
> broken reset behavior for the sake of not breaking older userspace.

Such "emulation" is not done at the parent level. New parents just
need to implement reset_map() or not. Everything could be done inside
vhost-vDPA as pseudo code that is shown above.

> Set
> aside the criteria and definition for how userspace can be broken, can
> we step back to the original question why we think it's broken, and what
> we can do to promote good driver implementation instead of discuss the
> implementation details?

I'm not sure I get the point of this question. I'm not saying we don't
need to fix, what I am saying is that such a fix must be done in a
negotiable way. And it's better if parents won't get any burden. It
can just decide to implement reset_map() or not.

> Reading the below response I found my major
> points are not heard even if written for quite a few times.

I try my best to not ignore any important things, but I can't promise
I will not miss any. I hope the above clarifies my points.

> It's not
> that I don't understand the importance of not breaking old userspace, I
> appreciate your questions and extra patience, however I do feel the
> "broken" part is very relevant to our discussion here.
> If it's broken (in the sense of vhost IOTLB API) that you agree, I think
> we should at least allow good driver implementations; and when you think
> about the possibility of those valid good driver cases
> (.set_map/.dma_map implementations that do not clear maps in .reset),
> you might be able to see why it's coded the way as it is now.
>
> >   It's about whether we could stick to the old behaviour without
> > too much cost. And I believe we could.
> >
> > And just to clarify here, reset_vendor_mappings() = config->reset_map()
> >
> >> But today there's no backend feature negotiation
> >> between vhost-vdpa and the parent driver. Do we want to send down the
> >> acked_backend_features to parent drivers?
> > There's no need to do that with the above code, or anything I missed here?
> >
> > config->reset()
> > if (IOTLB_PERSIST is not set) {
> >config->reset_map()
> > }
> Implementation issue: this implies reset_map() has to be there for every
> .set_map implementations, but vendor driver implementation for custom
> IOMMU could well implement DMA ops by itself instead of .reset_map. This
> won't work for every set_map driver (think about the vduse case).

Well let me do it once again, reset_map() is not mandated:

config->reset()
if (IOTLB_PERSIST is not set) {
if (config->reset_map)
  config->reset_map()
}

Did you see any issue with VDUSE in this case?

>
> But this is not the the point I was making. I think if you agree this is
> purely buggy driver implementation of its own, we should try to isolate
> this buggy behavior to individual driver rather than overload vhost-vdpa
> or vdpa core's role to help implement the emulation of broken driver
> behavior.

As I pointed out, if it is not noticeable in the userspace, that's
fine but it's not.

> I don't get why .reset is special here, the abuse of .reset to
> manipulate mapping could also happen in other IOMMU unrela

Re: [PATCH net-next v1 13/19] virtio_net: xsk: tx: virtnet_free_old_xmit() distinguishes xsk buffer

2023-10-19 Thread Michael S. Tsirkin
On Thu, Oct 19, 2023 at 03:13:48PM +0800, Xuan Zhuo wrote:
> On Thu, 19 Oct 2023 02:38:16 -0400, "Michael S. Tsirkin"  
> wrote:
> > On Tue, Oct 17, 2023 at 10:02:05AM +0800, Xuan Zhuo wrote:
> > > On Mon, 16 Oct 2023 16:44:34 -0700, Jakub Kicinski  
> > > wrote:
> > > > On Mon, 16 Oct 2023 20:00:27 +0800 Xuan Zhuo wrote:
> > > > > @@ -305,9 +311,15 @@ static inline void virtnet_free_old_xmit(struct 
> > > > > virtnet_sq *sq, bool in_napi,
> > > > >
> > > > >   stats->bytes += xdp_get_frame_len(frame);
> > > > >   xdp_return_frame(frame);
> > > > > + } else {
> > > > > + stats->bytes += virtnet_ptr_to_xsk(ptr);
> > > > > + ++xsknum;
> > > > >   }
> > > > >   stats->packets++;
> > > > >   }
> > > > > +
> > > > > + if (xsknum)
> > > > > + xsk_tx_completed(sq->xsk.pool, xsknum);
> > > > >  }
> > > >
> > > > sparse complains:
> > > >
> > > > drivers/net/virtio/virtio_net.h:322:41: warning: incorrect type in 
> > > > argument 1 (different address spaces)
> > > > drivers/net/virtio/virtio_net.h:322:41:expected struct 
> > > > xsk_buff_pool *pool
> > > > drivers/net/virtio/virtio_net.h:322:41:got struct xsk_buff_pool
> > > > [noderef] __rcu *pool
> > > >
> > > > please build test with W=1 C=1
> > >
> > > OK. I will add C=1 to may script.
> > >
> > > Thanks.
> >
> > And I hope we all understand, rcu has to be used properly it's not just
> > about casting the warning away.
> 
> 
> Yes. I see. I will use rcu_dereference() and rcu_read_xxx().
> 
> Thanks.

When you do, pls don't forget to add comments documenting what does
rcu_read_lock and synchronize_rcu.


-- 
MST

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] vsock: initialize the_virtio_vsock before using VQs

2023-10-19 Thread Stefano Garzarella

On Wed, Oct 18, 2023 at 09:32:47PM +0300, Alexandru Matei wrote:

Once VQs are filled with empty buffers and we kick the host, it can send
connection requests. If 'the_virtio_vsock' is not initialized before,
replies are silently dropped and do not reach the host.


Are replies really dropped, or do we just miss the notification?

Could the reverse now happen, i.e., the guest wants to send a connection 
request, finds the pointer assigned but can't use virtqueues because 
they haven't been initialized yet?


Perhaps to avoid your problem, we could just queue vsock->rx_work at the 
bottom of the probe to see if anything was queued in the meantime.


Nit: please use "vsock/virtio" to point out that this problem is of the 
virtio transport.


Thanks,
Stefano



Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on 
the_virtio_vsock")
Signed-off-by: Alexandru Matei 
---
net/vmw_vsock/virtio_transport.c | 7 ---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..eae0867133f8 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -658,12 +658,13 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->seqpacket_allow = true;

vdev->priv = vsock;
+   rcu_assign_pointer(the_virtio_vsock, vsock);

ret = virtio_vsock_vqs_init(vsock);
-   if (ret < 0)
+   if (ret < 0) {
+   rcu_assign_pointer(the_virtio_vsock, NULL);
goto out;
-
-   rcu_assign_pointer(the_virtio_vsock, vsock);
+   }

mutex_unlock(&the_virtio_vsock_mutex);

--
2.34.1




___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v3 1/5] x86/paravirt: move some functions and defines to alternative

2023-10-19 Thread Juergen Gross via Virtualization
As a preparation for replacing paravirt patching completely by
alternative patching, move some backend functions and #defines to
alternative code and header.

Signed-off-by: Juergen Gross 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/x86/include/asm/alternative.h| 16 
 arch/x86/include/asm/paravirt.h   | 12 -
 arch/x86/include/asm/paravirt_types.h |  4 +--
 arch/x86/include/asm/qspinlock_paravirt.h |  4 +--
 arch/x86/kernel/alternative.c | 10 
 arch/x86/kernel/kvm.c |  4 +--
 arch/x86/kernel/paravirt.c| 30 +++
 arch/x86/xen/irq.c|  2 +-
 8 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index 9c4da699e11a..67e50bd40bb8 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -330,6 +330,22 @@ static inline int alternatives_text_reserved(void *start, 
void *end)
  */
 #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
 
+/* Macro for creating assembler functions avoiding any C magic. */
+#define DEFINE_ASM_FUNC(func, instr, sec)  \
+   asm (".pushsection " #sec ", \"ax\"\n"  \
+".global " #func "\n\t"\
+".type " #func ", @function\n\t"   \
+ASM_FUNC_ALIGN "\n"\
+#func ":\n\t"  \
+ASM_ENDBR  \
+instr "\n\t"   \
+ASM_RET\
+".size " #func ", . - " #func "\n\t"   \
+".popsection")
+
+void x86_BUG(void);
+void x86_nop(void);
+
 #else /* __ASSEMBLY__ */
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6c8ff12140ae..ed5c7342f2ef 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -726,18 +726,6 @@ static __always_inline unsigned long 
arch_local_irq_save(void)
 #undef PVOP_VCALL4
 #undef PVOP_CALL4
 
-#define DEFINE_PARAVIRT_ASM(func, instr, sec)  \
-   asm (".pushsection " #sec ", \"ax\"\n"  \
-".global " #func "\n\t"\
-".type " #func ", @function\n\t"   \
-ASM_FUNC_ALIGN "\n"\
-#func ":\n\t"  \
-ASM_ENDBR  \
-instr "\n\t"   \
-ASM_RET\
-".size " #func ", . - " #func "\n\t"   \
-".popsection")
-
 extern void default_banner(void);
 void native_pv_lock_init(void) __init;
 
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 772d03487520..9f84dc074f88 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -542,8 +542,6 @@ int paravirt_disable_iospace(void);
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),\
 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
 
-void _paravirt_nop(void);
-void paravirt_BUG(void);
 unsigned long paravirt_ret0(void);
 #ifdef CONFIG_PARAVIRT_XXL
 u64 _paravirt_ident_64(u64);
@@ -553,7 +551,7 @@ void pv_native_irq_enable(void);
 unsigned long pv_native_read_cr2(void);
 #endif
 
-#define paravirt_nop   ((void *)_paravirt_nop)
+#define paravirt_nop   ((void *)x86_nop)
 
 extern struct paravirt_patch_site __parainstructions[],
__parainstructions_end[];
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h 
b/arch/x86/include/asm/qspinlock_paravirt.h
index 85b6e3609cb9..ef9697f20129 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -56,8 +56,8 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, 
".spinlock.text");
"pop%rdx\n\t"   \
FRAME_END
 
-DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock,
-   PV_UNLOCK_ASM, .spinlock.text);
+DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
+   PV_UNLOCK_ASM, .spinlock.text);
 
 #else /* CONFIG_64BIT */
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 73be3931e4f0..1c8dd8e05f3f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -385,6 +385,16 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, 
size_t src_len)
}
 }
 
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(x86_nop, "", .entry.text);
+EXPORT_SYMBOL_GPL(x86_nop);
+
+noinstr void x86_BUG(void)
+{
+   BUG();
+}
+EXPORT_SYMBOL_GPL(x86_BUG);
+
 /*
  * Replace instructions with better alternatives for this

[PATCH v3 0/5] x86/paravirt: Get rid of paravirt patching

2023-10-19 Thread Juergen Gross via Virtualization
This is a small series getting rid of paravirt patching by switching
completely to alternative patching for the same functionality.

The basic idea is to add the capability to switch from indirect to
direct calls via a special alternative patching option.

This removes _some_ of the paravirt macro maze, but most of it needs
to stay due to the need of hiding the call instructions from the
compiler in order to avoid needless register save/restore.

What is going away is the nasty stacking of alternative and paravirt
patching and (of course) the special .parainstructions linker section.

I have tested the series on bare metal and as Xen PV domain to still
work.

Note that objtool might need some changes to cope with the new
indirect call patching mechanism. Additionally some paravirt handling
can probably be removed from it.

Changes in V3:
- split v2 patch 3 into 2 patches as requested by Peter and Ingo

Changes in V2:
- split last patch into 2
- rebase of patch 2 as suggested by Peter
- addressed Peter's comments for patch 3

Juergen Gross (5):
  x86/paravirt: move some functions and defines to alternative
  x86/alternative: add indirect call patching
  x86/paravirt: introduce ALT_NOT_XEN
  x86/paravirt: switch mixed paravirt/alternative calls to alternative_2
  x86/paravirt: remove no longer needed paravirt patching code

 arch/x86/include/asm/alternative.h|  26 -
 arch/x86/include/asm/paravirt.h   |  79 +--
 arch/x86/include/asm/paravirt_types.h |  73 +++---
 arch/x86/include/asm/qspinlock_paravirt.h |   4 +-
 arch/x86/include/asm/text-patching.h  |  12 ---
 arch/x86/kernel/alternative.c | 116 ++
 arch/x86/kernel/callthunks.c  |  17 ++--
 arch/x86/kernel/kvm.c |   4 +-
 arch/x86/kernel/module.c  |  20 +---
 arch/x86/kernel/paravirt.c|  54 ++
 arch/x86/kernel/vmlinux.lds.S |  13 ---
 arch/x86/tools/relocs.c   |   2 +-
 arch/x86/xen/irq.c|   2 +-
 13 files changed, 137 insertions(+), 285 deletions(-)

-- 
2.35.3

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v3 3/5] x86/paravirt: introduce ALT_NOT_XEN

2023-10-19 Thread Juergen Gross via Virtualization
Introduce the macro ALT_NOT_XEN as a short form of
ALT_NOT(X86_FEATURE_XENPV).

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Juergen Gross 
---
V3:
- split off from next patch
---
 arch/x86/include/asm/paravirt.h   | 42 ---
 arch/x86/include/asm/paravirt_types.h |  3 ++
 2 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ed5c7342f2ef..3749311d51c3 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -142,8 +142,7 @@ static inline void write_cr0(unsigned long x)
 static __always_inline unsigned long read_cr2(void)
 {
return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
-   "mov %%cr2, %%rax;",
-   ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%cr2, %%rax;", ALT_NOT_XEN);
 }
 
 static __always_inline void write_cr2(unsigned long x)
@@ -154,13 +153,12 @@ static __always_inline void write_cr2(unsigned long x)
 static inline unsigned long __read_cr3(void)
 {
return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
- "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr3, %%rax;", ALT_NOT_XEN);
 }
 
 static inline void write_cr3(unsigned long x)
 {
-   PVOP_ALT_VCALL1(mmu.write_cr3, x,
-   "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
+   PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
 }
 
 static inline void __write_cr4(unsigned long x)
@@ -182,7 +180,7 @@ extern noinstr void pv_native_wbinvd(void);
 
 static __always_inline void wbinvd(void)
 {
-   PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
+   PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
 }
 
 static inline u64 paravirt_read_msr(unsigned msr)
@@ -390,27 +388,25 @@ static inline void paravirt_release_p4d(unsigned long pfn)
 static inline pte_t __pte(pteval_t val)
 {
return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
 }
 
 static inline pteval_t pte_val(pte_t pte)
 {
return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
-   "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%rax", ALT_NOT_XEN);
 }
 
 static inline pgd_t __pgd(pgdval_t val)
 {
return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
 }
 
 static inline pgdval_t pgd_val(pgd_t pgd)
 {
return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
-   "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%rax", ALT_NOT_XEN);
 }
 
 #define  __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -444,14 +440,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 static inline pmd_t __pmd(pmdval_t val)
 {
return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
 }
 
 static inline pmdval_t pmd_val(pmd_t pmd)
 {
return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
-   "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%rax", ALT_NOT_XEN);
 }
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
@@ -464,7 +459,7 @@ static inline pud_t __pud(pudval_t val)
pudval_t ret;
 
ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
-  "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+  "mov %%rdi, %%rax", ALT_NOT_XEN);
 
return (pud_t) { ret };
 }
@@ -472,7 +467,7 @@ static inline pud_t __pud(pudval_t val)
 static inline pudval_t pud_val(pud_t pud)
 {
return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
-   "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%rax", ALT_NOT_XEN);
 }
 
 static inline void pud_clear(pud_t *pudp)
@@ -492,8 +487,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
 static inline p4d_t __p4d(p4dval_t val)
 {
p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
-   "mov %%rdi, %%rax",
-   ALT_NOT(X86_FEATURE_XENPV));
+   "

[PATCH v3 4/5] x86/paravirt: switch mixed paravirt/alternative calls to alternative_2

2023-10-19 Thread Juergen Gross via Virtualization
Instead of stacking alternative and paravirt patching, use the new
ALT_FLAG_CALL flag to switch those mixed calls to pure alternative
handling.

This eliminates the need to be careful regarding the sequence of
alternative and paravirt patching.

For call depth tracking callthunks_setup() needs to be adapted to patch
calls at alternative patching sites instead of paravirt calls.

Signed-off-by: Juergen Gross 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/x86/include/asm/alternative.h|  5 +++--
 arch/x86/include/asm/paravirt.h   |  9 ++---
 arch/x86/include/asm/paravirt_types.h | 26 +-
 arch/x86/kernel/callthunks.c  | 17 -
 arch/x86/kernel/module.c  | 20 +---
 5 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index dd63b96577e9..f6c0ff678e2e 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -89,6 +89,8 @@ struct alt_instr {
u8  replacementlen; /* length of new instruction */
 } __packed;
 
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+
 /*
  * Debug flag that can be tested to see whether alternative
  * instructions were patched in already:
@@ -104,11 +106,10 @@ extern void apply_fineibt(s32 *start_retpoline, s32 
*end_retpoine,
  s32 *start_cfi, s32 *end_cfi);
 
 struct module;
-struct paravirt_patch_site;
 
 struct callthunk_sites {
s32 *call_start, *call_end;
-   struct paravirt_patch_site  *pv_start, *pv_end;
+   struct alt_instr*alt_start, *alt_end;
 };
 
 #ifdef CONFIG_CALL_THUNKS
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 3749311d51c3..9c6c5cfa9fe2 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -740,20 +740,23 @@ void native_pv_lock_init(void) __init;
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
 
 #define PARA_PATCH(off)((off) / 8)
 #define PARA_SITE(ptype, ops)  _PVSITE(ptype, ops, .quad, 8)
 #define PARA_INDIRECT(addr)*addr(%rip)
 
-#ifdef CONFIG_DEBUG_ENTRY
 .macro PARA_IRQ_save_fl
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
  ANNOTATE_RETPOLINE_SAFE;
  call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+   ANNOTATE_RETPOLINE_SAFE;
+   call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
 .endm
 
-#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
-   ALT_NOT_XEN
+#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;",  \
+ALT_CALL_INSTR, ALT_CALL_ALWAYS,   \
+"pushf; pop %rax;", ALT_NOT_XEN
 #endif
 #endif /* CONFIG_PARAVIRT_XXL */
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index e99db1360d2a..323dca625eea 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -278,15 +278,11 @@ extern struct paravirt_patch_template pv_ops;
 #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
 
 unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, 
unsigned int len);
+#define paravirt_ptr(op)   [paravirt_opptr] "m" (pv_ops.op)
 
 int paravirt_disable_iospace(void);
 
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
+/* This generates an indirect call based on the operation type number. */
 #define PARAVIRT_CALL  \
ANNOTATE_RETPOLINE_SAFE \
"call *%[paravirt_opptr];"
@@ -319,12 +315,6 @@ int paravirt_disable_iospace(void);
  * However, x86_64 also has to clobber all caller saved registers, which
  * unfortunately, are quite a bit (r8 - r11)
  *
- * The call instruction itself is marked by placing its start address
- * and size into the .parainstructions section, so that
- * apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend pv_init_ops
- * implementation.
- *
  * Unfortunately there's no way to get gcc to generate the args setup
  * for the call, and then allow the call itself to be generated by an
  * inline asm.  Because of this, we must do the complete arg setup and
@@ -428,9 +418,10 @@ int paravirt_disable_iospace(void);
({  \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
-   

[PATCH v3 5/5] x86/paravirt: remove no longer needed paravirt patching code

2023-10-19 Thread Juergen Gross via Virtualization
Now that paravirt is using the alternatives patching infrastructure,
remove the paravirt patching code.

Signed-off-by: Juergen Gross 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/x86/include/asm/paravirt.h   | 18 
 arch/x86/include/asm/paravirt_types.h | 40 
 arch/x86/include/asm/text-patching.h  | 12 -
 arch/x86/kernel/alternative.c | 66 +--
 arch/x86/kernel/paravirt.c| 30 
 arch/x86/kernel/vmlinux.lds.S | 13 --
 arch/x86/tools/relocs.c   |  2 +-
 7 files changed, 3 insertions(+), 178 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9c6c5cfa9fe2..f09acce9432c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -725,31 +725,13 @@ void native_pv_lock_init(void) __init;
 
 #else  /* __ASSEMBLY__ */
 
-#define _PVSITE(ptype, ops, word, algn)\
-771:;  \
-   ops;\
-772:;  \
-   .pushsection .parainstructions,"a"; \
-.align algn;   \
-word 771b; \
-.byte ptype;   \
-.byte 772b-771b;   \
-_ASM_ALIGN;\
-   .popsection
-
-
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT_XXL
 #ifdef CONFIG_DEBUG_ENTRY
 
-#define PARA_PATCH(off)((off) / 8)
-#define PARA_SITE(ptype, ops)  _PVSITE(ptype, ops, .quad, 8)
 #define PARA_INDIRECT(addr)*addr(%rip)
 
 .macro PARA_IRQ_save_fl
-   PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
- ANNOTATE_RETPOLINE_SAFE;
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
ANNOTATE_RETPOLINE_SAFE;
call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
 .endm
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 323dca625eea..756cb75d22b5 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -2,15 +2,6 @@
 #ifndef _ASM_X86_PARAVIRT_TYPES_H
 #define _ASM_X86_PARAVIRT_TYPES_H
 
-#ifndef __ASSEMBLY__
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
-   u8 *instr;  /* original instructions */
-   u8 type;/* type of this instruction */
-   u8 len; /* length of original instruction */
-};
-#endif
-
 #ifdef CONFIG_PARAVIRT
 
 #ifndef __ASSEMBLY__
@@ -250,34 +241,6 @@ struct paravirt_patch_template {
 extern struct pv_info pv_info;
 extern struct paravirt_patch_template pv_ops;
 
-#define PARAVIRT_PATCH(x)  \
-   (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
-
-#define paravirt_type(op)  \
-   [paravirt_typenum] "i" (PARAVIRT_PATCH(op)),\
-   [paravirt_opptr] "m" (pv_ops.op)
-/*
- * Generate some code, and mark it as patchable by the
- * apply_paravirt() alternate instruction patcher.
- */
-#define _paravirt_alt(insn_string, type)   \
-   "771:\n\t" insn_string "\n" "772:\n"\
-   ".pushsection .parainstructions,\"a\"\n"\
-   _ASM_ALIGN "\n" \
-   _ASM_PTR " 771b\n"  \
-   "  .byte " type "\n"\
-   "  .byte 772b-771b\n"   \
-   _ASM_ALIGN "\n" \
-   ".popsection\n"
-
-/* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string)  \
-   _paravirt_alt(insn_string, "%c[paravirt_typenum]")
-
-/* Simple instruction patching code. */
-#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, 
unsigned int len);
 #define paravirt_ptr(op)   [paravirt_opptr] "m" (pv_ops.op)
 
 int paravirt_disable_iospace(void);
@@ -545,9 +508,6 @@ unsigned long pv_native_read_cr2(void);
 
 #define paravirt_nop   ((void *)x86_nop)
 
-extern struct paravirt_patch_site __parainstructions[],
-   __parainstructions_end[];
-
 #endif /* __ASSEMBLY__ */
 
 #define ALT_NOT_XENALT_NOT(X86_FEATURE_XENPV)
diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index 29832c338cdc..0b70653a98c1 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -6,18 +6,6 @@
 #include 
 #include 
 
-struct paravirt_patch_site;
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
-   struct paravirt_patch_site *end);
-#else
-static inline void apply_paravirt(struct paravirt_patch_site *start,
- 

Re: [RFC v2 PATCH] vdpa_sim: implement .reset_map support

2023-10-19 Thread Stefano Garzarella

On Wed, Oct 18, 2023 at 04:47:48PM -0700, Si-Wei Liu wrote:



On 10/18/2023 1:05 AM, Stefano Garzarella wrote:

On Tue, Oct 17, 2023 at 10:11:33PM -0700, Si-Wei Liu wrote:

RFC only. Not tested on vdpa-sim-blk with user virtual address.
Works fine with vdpa-sim-net which uses physical address to map.

This patch is based on top of [1].

[1] 
https://lore.kernel.org/virtualization/1696928580-7520-1-git-send-email-si-wei@oracle.com/

Signed-off-by: Si-Wei Liu 

---
RFC v2:
 - initialize iotlb to passthrough mode in device add


I tested this version and I didn't see any issue ;-)

Great, thank you so much for your help on testing my patch, Stefano!


You're welcome :-)

Just for my own interest/curiosity, currently there's no vhost-vdpa 
backend client implemented for vdpa-sim-blk


Yep, we developed libblkio [1]. libblkio exposes common API to access 
block devices in userspace. It supports several drivers.
The one useful for this use case is `virtio-blk-vhost-vdpa`. Here [2] 
some examples on how to use the libblkio test suite with the 
vdpa-sim-blk.


Since QEMU 7.2, it supports libblkio drivers, so you can use the 
following options to attach a vdpa-blk device to a VM:


  -blockdev 
node-name=drive_src1,driver=virtio-blk-vhost-vdpa,path=/dev/vhost-vdpa-0,cache.direct=on
 \
  -device virtio-blk-pci,id=src1,bootindex=2,drive=drive_src1 \

For now only what we called slow-path [3][4] is supported, since the VQs 
are not directly exposed to the guest, but QEMU allocates other VQs 
(similar to shadow VQs for net) to support live-migration and QEMU 
storage features. Fast-path is on the agenda, but on pause for now.


or any vdpa block device in userspace as yet, correct? 


Do you mean with VDUSE?
In this case, yes, qemu-storage-daemon supports it, and can implement a 
virtio-blk in user space, exposing a disk image thorough VDUSE.


There is an example in libblkio as well [5] on how to start it.

So there was no test specific to vhost-vdpa that needs to be exercised, 
right?




I hope I answered above :-)
This reminded me that I need to write a blog post with all this 
information, I hope to do that soon!


Stefano

[1] https://gitlab.com/libblkio/libblkio
[2] 
https://gitlab.com/libblkio/libblkio/-/blob/main/tests/meson.build?ref_type=heads#L42
[3] 
https://kvmforum2022.sched.com/event/15jK5/qemu-storage-daemon-and-libblkio-exploring-new-shores-for-the-qemu-block-layer-kevin-wolf-stefano-garzarella-red-hat
[4] 
https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-software-offload-for-virtio-blk-stefano-garzarella-red-hat
[5] 
https://gitlab.com/libblkio/libblkio/-/blob/main/tests/meson.build?ref_type=heads#L58

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


PING: [PATCH] virtio-blk: fix implicit overflow on virtio_max_dma_size

2023-10-19 Thread zhenwei pi via Virtualization

Hi Michael,

This seems to have been ignored as you suggested.

LINK: https://www.spinics.net/lists/linux-virtualization/msg63015.html

On 9/4/23 14:10, zhenwei pi wrote:

The following code has an implicit conversion from size_t to u32:
(u32)max_size = (size_t)virtio_max_dma_size(vdev);

This may lead to overflow, e.g. (size_t)4G -> (u32)0. Once
virtio_max_dma_size() has a larger size than U32_MAX, use U32_MAX
instead.

Signed-off-by: zhenwei pi 
---
  drivers/block/virtio_blk.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1fe011676d07..4a4b9bad551e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1313,6 +1313,7 @@ static int virtblk_probe(struct virtio_device *vdev)
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
unsigned int queue_depth;
+   size_t max_dma_size;
  
  	if (!vdev->config->get) {

dev_err(&vdev->dev, "%s failure: config access disabled\n",
@@ -1411,7 +1412,8 @@ static int virtblk_probe(struct virtio_device *vdev)
/* No real sector limit. */
blk_queue_max_hw_sectors(q, UINT_MAX);
  
-	max_size = virtio_max_dma_size(vdev);

+   max_dma_size = virtio_max_dma_size(vdev);
+   max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
  
  	/* Host can optionally specify maximum segment size and number of

 * segments. */


--
zhenwei pi
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: PING: [PATCH] virtio-blk: fix implicit overflow on virtio_max_dma_size

2023-10-19 Thread Michael S. Tsirkin
On Thu, Oct 19, 2023 at 05:43:55PM +0800, zhenwei pi wrote:
> Hi Michael,
> 
> This seems to have been ignored as you suggested.
> 
> LINK: https://www.spinics.net/lists/linux-virtualization/msg63015.html

Pls Cc more widely then:

Paolo Bonzini  (reviewer:VIRTIO BLOCK AND SCSI DRIVERS)
Stefan Hajnoczi  (reviewer:VIRTIO BLOCK AND SCSI DRIVERS)
Xuan Zhuo  (reviewer:VIRTIO CORE AND NET DRIVERS)
Jens Axboe  (maintainer:BLOCK LAYER)
linux-bl...@vger.kernel.org (open list:BLOCK LAYER)

would all be good people to ask to review this.


> On 9/4/23 14:10, zhenwei pi wrote:
> > The following codes have an implicit conversion from size_t to u32:
> > (u32)max_size = (size_t)virtio_max_dma_size(vdev);
> > 
> > This may lead overflow, Ex (size_t)4G -> (u32)0. Once
> > virtio_max_dma_size() has a larger size than U32_MAX, use U32_MAX
> > instead.
> > 
> > Signed-off-by: zhenwei pi 
> > ---
> >   drivers/block/virtio_blk.c | 4 +++-
> >   1 file changed, 3 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> > index 1fe011676d07..4a4b9bad551e 100644
> > --- a/drivers/block/virtio_blk.c
> > +++ b/drivers/block/virtio_blk.c
> > @@ -1313,6 +1313,7 @@ static int virtblk_probe(struct virtio_device *vdev)
> > u16 min_io_size;
> > u8 physical_block_exp, alignment_offset;
> > unsigned int queue_depth;
> > +   size_t max_dma_size;
> > if (!vdev->config->get) {
> > dev_err(&vdev->dev, "%s failure: config access disabled\n",
> > @@ -1411,7 +1412,8 @@ static int virtblk_probe(struct virtio_device *vdev)
> > /* No real sector limit. */
> > blk_queue_max_hw_sectors(q, UINT_MAX);
> > -   max_size = virtio_max_dma_size(vdev);
> > +   max_dma_size = virtio_max_dma_size(vdev);
> > +   max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
> > /* Host can optionally specify maximum segment size and number of
> >  * segments. */
> 
> -- 
> zhenwei pi

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH 1/2] virtio_pci: Don't make an extra copy of cpu affinity mask

2023-10-19 Thread Jakub Sitnicki via Virtualization
Since commit 19e226e8cc5d ("virtio: Make vp_set_vq_affinity() take a
mask.") it is actually not needed to have a local copy of the cpu mask.

Pass the cpu mask we got as argument to set the irq affinity hint.

Cc: Caleb Raitto 
Signed-off-by: Jakub Sitnicki 
---
 drivers/virtio/virtio_pci_common.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index c2524a7207cf..8927bc338f06 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -433,21 +433,14 @@ int vp_set_vq_affinity(struct virtqueue *vq, const struct 
cpumask *cpu_mask)
struct virtio_device *vdev = vq->vdev;
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index];
-   struct cpumask *mask;
unsigned int irq;
 
if (!vq->callback)
return -EINVAL;
 
if (vp_dev->msix_enabled) {
-   mask = vp_dev->msix_affinity_masks[info->msix_vector];
irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
-   if (!cpu_mask)
-   irq_set_affinity_hint(irq, NULL);
-   else {
-   cpumask_copy(mask, cpu_mask);
-   irq_set_affinity_hint(irq, mask);
-   }
+   irq_set_affinity_hint(irq, cpu_mask);
}
return 0;
 }
-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH 2/2] virtio_pci: Switch away from deprecated irq_set_affinity_hint

2023-10-19 Thread Jakub Sitnicki via Virtualization
Since commit 65c7cdedeb30 ("genirq: Provide new interfaces for affinity
hints") irq_set_affinity_hint is being phased out.

Switch to new interfaces for setting and applying irq affinity hints.

Signed-off-by: Jakub Sitnicki 
---
 drivers/virtio/virtio_pci_common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 8927bc338f06..9fab99b540f1 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -242,7 +242,7 @@ void vp_del_vqs(struct virtio_device *vdev)
if (v != VIRTIO_MSI_NO_VECTOR) {
int irq = pci_irq_vector(vp_dev->pci_dev, v);
 
-   irq_set_affinity_hint(irq, NULL);
+   irq_update_affinity_hint(irq, NULL);
free_irq(irq, vq);
}
}
@@ -440,7 +440,7 @@ int vp_set_vq_affinity(struct virtqueue *vq, const struct 
cpumask *cpu_mask)
 
if (vp_dev->msix_enabled) {
irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
-   irq_set_affinity_hint(irq, cpu_mask);
+   irq_set_affinity_and_hint(irq, cpu_mask);
}
return 0;
 }
-- 
2.41.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3 1/5] x86/paravirt: move some functions and defines to alternative

2023-10-19 Thread kernel test robot
Hi Juergen,

kernel test robot noticed the following build warnings:

[auto build test WARNING on kvm/queue]
[cannot apply to tip/x86/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:
https://github.com/intel-lab-lkp/linux/commits/Juergen-Gross/x86-paravirt-move-some-functions-and-defines-to-alternative/20231019-171709
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:
https://lore.kernel.org/r/20231019091520.14540-2-jgross%40suse.com
patch subject: [PATCH v3 1/5] x86/paravirt: move some functions and defines to 
alternative
reproduce: 
(https://download.01.org/0day-ci/archive/20231019/202310191944.z8sc9h8o-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202310191944.z8sc9h8o-...@intel.com/

# many are suggestions rather than must-fix

ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in 
parentheses
#32: FILE: arch/x86/include/asm/alternative.h:334:
+#define DEFINE_ASM_FUNC(func, instr, sec)  \
+   asm (".pushsection " #sec ", \"ax\"\n"  \
+".global " #func "\n\t"\
+".type " #func ", @function\n\t"   \
+ASM_FUNC_ALIGN "\n"\
+#func ":\n\t"  \
+ASM_ENDBR  \
+instr "\n\t"   \
+ASM_RET\
+".size " #func ", . - " #func "\n\t"   \
+".popsection")

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RFC 1/1] x86/paravirt: introduce param to disable pv sched_clock

2023-10-19 Thread Vitaly Kuznetsov
Dongli Zhang  writes:

> As mentioned in the linux kernel development document, "sched_clock() is
> used for scheduling and timestamping". While there is a default native
> implementation, many paravirtualizations have their own implementations.
>
> About KVM, it uses kvm_sched_clock_read() and there is no way to only
> disable KVM's sched_clock. The "no-kvmclock" may disable all
> paravirtualized kvmclock features.
>
>  94 static inline void kvm_sched_clock_init(bool stable)
>  95 {
>  96 if (!stable)
>  97 clear_sched_clock_stable();
>  98 kvm_sched_clock_offset = kvm_clock_read();
>  99 paravirt_set_sched_clock(kvm_sched_clock_read);
> 100
> 101 pr_info("kvm-clock: using sched offset of %llu cycles",
> 102 kvm_sched_clock_offset);
> 103
> 104 BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
> 105 sizeof(((struct pvclock_vcpu_time_info 
> *)NULL)->system_time));
> 106 }
>
> There is a known issue that kvmclock may drift during vCPU hotplug [1].
> Although a temporary fix is available [2], we may need a way to disable pv
> sched_clock. Nowadays, the TSC is more stable and has less performance
> overhead than kvmclock.
>
> This is to propose to introduce a global param to disable pv sched_clock
> for all paravirtualizations.
>
> Please suggest and comment if other options are better:
>
> 1. Global param (this RFC patch).
>
> 2. The kvmclock specific param (e.g., "no-vmw-sched-clock" in vmware).
>
> Indeed I like the 2nd method.
>
> 3. Enforce native sched_clock only when TSC is invariant (hyper-v method).
>
> 4. Remove and cleanup pv sched_clock, and always use pv_sched_clock() for
> all (suggested by Peter Zijlstra in [3]). Some paravirtualizations may
> want to keep the pv sched_clock.

Normally, it should be up to the hypervisor to tell the guest which
clock to use, i.e. if TSC is reliable or not. Let me put my question
this way: if TSC on the particular host is good for everything, why
does the hypervisor advertise 'kvmclock' to its guests? If for some
'historical reasons' we can't revoke features we can always introduce a
new PV feature bit saying that TSC is preferred.

1) Global param doesn't sound like a good idea to me: chances are that
people will be setting it on their guest images to workaround problems
on one hypervisor (or, rather, on one public cloud which is too lazy to
fix their hypervisor) while simultaneously creating problems on another.

2) KVM specific parameter can work, but as KVM's sched_clock is the same
as kvmclock, I'm not convinced it actually makes sense to separate the
two. Like if sched_clock is known to be bad but TSC is good, why do we
need to use PV clock at all? Having a parameter for debugging purposes
may be OK though...

3) This is Hyper-V specific, you can see that it uses a dedicated PV bit
(HV_ACCESS_TSC_INVARIANT) and not the architectural
CPUID.8007H:EDX[8]. I'm not sure we can blindly trust the later on
all hypervisors.

4) Personally, I'm not sure that relying on 'TSC is crap' detection is
100% reliable. I can imagine cases when we can't detect that fact that
while synchronized across CPUs and not going backwards, it is, for
example, ticking with an unstable frequency and PV sched clock is
supposed to give the right correction (all of them are rdtsc() based
anyways, aren't they?).

>
> To introduce a param may be easier to backport to old kernel version.
>
> References:
> [1] 
> https://lore.kernel.org/all/20230926230649.67852-1-dongli.zh...@oracle.com/
> [2] https://lore.kernel.org/all/20231018195638.1898375-1-sea...@google.com/
> [3] 
> https://lore.kernel.org/all/20231002211651.ga3...@noisy.programming.kicks-ass.net/
>
> Thank you very much for the suggestion!
>
> Signed-off-by: Dongli Zhang 
> ---
>  arch/x86/include/asm/paravirt.h |  2 +-
>  arch/x86/kernel/kvmclock.c  | 12 +++-
>  arch/x86/kernel/paravirt.c  | 18 +-
>  3 files changed, 25 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index 6c8ff12140ae..f36edf608b6b 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -24,7 +24,7 @@ u64 dummy_sched_clock(void);
>  DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
>  DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
>  
> -void paravirt_set_sched_clock(u64 (*func)(void));
> +int paravirt_set_sched_clock(u64 (*func)(void));
>  
>  static __always_inline u64 paravirt_sched_clock(void)
>  {
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index fb8f52149be9..0b8bf5677d44 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -93,13 +93,15 @@ static noinstr u64 kvm_sched_clock_read(void)
>  
>  static inline void kvm_sched_clock_init(bool stable)
>  {
> - if (!stable)
> - clear_sched_clock_stable();
>   kvm_sched_clock_offset = kvm_clock_read();
> - p

Re: [PATCH v3 4/5] x86/paravirt: switch mixed paravirt/alternative calls to alternative_2

2023-10-19 Thread kernel test robot
Hi Juergen,

kernel test robot noticed the following build warnings:

[auto build test WARNING on kvm/queue]
[cannot apply to tip/x86/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:
https://github.com/intel-lab-lkp/linux/commits/Juergen-Gross/x86-paravirt-move-some-functions-and-defines-to-alternative/20231019-171709
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:
https://lore.kernel.org/r/20231019091520.14540-5-jgross%40suse.com
patch subject: [PATCH v3 4/5] x86/paravirt: switch mixed paravirt/alternative 
calls to alternative_2
reproduce: 
(https://download.01.org/0day-ci/archive/20231019/202310191920.r0c39s5h-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202310191920.r0c39s5h-...@intel.com/

# many are suggestions rather than must-fix

ERROR:BRACKET_SPACE: space prohibited before open square bracket '['
#92: FILE: arch/x86/include/asm/paravirt_types.h:281:
+#define paravirt_ptr(op)   [paravirt_opptr] "m" (pv_ops.op)

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3 5/5] x86/paravirt: remove no longer needed paravirt patching code

2023-10-19 Thread kernel test robot
Hi Juergen,

kernel test robot noticed the following build warnings:

[auto build test WARNING on kvm/queue]
[cannot apply to tip/x86/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:
https://github.com/intel-lab-lkp/linux/commits/Juergen-Gross/x86-paravirt-move-some-functions-and-defines-to-alternative/20231019-171709
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:
https://lore.kernel.org/r/20231019091520.14540-6-jgross%40suse.com
patch subject: [PATCH v3 5/5] x86/paravirt: remove no longer needed paravirt 
patching code
reproduce: 
(https://download.01.org/0day-ci/archive/20231019/202310191909.ussrxzxc-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202310191909.ussrxzxc-...@intel.com/

# many are suggestions rather than must-fix

WARNING:SPLIT_STRING: quoted string split across lines
#327: FILE: arch/x86/tools/relocs.c:69:
"__x86_cpu_dev_(start|end)|"
+   "__alt_instructions(_end)?|"

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/2] virtio_pci: Switch away from deprecated irq_set_affinity_hint

2023-10-19 Thread Xuan Zhuo
On Thu, 19 Oct 2023 12:16:25 +0200, Jakub Sitnicki  wrote:
> Since commit 65c7cdedeb30 ("genirq: Provide new interfaces for affinity
> hints") irq_set_affinity_hint is being phased out.
>
> Switch to new interfaces for setting and applying irq affinity hints.
>
> Signed-off-by: Jakub Sitnicki 

Reviewed-by: Xuan Zhuo 

> ---
>  drivers/virtio/virtio_pci_common.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/virtio/virtio_pci_common.c 
> b/drivers/virtio/virtio_pci_common.c
> index 8927bc338f06..9fab99b540f1 100644
> --- a/drivers/virtio/virtio_pci_common.c
> +++ b/drivers/virtio/virtio_pci_common.c
> @@ -242,7 +242,7 @@ void vp_del_vqs(struct virtio_device *vdev)
>   if (v != VIRTIO_MSI_NO_VECTOR) {
>   int irq = pci_irq_vector(vp_dev->pci_dev, v);
>
> - irq_set_affinity_hint(irq, NULL);
> + irq_update_affinity_hint(irq, NULL);
>   free_irq(irq, vq);
>   }
>   }
> @@ -440,7 +440,7 @@ int vp_set_vq_affinity(struct virtqueue *vq, const struct 
> cpumask *cpu_mask)
>
>   if (vp_dev->msix_enabled) {
>   irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
> - irq_set_affinity_hint(irq, cpu_mask);
> + irq_set_affinity_and_hint(irq, cpu_mask);
>   }
>   return 0;
>  }
> --
> 2.41.0
>
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/2] virtio_pci: Don't make an extra copy of cpu affinity mask

2023-10-19 Thread Xuan Zhuo
On Thu, 19 Oct 2023 12:16:24 +0200, Jakub Sitnicki  wrote:
> Since commit 19e226e8cc5d ("virtio: Make vp_set_vq_affinity() take a
> mask.") it is actually not needed to have a local copy of the cpu mask.


Could you give more info to prove this?

If you are right, I think you should delete all code about msix_affinity_masks?

Thanks.

>
> Pass the cpu mask we got as argument to set the irq affinity hint.
>
> Cc: Caleb Raitto 
> Signed-off-by: Jakub Sitnicki 
> ---
>  drivers/virtio/virtio_pci_common.c | 9 +
>  1 file changed, 1 insertion(+), 8 deletions(-)
>
> diff --git a/drivers/virtio/virtio_pci_common.c 
> b/drivers/virtio/virtio_pci_common.c
> index c2524a7207cf..8927bc338f06 100644
> --- a/drivers/virtio/virtio_pci_common.c
> +++ b/drivers/virtio/virtio_pci_common.c
> @@ -433,21 +433,14 @@ int vp_set_vq_affinity(struct virtqueue *vq, const 
> struct cpumask *cpu_mask)
>   struct virtio_device *vdev = vq->vdev;
>   struct virtio_pci_device *vp_dev = to_vp_device(vdev);
>   struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index];
> - struct cpumask *mask;
>   unsigned int irq;
>
>   if (!vq->callback)
>   return -EINVAL;
>
>   if (vp_dev->msix_enabled) {
> - mask = vp_dev->msix_affinity_masks[info->msix_vector];
>   irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
> - if (!cpu_mask)
> - irq_set_affinity_hint(irq, NULL);
> - else {
> - cpumask_copy(mask, cpu_mask);
> - irq_set_affinity_hint(irq, mask);
> - }
> + irq_set_affinity_hint(irq, cpu_mask);
>   }
>   return 0;
>  }
> --
> 2.41.0
>
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RFC 1/1] x86/paravirt: introduce param to disable pv sched_clock

2023-10-19 Thread David Woodhouse
On Thu, 2023-10-19 at 08:40 -0700, Sean Christopherson wrote:
> 
> > Normally, it should be up to the hypervisor to tell the guest which
> > clock to use, i.e. if TSC is reliable or not. Let me put my question
> > this way: if TSC on the particular host is good for everything, why
> > does the hypervisor advertises 'kvmclock' to its guests?
> 
> I suspect there are two reasons.
> 
>   1. As is likely the case in our fleet, no one revisited the set of 
> advertised
>  PV features when defining the VM shapes for a new generation of 
> hardware, or
>  whoever did the reviews wasn't aware that advertising kvmclock is 
> actually
>  suboptimal.  All the PV clock stuff in KVM is quite labyrinthian, so it's
>  not hard to imagine it getting overlooked.
> 
>   2. Legacy VMs.  If VMs have been running with a PV clock for years, forcing
>  them to switch to a new clocksource is high-risk, low-reward.

Doubly true for Xen guests (given that the Xen clocksource is identical
to the KVM clocksource).

> > If for some 'historical reasons' we can't revoke features we can always
> > introduce a new PV feature bit saying that TSC is preferred.

Don't we already have one? It's the PVCLOCK_TSC_STABLE_BIT. Why would a
guest ever use kvmclock if the PVCLOCK_TSC_STABLE_BIT is set?

The *point* in the kvmclock is that the hypervisor can mess with the
epoch/scaling to try to compensate for TSC brokenness as the host
scales/sleeps/etc.

And the *problem* with the kvmclock is that it does just that, even
when the host TSC hasn't done anything wrong and the kvmclock shouldn't
have changed at all.

If the PVCLOCK_TSC_STABLE_BIT is set, a guest should just use the guest
TSC directly without looking to the kvmclock for adjusting it.

No?




smime.p7s
Description: S/MIME cryptographic signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 2/4] vhost-vdpa: reset vendor specific mapping to initial state in .release

2023-10-19 Thread Si-Wei Liu



On 10/19/2023 7:39 AM, Eugenio Perez Martin wrote:

On Thu, Oct 19, 2023 at 10:27 AM Jason Wang  wrote:

On Thu, Oct 19, 2023 at 2:47 PM Si-Wei Liu  wrote:



On 10/18/2023 7:53 PM, Jason Wang wrote:

On Wed, Oct 18, 2023 at 4:49 PM Si-Wei Liu  wrote:


On 10/18/2023 12:00 AM, Jason Wang wrote:

Unfortunately, it's a must to stick to ABI. I agree it's a mess but we
don't have a better choice. Or we can fail the probe if userspace
doesn't ack this feature.

Antoher idea we can just do the following in vhost_vdpa reset?

config->reset()
if (IOTLB_PERSIST is not set) {
   config->reset_map()
}

Then we don't have the burden to maintain them in the parent?

Thanks

Please see my earlier response in the other email, thanks.

%<%<

First, the ideal fix would be to leave this reset_vendor_mappings()
emulation code on the individual driver itself, which already has the
broken behavior.

So the point is, not about whether the existing behavior is "broken"
or not.

Hold on, I thought earlier we all agreed upon that the existing behavior
of vendor driver self-clearing maps during .reset violates the vhost
iotlb abstraction and also breaks the .set_map/.dma_map API. This is
100% buggy driver implementation itself that we should discourage or
eliminate as much as possible (that's part of the goal for this series),

I'm not saying it's not an issue, what I'm saying is, if the fix
breaks another userspace, it's a new bug in the kernel. See what Linus
said in [1]

"If a change results in user programs breaking, it's a bug in the kernel."


but here you seem to go existentialism and suggests the very opposite
that every .set_map/.dma_map driver implementation, regardless being the
current or the new/upcoming, should unconditionally try to emulate the
broken reset behavior for the sake of not breaking older userspace.

Such "emulation" is not done at the parent level. New parents just
need to implement reset_map() or not. everything could be done inside
vhost-vDPA as pseudo code that is shown above.


Set
aside the criteria and definition for how userspace can be broken, can
we step back to the original question why we think it's broken, and what
we can do to promote good driver implementation instead of discuss the
implementation details?

I'm not sure I get the point of this question. I'm not saying we don't
need to fix, what I am saying is that such a fix must be done in a
negotiable way. And it's better if parents won't get any burden. It
can just decide to implement reset_map() or not.


Reading the below response I found my major
points are not heard even if written for quite a few times.

I try my best to not ignore any important things, but I can't promise
I will not miss any. I hope the above clarifies my points.


It's not
that I don't understand the importance of not breaking old userspace, I
appreciate your questions and extra patience, however I do feel the
"broken" part is very relevant to our discussion here.
If it's broken (in the sense of vhost IOTLB API) that you agree, I think
we should at least allow good driver implementations; and when you think
about the possibility of those valid good driver cases
(.set_map/.dma_map implementations that do not clear maps in .reset),
you might be able to see why it's coded the way as it is now.


   It's about whether we could stick to the old behaviour without
too much cost. And I believe we could.

And just to clarify here, reset_vendor_mappings() = config->reset_map()


But today there's no backend feature negotiation
between vhost-vdpa and the parent driver. Do we want to send down the
acked_backend_features to parent drivers?

There's no need to do that with the above code, or anything I missed here?

config->reset()
if (IOTLB_PERSIST is not set) {
config->reset_map()
}

Implementation issue: this implies reset_map() has to be there for every
.set_map implementations, but vendor driver implementation for custom
IOMMU could well implement DMA ops by itself instead of .reset_map. This
won't work for every set_map driver (think about the vduse case).

Well let me do it once again, reset_map() is not mandated:

config->reset()
if (IOTLB_PERSIST is not set) {
 if (config->reset_map)
   config->reset_map()

To avoid new parent drivers
I am afraid it's not just new parent drivers, but any well behaved 
driver today may well break userspace if we go with this forced emulation
code, if they have to implement reset_map for some reason (e.g. restored 
to 1:1 passthrough mapping or other default state in mapping). For new 
userspace and user driver we can guard against it using the 
IOTLB_PERSIST flag, but the above code would get a big chance to break 
setup with good driver and older userspace in practice.


And .reset_map implementation doesn't necessarily need to clear maps. 
For e.g. IOMMU API compliant driver that only needs simple DMA model for 
passthrough, all .reset_map has to do is toggle to 

Re: [PATCH 2/4] vhost-vdpa: reset vendor specific mapping to initial state in .release

2023-10-19 Thread Si-Wei Liu




On 10/17/2023 10:27 PM, Jason Wang wrote:

   If we do
this without a negotiation, IOTLB will not be clear but the Qemu will
try to re-program the IOTLB after reset. Which will break?

1) stick the exact old behaviour with just one line of check

It's not just one line of check here, the old behavior emulation has to
be done as Eugenio illustrated in the other email.

For vhost-vDPA it's just

if (IOTLB_PERSIST is acked by userspace)
 reset_map()
... and this reset_map in vhost_vdpa_cleanup can't be negotiable 
depending on IOTLB_PERSIST. Consider the case where user switches to 
virtio-vdpa after an older userspace using vhost-vdpa finished running. 
Even with buggy_virtio_reset_map in place it's unwarranted the vendor 
IOMMU can get back to the default state, e.g. ending with 1:1 
passthrough mapping. If not doing this unconditionally it will get a big 
chance to break userspace.


-Siwei



For parent, it's somehow similar:

during .reset()

if (IOTLB_PERSIST is not acked by userspace)
 reset_vendor_mappings()

Anything I missed here?


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RFC 1/1] x86/paravirt: introduce param to disable pv sched_clock

2023-10-19 Thread Dongli Zhang
Hi Vitaly, Sean and David,

On 10/19/23 08:40, Sean Christopherson wrote:
> On Thu, Oct 19, 2023, Vitaly Kuznetsov wrote:
>> Dongli Zhang  writes:
>>
>>> As mentioned in the linux kernel development document, "sched_clock() is
>>> used for scheduling and timestamping". While there is a default native
>>> implementation, many paravirtualizations have their own implementations.
>>>
>>> About KVM, it uses kvm_sched_clock_read() and there is no way to only
>>> disable KVM's sched_clock. The "no-kvmclock" may disable all
>>> paravirtualized kvmclock features.
> 
> ...
> 
>>> Please suggest and comment if other options are better:
>>>
>>> 1. Global param (this RFC patch).
>>>
>>> 2. The kvmclock specific param (e.g., "no-vmw-sched-clock" in vmware).
>>>
>>> Indeed I like the 2nd method.
>>>
>>> 3. Enforce native sched_clock only when TSC is invariant (hyper-v method).
>>>
>>> 4. Remove and cleanup pv sched_clock, and always use pv_sched_clock() for
>>> all (suggested by Peter Zijlstra in [3]). Some paravirtualizations may
>>> want to keep the pv sched_clock.
>>
>> Normally, it should be up to the hypervisor to tell the guest which
>> clock to use, i.e. if TSC is reliable or not. Let me put my question
>> this way: if TSC on the particular host is good for everything, why
>> does the hypervisor advertise 'kvmclock' to its guests?
> 
> I suspect there are two reasons.
> 
>   1. As is likely the case in our fleet, no one revisited the set of 
> advertised
>  PV features when defining the VM shapes for a new generation of 
> hardware, or
>  whoever did the reviews wasn't aware that advertising kvmclock is 
> actually
>  suboptimal.  All the PV clock stuff in KVM is quite labyrinthian, so it's
>  not hard to imagine it getting overlooked.
> 
>   2. Legacy VMs.  If VMs have been running with a PV clock for years, forcing
>  them to switch to a new clocksource is high-risk, low-reward.
> 
>> If for some 'historical reasons' we can't revoke features we can always
>> introduce a new PV feature bit saying that TSC is preferred.
>>
>> 1) Global param doesn't sound like a good idea to me: chances are that
>> people will be setting it on their guest images to workaround problems
>> on one hypervisor (or, rather, on one public cloud which is too lazy to
>> fix their hypervisor) while simultaneously creating problems on another.
>>
>> 2) KVM specific parameter can work, but as KVM's sched_clock is the same
>> as kvmclock, I'm not convinced it actually makes sense to separate the
>> two. Like if sched_clock is known to be bad but TSC is good, why do we
>> need to use PV clock at all? Having a parameter for debugging purposes
>> may be OK though...
>>
>> 3) This is Hyper-V specific, you can see that it uses a dedicated PV bit
>> (HV_ACCESS_TSC_INVARIANT) and not the architectural
>> CPUID.8007H:EDX[8]. I'm not sure we can blindly trust the later on
>> all hypervisors.
>>
>> 4) Personally, I'm not sure that relying on 'TSC is crap' detection is
>> 100% reliable. I can imagine cases when we can't detect that fact that
>> while synchronized across CPUs and not going backwards, it is, for
>> example, ticking with an unstable frequency and PV sched clock is
>> supposed to give the right correction (all of them are rdtsc() based
>> anyways, aren't they?).
> 
> Yeah, practically speaking, the only thing adding a knob to turn off using PV
> clocks for sched_clock will accomplish is creating an even bigger matrix of
> combinations that can cause problems, e.g. where guests end up using kvmclock
> timekeeping but not scheduling.
> 
> The explanation above and the links below fail to capture _the_ key point:
> Linux-as-a-guest already prioritizes the TSC over paravirt clocks as the 
> clocksource
> when the TSC is constant and nonstop (first spliced blob below).
> 
> What I suggested is that if the TSC is chosen over a PV clock as the 
> clocksource,
> then we have the kernel also override the sched_clock selection (second 
> spliced
> blob below).
> 
> That doesn't require the guest admin to opt-in, and doesn't create even more
> combinations to support.  It also provides for a smoother transition for when
> customers inevitably end up creating VMs on hosts that don't advertise 
> kvmclock
> (or any PV clock).

I would prefer to always leave the option to allow the guest admin to change the
decision, especially for diagnostic/workaround reason (although the kvmclock is
always buggy when tsc is buggy).


As a summary of discussion:

1. Vitaly Kuznetsov prefers global param, e.g., for the easy deployment of the
same guest image on different hypervisors.

2. Sean Christopherson prefers an automatic change of sched_clock when
clocksource is or not TSC.


However, the clocksource and TSC are different concepts.

1. The clocksource is an arch global concept. That is, all archs (e.g., x86,
arm, mips) share the same implementation to register/select clocksource. In
addition, something like HPET does not have sched_clock.


Re: [PATCH vhost v4 00/16] vdpa: Add support for vq descriptor mappings

2023-10-19 Thread Si-Wei Liu

For patches 05-16:

Reviewed-by: Si-Wei Liu 
Tested-by: Si-Wei Liu 

Thanks for the fixes!

On 10/18/2023 10:14 AM, Dragos Tatulea wrote:

This patch series adds support for vq descriptor table mappings which
are used to improve vdpa live migration downtime. The improvement comes
from using smaller mappings which take less time to create and destroy
in hw.

The first part adds the vdpa core changes from Si-Wei [0].

The second part adds support in mlx5_vdpa:
- Refactor the mr code to be able to cleanly add descriptor mappings.
- Add hardware descriptor mr support.
- Properly update iotlb for cvq during ASID switch.

Changes in v4:

- Improved the handling of empty iotlbs. See mlx5_vdpa_change_map
   section in patch "12/16 vdpa/mlx5: Improve mr update flow".
- Fixed a invalid usage of desc_group_mkey hw vq field when the
   capability is not there. See patch
   "15/16 vdpa/mlx5: Enable hw support for vq descriptor map".

Changes in v3:

- dup_iotlb now checks for src == dst case and returns an error.
- Renamed iotlb parameter in dup_iotlb to dst.
- Removed a redundant check of the asid value.
- Fixed a commit message.
- mlx5_ifc.h patch has been applied to mlx5-vhost tree. When applying
   this series please pull from that tree first.

Changes in v2:

- The "vdpa/mlx5: Enable hw support for vq descriptor mapping" change
   was split off into two patches to avoid merge conflicts into the tree
   of Linus.

   The first patch contains only changes for mlx5_ifc.h. This must be
   applied into the mlx5-vdpa tree [1] first. Once this patch is applied
   on mlx5-vdpa, the change has to be pulled from mlx5-vdpa into the vhost
   tree and only then the remaining patches can be applied.

[0] 
https://lore.kernel.org/virtualization/1694248959-13369-1-git-send-email-si-wei@oracle.com
[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/log/?h=mlx5-vhost

Dragos Tatulea (13):
   vdpa/mlx5: Expose descriptor group mkey hw capability
   vdpa/mlx5: Create helper function for dma mappings
   vdpa/mlx5: Decouple cvq iotlb handling from hw mapping code
   vdpa/mlx5: Take cvq iotlb lock during refresh
   vdpa/mlx5: Collapse "dvq" mr add/delete functions
   vdpa/mlx5: Rename mr destroy functions
   vdpa/mlx5: Allow creation/deletion of any given mr struct
   vdpa/mlx5: Move mr mutex out of mr struct
   vdpa/mlx5: Improve mr update flow
   vdpa/mlx5: Introduce mr for vq descriptor
   vdpa/mlx5: Enable hw support for vq descriptor mapping
   vdpa/mlx5: Make iotlb helper functions more generic
   vdpa/mlx5: Update cvq iotlb mapping on ASID change

Si-Wei Liu (3):
   vdpa: introduce dedicated descriptor group for virtqueue
   vhost-vdpa: introduce descriptor group backend feature
   vhost-vdpa: uAPI to get dedicated descriptor group id

  drivers/vdpa/mlx5/core/mlx5_vdpa.h |  31 +++--
  drivers/vdpa/mlx5/core/mr.c| 194 -
  drivers/vdpa/mlx5/core/resources.c |   6 +-
  drivers/vdpa/mlx5/net/mlx5_vnet.c  | 105 +++-
  drivers/vhost/vdpa.c   |  27 
  include/linux/mlx5/mlx5_ifc.h  |   8 +-
  include/linux/mlx5/mlx5_ifc_vdpa.h |   7 +-
  include/linux/vdpa.h   |  11 ++
  include/uapi/linux/vhost.h |   8 ++
  include/uapi/linux/vhost_types.h   |   5 +
  10 files changed, 272 insertions(+), 130 deletions(-)



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC v2 PATCH] vdpa_sim: implement .reset_map support

2023-10-19 Thread Si-Wei Liu



On 10/19/2023 2:29 AM, Stefano Garzarella wrote:

On Wed, Oct 18, 2023 at 04:47:48PM -0700, Si-Wei Liu wrote:



On 10/18/2023 1:05 AM, Stefano Garzarella wrote:

On Tue, Oct 17, 2023 at 10:11:33PM -0700, Si-Wei Liu wrote:

RFC only. Not tested on vdpa-sim-blk with user virtual address.
Works fine with vdpa-sim-net which uses physical address to map.

This patch is based on top of [1].

[1] 
https://lore.kernel.org/virtualization/1696928580-7520-1-git-send-email-si-wei@oracle.com/


Signed-off-by: Si-Wei Liu 

---
RFC v2:
 - initialize iotlb to passthrough mode in device add


I tested this version and I didn't see any issue ;-)

Great, thank you so much for your help on testing my patch, Stefano!


You're welcome :-)

Just for my own interest/curiosity, currently there's no vhost-vdpa 
backend client implemented for vdpa-sim-blk


Yep, we developed libblkio [1]. libblkio exposes common API to access 
block devices in userspace. It supports several drivers.
The one useful for this use case is `virtio-blk-vhost-vdpa`. Here [2] 
some examples on how to use the libblkio test suite with the 
vdpa-sim-blk.


Since QEMU 7.2, it supports libblkio drivers, so you can use the 
following options to attach a vdpa-blk device to a VM:


  -blockdev 
node-name=drive_src1,driver=virtio-blk-vhost-vdpa,path=/dev/vhost-vdpa-0,cache.direct=on 
\

  -device virtio-blk-pci,id=src1,bootindex=2,drive=drive_src1 \

For now only what we called slow-path [3][4] is supported, since the 
VQs are not directly exposed to the guest, but QEMU allocates other 
VQs (similar to shadow VQs for net) to support live-migration and QEMU 
storage features. Fast-path is on the agenda, but on pause for now.


or any vdpa block device in userspace as yet, correct? 


Do you mean with VDUSE?
In this case, yes, qemu-storage-daemon supports it, and can implement 
a virtio-blk in user space, exposing a disk image through VDUSE.


There is an example in libblkio as well [5] on how to start it.

So there was no test specific to vhost-vdpa that needs to be 
exercised, right?




I hope I answered above :-)
Definitely! This is exactly what I needed, it's really useful! Much 
appreciated for the detailed information!


I hadn't been aware of the latest status on libblkio drivers and qemu 
support since I last checked it (it was at some point right after KVM 
2022, sorry my knowledge too outdated). I followed your links below and 
checked a few things, looks my change shouldn't affect anything. Good to 
see all the desired pieces landed to QEMU and libblkio already as 
planned, great job done!


Cheers,
-Siwei

This reminded me that I need to write a blog post with all this 
information, I hope to do that soon!


Stefano

[1] https://gitlab.com/libblkio/libblkio
[2] 
https://gitlab.com/libblkio/libblkio/-/blob/main/tests/meson.build?ref_type=heads#L42
[3] 
https://kvmforum2022.sched.com/event/15jK5/qemu-storage-daemon-and-libblkio-exploring-new-shores-for-the-qemu-block-layer-kevin-wolf-stefano-garzarella-red-hat
[4] 
https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-software-offload-for-virtio-blk-stefano-garzarella-red-hat
[5] 
https://gitlab.com/libblkio/libblkio/-/blob/main/tests/meson.build?ref_type=heads#L58




___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: Re: PING: [PATCH] virtio-blk: fix implicit overflow on virtio_max_dma_size

2023-10-19 Thread zhenwei pi via Virtualization

Cc Paolo, Stefan, Xuan and linux-block.

On 10/19/23 17:52, Michael S. Tsirkin wrote:

On Thu, Oct 19, 2023 at 05:43:55PM +0800, zhenwei pi wrote:

Hi Michael,

This seems to have been ignored as you suggested.

LINK: https://www.spinics.net/lists/linux-virtualization/msg63015.html


Pls Cc more widely then:

Paolo Bonzini  (reviewer:VIRTIO BLOCK AND SCSI DRIVERS)
Stefan Hajnoczi  (reviewer:VIRTIO BLOCK AND SCSI DRIVERS)
Xuan Zhuo  (reviewer:VIRTIO CORE AND NET DRIVERS)
Jens Axboe  (maintainer:BLOCK LAYER)
linux-bl...@vger.kernel.org (open list:BLOCK LAYER)

would all be good people to ask to review this.



On 9/4/23 14:10, zhenwei pi wrote:

The following codes have an implicit conversion from size_t to u32:
(u32)max_size = (size_t)virtio_max_dma_size(vdev);

This may lead to overflow, e.g. (size_t)4G -> (u32)0. Once
virtio_max_dma_size() has a larger size than U32_MAX, use U32_MAX
instead.

Signed-off-by: zhenwei pi 
---
   drivers/block/virtio_blk.c | 4 +++-
   1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1fe011676d07..4a4b9bad551e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1313,6 +1313,7 @@ static int virtblk_probe(struct virtio_device *vdev)
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
unsigned int queue_depth;
+   size_t max_dma_size;
if (!vdev->config->get) {
dev_err(&vdev->dev, "%s failure: config access disabled\n",
@@ -1411,7 +1412,8 @@ static int virtblk_probe(struct virtio_device *vdev)
/* No real sector limit. */
blk_queue_max_hw_sectors(q, UINT_MAX);
-   max_size = virtio_max_dma_size(vdev);
+   max_dma_size = virtio_max_dma_size(vdev);
+   max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
/* Host can optionally specify maximum segment size and number of
 * segments. */


--
zhenwei pi




--
zhenwei pi
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/4] vhost-vdpa: reset vendor specific mapping to initial state in .release

2023-10-19 Thread Jason Wang
On Fri, Oct 20, 2023 at 6:28 AM Si-Wei Liu  wrote:
>
>
>
> On 10/19/2023 7:39 AM, Eugenio Perez Martin wrote:
> > On Thu, Oct 19, 2023 at 10:27 AM Jason Wang  wrote:
> >> On Thu, Oct 19, 2023 at 2:47 PM Si-Wei Liu  wrote:
> >>>
> >>>
> >>> On 10/18/2023 7:53 PM, Jason Wang wrote:
>  On Wed, Oct 18, 2023 at 4:49 PM Si-Wei Liu  wrote:
> >
> > On 10/18/2023 12:00 AM, Jason Wang wrote:
> >>> Unfortunately, it's a must to stick to ABI. I agree it's a mess but we
> >>> don't have a better choice. Or we can fail the probe if userspace
> >>> doesn't ack this feature.
> >> Another idea: we can just do the following in vhost_vdpa reset?
> >>
> >> config->reset()
> >> if (IOTLB_PERSIST is not set) {
> >>config->reset_map()
> >> }
> >>
> >> Then we don't have the burden to maintain them in the parent?
> >>
> >> Thanks
> > Please see my earlier response in the other email, thanks.
> >
> > %<%<
> >
> > First, the ideal fix would be to leave this reset_vendor_mappings()
> > emulation code on the individual driver itself, which already has the
> > broken behavior.
>  So the point is, not about whether the existing behavior is "broken"
>  or not.
> >>> Hold on, I thought earlier we all agreed upon that the existing behavior
> >>> of vendor driver self-clearing maps during .reset violates the vhost
> >>> iotlb abstraction and also breaks the .set_map/.dma_map API. This is
> >>> 100% buggy driver implementation itself that we should discourage or
> >>> eliminate as much as possible (that's part of the goal for this series),
> >> I'm not saying it's not an issue, what I'm saying is, if the fix
> >> breaks another userspace, it's a new bug in the kernel. See what Linus
> >> said in [1]
> >>
> >> "If a change results in user programs breaking, it's a bug in the kernel."
> >>
> >>> but here you seem to go existentialism and suggests the very opposite
> >>> that every .set_map/.dma_map driver implementation, regardless being the
> >>> current or the new/upcoming, should unconditionally try to emulate the
> >>> broken reset behavior for the sake of not breaking older userspace.
> >> Such "emulation" is not done at the parent level. New parents just
> >> need to implement reset_map() or not. everything could be done inside
> >> vhost-vDPA as pseudo code that is shown above.
> >>
> >>> Set
> >>> aside the criteria and definition for how userspace can be broken, can
> >>> we step back to the original question why we think it's broken, and what
> >>> we can do to promote good driver implementation instead of discuss the
> >>> implementation details?
> >> I'm not sure I get the point of this question. I'm not saying we don't
> >> need to fix, what I am saying is that such a fix must be done in a
> >> negotiable way. And it's better if parents won't get any burden. It
> >> can just decide to implement reset_map() or not.
> >>
> >>> Reading the below response I found my major
> >>> points are not heard even if written for quite a few times.
> >> I try my best to not ignore any important things, but I can't promise
> >> I will not miss any. I hope the above clarifies my points.
> >>
> >>> It's not
> >>> that I don't understand the importance of not breaking old userspace, I
> >>> appreciate your questions and extra patience, however I do feel the
> >>> "broken" part is very relevant to our discussion here.
> >>> If it's broken (in the sense of vhost IOTLB API) that you agree, I think
> >>> we should at least allow good driver implementations; and when you think
> >>> about the possibility of those valid good driver cases
> >>> (.set_map/.dma_map implementations that do not clear maps in .reset),
> >>> you might be able to see why it's coded the way as it is now.
> >>>
> It's about whether we could stick to the old behaviour without
>  too much cost. And I believe we could.
> 
>  And just to clarify here, reset_vendor_mappings() = config->reset_map()
> 
> > But today there's no backend feature negotiation
> > between vhost-vdpa and the parent driver. Do we want to send down the
> > acked_backend_features to parent drivers?
>  There's no need to do that with the above code, or anything I missed 
>  here?
> 
>  config->reset()
>  if (IOTLB_PERSIST is not set) {
>  config->reset_map()
>  }
> >>> Implementation issue: this implies reset_map() has to be there for every
> >>> .set_map implementations, but vendor driver implementation for custom
> >>> IOMMU could well implement DMA ops by itself instead of .reset_map. This
> >>> won't work for every set_map driver (think about the vduse case).
> >> Well let me do it once again, reset_map() is not mandated:
> >>
> >> config->reset()
> >> if (IOTLB_PERSIST is not set) {
> >>  if (config->reset_map)
> >>config->reset_map()
> > To avoid new parent drivers
> I am

Re: [PATCH 2/4] vhost-vdpa: reset vendor specific mapping to initial state in .release

2023-10-19 Thread Si-Wei Liu



On 10/19/2023 9:11 PM, Jason Wang wrote:

On Fri, Oct 20, 2023 at 6:28 AM Si-Wei Liu  wrote:



On 10/19/2023 7:39 AM, Eugenio Perez Martin wrote:

On Thu, Oct 19, 2023 at 10:27 AM Jason Wang  wrote:

On Thu, Oct 19, 2023 at 2:47 PM Si-Wei Liu  wrote:


On 10/18/2023 7:53 PM, Jason Wang wrote:

On Wed, Oct 18, 2023 at 4:49 PM Si-Wei Liu  wrote:

On 10/18/2023 12:00 AM, Jason Wang wrote:

Unfortunately, it's a must to stick to ABI. I agree it's a mess but we
don't have a better choice. Or we can fail the probe if userspace
doesn't ack this feature.

Another idea: we can just do the following in vhost_vdpa reset?

config->reset()
if (IOTLB_PERSIST is not set) {
config->reset_map()
}

Then we don't have the burden to maintain them in the parent?

Thanks

Please see my earlier response in the other email, thanks.

%<%<

First, the ideal fix would be to leave this reset_vendor_mappings()
emulation code on the individual driver itself, which already has the
broken behavior.

So the point is, not about whether the existing behavior is "broken"
or not.

Hold on, I thought earlier we all agreed upon that the existing behavior
of vendor driver self-clearing maps during .reset violates the vhost
iotlb abstraction and also breaks the .set_map/.dma_map API. This is
100% buggy driver implementation itself that we should discourage or
eliminate as much as possible (that's part of the goal for this series),

I'm not saying it's not an issue, what I'm saying is, if the fix
breaks another userspace, it's a new bug in the kernel. See what Linus
said in [1]

"If a change results in user programs breaking, it's a bug in the kernel."


but here you seem to go existentialism and suggests the very opposite
that every .set_map/.dma_map driver implementation, regardless being the
current or the new/upcoming, should unconditionally try to emulate the
broken reset behavior for the sake of not breaking older userspace.

Such "emulation" is not done at the parent level. New parents just
need to implement reset_map() or not. everything could be done inside
vhost-vDPA as pseudo code that is shown above.


Set
aside the criteria and definition for how userspace can be broken, can
we step back to the original question why we think it's broken, and what
we can do to promote good driver implementation instead of discuss the
implementation details?

I'm not sure I get the point of this question. I'm not saying we don't
need to fix, what I am saying is that such a fix must be done in a
negotiable way. And it's better if parents won't get any burden. It
can just decide to implement reset_map() or not.


Reading the below response I found my major
points are not heard even if written for quite a few times.

I try my best to not ignore any important things, but I can't promise
I will not miss any. I hope the above clarifies my points.


It's not
that I don't understand the importance of not breaking old userspace, I
appreciate your questions and extra patience, however I do feel the
"broken" part is very relevant to our discussion here.
If it's broken (in the sense of vhost IOTLB API) that you agree, I think
we should at least allow good driver implementations; and when you think
about the possibility of those valid good driver cases
(.set_map/.dma_map implementations that do not clear maps in .reset),
you might be able to see why it's coded the way as it is now.


It's about whether we could stick to the old behaviour without
too much cost. And I believe we could.

And just to clarify here, reset_vendor_mappings() = config->reset_map()


But today there's no backend feature negotiation
between vhost-vdpa and the parent driver. Do we want to send down the
acked_backend_features to parent drivers?

There's no need to do that with the above code, or anything I missed here?

config->reset()
if (IOTLB_PERSIST is not set) {
 config->reset_map()
}

Implementation issue: this implies reset_map() has to be there for every
.set_map implementations, but vendor driver implementation for custom
IOMMU could well implement DMA ops by itself instead of .reset_map. This
won't work for every set_map driver (think about the vduse case).

Well let me do it once again, reset_map() is not mandated:

config->reset()
if (IOTLB_PERSIST is not set) {
  if (config->reset_map)
config->reset_map()

To avoid new parent drivers

I am afraid it's not just new parent drivers, but any well behaved
driver today may well break userspace if we go with this forced emulation
code, if they have to implement reset_map for some reason (e.g. restored
to 1:1 passthrough mapping or other default state in mapping). For new
userspace and user driver we can guard against it using the
IOTLB_PERSIST flag, but the above code would get a big chance to break
setup with good driver and older userspace in practice.

And .reset_map implementation doesn't necessarily need to clear maps.
For e.g. IOMMU API compliant driv

Re: [PATCH net-next v1 08/19] virtio_net: sq support premapped mode

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> If the xsk is enabling, the xsk tx will share the send queue.
> But the xsk requires that the send queue use the premapped mode.
> So the send queue must support premapped mode.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/net/virtio/main.c   | 108 
>  drivers/net/virtio/virtio_net.h |  54 +++-
>  2 files changed, 149 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> index 8da84ea9bcbe..02d27101fef1 100644
> --- a/drivers/net/virtio/main.c
> +++ b/drivers/net/virtio/main.c
> @@ -514,20 +514,104 @@ static void *virtnet_rq_alloc(struct virtnet_rq *rq, 
> u32 size, gfp_t gfp)
> return buf;
>  }
>
> -static void virtnet_rq_set_premapped(struct virtnet_info *vi)
> +static int virtnet_sq_set_premapped(struct virtnet_sq *sq)
>  {
> -   int i;
> +   struct virtnet_sq_dma *d;
> +   int err, size, i;
>
> -   /* disable for big mode */
> -   if (!vi->mergeable_rx_bufs && vi->big_packets)
> -   return;

Not specific to this patch but any plan to fix the big mode?


> +   size = virtqueue_get_vring_size(sq->vq);
> +
> +   size += MAX_SKB_FRAGS + 2;
> +
> +   sq->dmainfo.head = kcalloc(size, sizeof(*sq->dmainfo.head), 
> GFP_KERNEL);
> +   if (!sq->dmainfo.head)
> +   return -ENOMEM;
> +
> +   err = virtqueue_set_dma_premapped(sq->vq);
> +   if (err) {
> +   kfree(sq->dmainfo.head);
> +   return err;
> +   }
> +
> +   sq->dmainfo.free = NULL;
> +
> +   sq->do_dma = true;
> +
> +   for (i = 0; i < size; ++i) {
> +   d = &sq->dmainfo.head[i];
> +
> +   d->next = sq->dmainfo.free;
> +   sq->dmainfo.free = d;
> +   }
> +
> +   return 0;
> +}
> +
> +static void virtnet_set_premapped(struct virtnet_info *vi)
> +{
> +   int i;
>
> for (i = 0; i < vi->max_queue_pairs; i++) {
> -   if (virtqueue_set_dma_premapped(vi->rq[i].vq))
> +   if (!virtnet_sq_set_premapped(&vi->sq[i]))
> +   vi->sq[i].do_dma = true;
> +
> +   /* disable for big mode */
> +   if (!vi->mergeable_rx_bufs && vi->big_packets)
> continue;
>
> -   vi->rq[i].do_dma = true;
> +   if (!virtqueue_set_dma_premapped(vi->rq[i].vq))
> +   vi->rq[i].do_dma = true;
> +   }
> +}
> +
> +static struct virtnet_sq_dma *virtnet_sq_map_sg(struct virtnet_sq *sq, int 
> nents, void *data)
> +{
> +   struct virtnet_sq_dma *d, *head;
> +   struct scatterlist *sg;
> +   int i;
> +
> +   head = NULL;
> +
> +   for_each_sg(sq->sg, sg, nents, i) {
> +   sg->dma_address = virtqueue_dma_map_single_attrs(sq->vq, 
> sg_virt(sg),
> +sg->length,
> +
> DMA_TO_DEVICE, 0);
> +   if (virtqueue_dma_mapping_error(sq->vq, sg->dma_address))
> +   goto err;
> +
> +   d = sq->dmainfo.free;
> +   sq->dmainfo.free = d->next;
> +
> +   d->addr = sg->dma_address;
> +   d->len = sg->length;
> +
> +   d->next = head;
> +   head = d;

It's really a pity that we need to duplicate those DMA metadata twice.
Could we invent a new API to just fetch it from the virtio core?

> +   }
> +
> +   head->data = data;
> +
> +   return (void *)((unsigned long)head | ((unsigned long)data & 
> VIRTIO_XMIT_DATA_MASK));

If we packed everything into dmainfo, we can leave the type (XDP vs
skb) there to avoid trick like packing it into the pointer here?

Thanks

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH net-next v1 09/19] virtio_net: xsk: bind/unbind xsk

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> This patch implement the logic of bind/unbind xsk pool to sq and rq.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/net/virtio/Makefile |   2 +-
>  drivers/net/virtio/main.c   |  10 +-
>  drivers/net/virtio/virtio_net.h |  18 
>  drivers/net/virtio/xsk.c| 186 
>  drivers/net/virtio/xsk.h|   7 ++
>  5 files changed, 216 insertions(+), 7 deletions(-)
>  create mode 100644 drivers/net/virtio/xsk.c
>  create mode 100644 drivers/net/virtio/xsk.h
>
> diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
> index 15ed7c97fd4f..8c2a884d2dba 100644
> --- a/drivers/net/virtio/Makefile
> +++ b/drivers/net/virtio/Makefile
> @@ -5,4 +5,4 @@
>
>  obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
>
> -virtio_net-y := main.o
> +virtio_net-y := main.o xsk.o
> diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> index 02d27101fef1..38733a782f12 100644
> --- a/drivers/net/virtio/main.c
> +++ b/drivers/net/virtio/main.c
> @@ -8,7 +8,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  #include 
>  #include 
> @@ -139,9 +138,6 @@ struct virtio_net_common_hdr {
> };
>  };
>
> -static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf);
> -static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
> -
>  static void *xdp_to_ptr(struct xdp_frame *ptr)
>  {
> return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
> @@ -3664,6 +3660,8 @@ static int virtnet_xdp(struct net_device *dev, struct 
> netdev_bpf *xdp)
> switch (xdp->command) {
> case XDP_SETUP_PROG:
> return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
> +   case XDP_SETUP_XSK_POOL:
> +   return virtnet_xsk_pool_setup(dev, xdp);
> default:
> return -EINVAL;
> }
> @@ -3849,7 +3847,7 @@ static void free_receive_page_frags(struct virtnet_info 
> *vi)
> }
>  }
>
> -static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
> +void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
>  {
> if (!virtnet_is_xdp_frame(buf))
> dev_kfree_skb(buf);
> @@ -3857,7 +3855,7 @@ static void virtnet_sq_free_unused_buf(struct virtqueue 
> *vq, void *buf)
> xdp_return_frame(virtnet_ptr_to_xdp(buf));
>  }
>
> -static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> +void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
>  {
> struct virtnet_info *vi = vq->vdev->priv;
> int i = vq2rxq(vq);
> diff --git a/drivers/net/virtio/virtio_net.h b/drivers/net/virtio/virtio_net.h
> index cc742756e19a..9e69b6c5921b 100644
> --- a/drivers/net/virtio/virtio_net.h
> +++ b/drivers/net/virtio/virtio_net.h
> @@ -5,6 +5,8 @@
>
>  #include 
>  #include 
> +#include 
> +#include 
>
>  #define VIRTIO_XDP_FLAGBIT(0)
>  #define VIRTIO_XMIT_DATA_MASK (VIRTIO_XDP_FLAG)
> @@ -94,6 +96,11 @@ struct virtnet_sq {
> bool do_dma;
>
> struct virtnet_sq_dma_head dmainfo;
> +   struct {
> +   struct xsk_buff_pool __rcu *pool;
> +
> +   dma_addr_t hdr_dma_address;
> +   } xsk;
>  };
>
>  /* Internal representation of a receive virtqueue */
> @@ -134,6 +141,13 @@ struct virtnet_rq {
>
> /* Do dma by self */
> bool do_dma;
> +
> +   struct {
> +   struct xsk_buff_pool __rcu *pool;
> +
> +   /* xdp rxq used by xsk */
> +   struct xdp_rxq_info xdp_rxq;
> +   } xsk;
>  };
>
>  struct virtnet_info {
> @@ -218,6 +232,8 @@ struct virtnet_info {
> struct failover *failover;
>  };
>
> +#include "xsk.h"
> +

Any reason we don't do it with other headers?

>  static inline bool virtnet_is_xdp_frame(void *ptr)
>  {
> return (unsigned long)ptr & VIRTIO_XDP_FLAG;
> @@ -308,4 +324,6 @@ void virtnet_rx_pause(struct virtnet_info *vi, struct 
> virtnet_rq *rq);
>  void virtnet_rx_resume(struct virtnet_info *vi, struct virtnet_rq *rq);
>  void virtnet_tx_pause(struct virtnet_info *vi, struct virtnet_sq *sq);
>  void virtnet_tx_resume(struct virtnet_info *vi, struct virtnet_sq *sq);
> +void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
> +void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf);
>  #endif
> diff --git a/drivers/net/virtio/xsk.c b/drivers/net/virtio/xsk.c
> new file mode 100644
> index ..01962a3f
> --- /dev/null
> +++ b/drivers/net/virtio/xsk.c
> @@ -0,0 +1,186 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * virtio-net xsk
> + */
> +
> +#include "virtio_net.h"
> +
> +static struct virtio_net_hdr_mrg_rxbuf xsk_hdr;
> +
> +static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct 
> virtnet_rq *rq,
> +   struct xsk_buff_pool *pool)
> +{
> +   int err, qindex;
> +
> +   qindex = rq - vi->rq;
> +
> +   if (poo

Re: [PATCH net-next v1 10/19] virtio_net: xsk: prevent disable tx napi

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> Since xsk's TX queue is consumed by TX NAPI, if sq is bound to xsk, then
> we must stop tx napi from being disabled.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks


> ---
>  drivers/net/virtio/main.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> index 38733a782f12..b320770e5f4e 100644
> --- a/drivers/net/virtio/main.c
> +++ b/drivers/net/virtio/main.c
> @@ -3203,7 +3203,7 @@ static int virtnet_set_coalesce(struct net_device *dev,
> struct netlink_ext_ack *extack)
>  {
> struct virtnet_info *vi = netdev_priv(dev);
> -   int ret, queue_number, napi_weight;
> +   int ret, queue_number, napi_weight, i;
> bool update_napi = false;
>
> /* Can't change NAPI weight if the link is up */
> @@ -3232,6 +3232,14 @@ static int virtnet_set_coalesce(struct net_device *dev,
> return ret;
>
> if (update_napi) {
> +   /* xsk xmit depends on the tx napi. So if xsk is active,
> +* prevent modifications to tx napi.
> +*/
> +   for (i = queue_number; i < vi->max_queue_pairs; i++) {
> +   if (rtnl_dereference(vi->sq[i].xsk.pool))
> +   return -EBUSY;
> +   }
> +
> for (; queue_number < vi->max_queue_pairs; queue_number++)
> vi->sq[queue_number].napi.weight = napi_weight;
> }
> --
> 2.32.0.3.g01195cf9f
>

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH net-next v1 11/19] virtio_net: xsk: tx: support tx

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> The driver's tx napi is very important for XSK. It is responsible for
> obtaining data from the XSK queue and sending it out.
>
> At the beginning, we need to trigger tx napi.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/net/virtio/main.c   |  18 +-
>  drivers/net/virtio/virtio_net.h |   3 +-
>  drivers/net/virtio/xsk.c| 108 
>  drivers/net/virtio/xsk.h|  13 
>  4 files changed, 140 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> index b320770e5f4e..a08429bef61f 100644
> --- a/drivers/net/virtio/main.c
> +++ b/drivers/net/virtio/main.c
> @@ -2054,7 +2054,9 @@ static int virtnet_poll_tx(struct napi_struct *napi, 
> int budget)
> struct virtnet_sq *sq = container_of(napi, struct virtnet_sq, napi);
> struct virtnet_info *vi = sq->vq->vdev->priv;
> unsigned int index = vq2txq(sq->vq);
> +   struct xsk_buff_pool *pool;
> struct netdev_queue *txq;
> +   int busy = 0;
> int opaque;
> bool done;
>
> @@ -2067,11 +2069,25 @@ static int virtnet_poll_tx(struct napi_struct *napi, 
> int budget)
> txq = netdev_get_tx_queue(vi->dev, index);
> __netif_tx_lock(txq, raw_smp_processor_id());
> virtqueue_disable_cb(sq->vq);
> -   free_old_xmit(sq, true);
> +
> +   rcu_read_lock();
> +   pool = rcu_dereference(sq->xsk.pool);
> +   if (pool) {
> +   busy |= virtnet_xsk_xmit(sq, pool, budget);
> +   rcu_read_unlock();
> +   } else {
> +   rcu_read_unlock();
> +   free_old_xmit(sq, true);
> +   }
>
> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> netif_tx_wake_queue(txq);
>
> +   if (busy) {
> +   __netif_tx_unlock(txq);
> +   return budget;
> +   }
> +
> opaque = virtqueue_enable_cb_prepare(sq->vq);
>
> done = napi_complete_done(napi, 0);
> diff --git a/drivers/net/virtio/virtio_net.h b/drivers/net/virtio/virtio_net.h
> index 9e69b6c5921b..3bbb1f5baad5 100644
> --- a/drivers/net/virtio/virtio_net.h
> +++ b/drivers/net/virtio/virtio_net.h
> @@ -9,7 +9,8 @@
>  #include 
>
>  #define VIRTIO_XDP_FLAGBIT(0)
> -#define VIRTIO_XMIT_DATA_MASK (VIRTIO_XDP_FLAG)
> +#define VIRTIO_XSK_FLAGBIT(1)
> +#define VIRTIO_XMIT_DATA_MASK (VIRTIO_XDP_FLAG | VIRTIO_XSK_FLAG)
>
>  /* RX packet size EWMA. The average packet size is used to determine the 
> packet
>   * buffer size when refilling RX rings. As the entire RX ring may be refilled
> diff --git a/drivers/net/virtio/xsk.c b/drivers/net/virtio/xsk.c
> index 01962a3f..0e775a9d270f 100644
> --- a/drivers/net/virtio/xsk.c
> +++ b/drivers/net/virtio/xsk.c
> @@ -7,6 +7,114 @@
>
>  static struct virtio_net_hdr_mrg_rxbuf xsk_hdr;
>
> +static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
> +{
> +   sg->dma_address = addr;
> +   sg->length = len;
> +}
> +
> +static void virtnet_xsk_check_queue(struct virtnet_sq *sq)
> +{
> +   struct virtnet_info *vi = sq->vq->vdev->priv;
> +   struct net_device *dev = vi->dev;
> +   int qnum = sq - vi->sq;
> +
> +   /* If it is a raw buffer queue, it does not check whether the status
> +* of the queue is stopped when sending. So there is no need to check
> +* the situation of the raw buffer queue.
> +*/
> +   if (virtnet_is_xdp_raw_buffer_queue(vi, qnum))
> +   return;
> +
> +   /* If this sq is not the exclusive queue of the current cpu,
> +* then it may be called by start_xmit, so check it running out
> +* of space.
> +*
> +* Stop the queue to avoid getting packets that we are
> +* then unable to transmit. Then wait the tx interrupt.
> +*/
> +   if (sq->vq->num_free < 2 + MAX_SKB_FRAGS)
> +   netif_stop_subqueue(dev, qnum);
> +}
> +
> +static int virtnet_xsk_xmit_one(struct virtnet_sq *sq,
> +   struct xsk_buff_pool *pool,
> +   struct xdp_desc *desc)
> +{
> +   struct virtnet_info *vi;
> +   dma_addr_t addr;
> +
> +   vi = sq->vq->vdev->priv;
> +
> +   addr = xsk_buff_raw_get_dma(pool, desc->addr);
> +   xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len);
> +
> +   sg_init_table(sq->sg, 2);
> +
> +   sg_fill_dma(sq->sg, sq->xsk.hdr_dma_address, vi->hdr_len);
> +   sg_fill_dma(sq->sg + 1, addr, desc->len);
> +
> +   return virtqueue_add_outbuf(sq->vq, sq->sg, 2,
> +   virtnet_xsk_to_ptr(desc->len), 
> GFP_ATOMIC);
> +}
> +
> +static int virtnet_xsk_xmit_batch(struct virtnet_sq *sq,
> + struct xsk_buff_pool *pool,
> + unsigned int budget,
> + struct virtnet_sq_stats *stats)
> +{
> +   str

Re: [PATCH net-next v1 12/19] virtio_net: xsk: tx: support wakeup

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> xsk wakeup is used to trigger the logic for xsk xmit by xsk framework or
> user.
>
> Virtio-Net does not support to actively generate an interruption, so it
> tries to trigger tx NAPI on the tx interrupt cpu.
>
> Consider the effect of cache. When interrupt triggers, it is
> generally fixed on a CPU. It is better to start TX Napi on the same
> CPU.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/net/virtio/main.c   |  3 ++
>  drivers/net/virtio/virtio_net.h |  8 +
>  drivers/net/virtio/xsk.c| 57 +
>  drivers/net/virtio/xsk.h|  1 +
>  4 files changed, 69 insertions(+)
>
> diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> index a08429bef61f..1a21352e 100644
> --- a/drivers/net/virtio/main.c
> +++ b/drivers/net/virtio/main.c
> @@ -2066,6 +2066,8 @@ static int virtnet_poll_tx(struct napi_struct *napi, 
> int budget)
> return 0;
> }
>
> +   sq->xsk.last_cpu = smp_processor_id();
> +
> txq = netdev_get_tx_queue(vi->dev, index);
> __netif_tx_lock(txq, raw_smp_processor_id());
> virtqueue_disable_cb(sq->vq);
> @@ -3770,6 +3772,7 @@ static const struct net_device_ops virtnet_netdev = {
> .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
> .ndo_bpf= virtnet_xdp,
> .ndo_xdp_xmit   = virtnet_xdp_xmit,
> +   .ndo_xsk_wakeup = virtnet_xsk_wakeup,
> .ndo_features_check = passthru_features_check,
> .ndo_get_phys_port_name = virtnet_get_phys_port_name,
> .ndo_set_features   = virtnet_set_features,
> diff --git a/drivers/net/virtio/virtio_net.h b/drivers/net/virtio/virtio_net.h
> index 3bbb1f5baad5..7c72a8bb1813 100644
> --- a/drivers/net/virtio/virtio_net.h
> +++ b/drivers/net/virtio/virtio_net.h
> @@ -101,6 +101,14 @@ struct virtnet_sq {
> struct xsk_buff_pool __rcu *pool;
>
> dma_addr_t hdr_dma_address;
> +
> +   u32 last_cpu;
> +   struct __call_single_data csd;
> +
> +   /* The lock to prevent the repeat of calling
> +* smp_call_function_single_async().
> +*/
> +   spinlock_t ipi_lock;
> } xsk;
>  };
>
> diff --git a/drivers/net/virtio/xsk.c b/drivers/net/virtio/xsk.c
> index 0e775a9d270f..973e783260c3 100644
> --- a/drivers/net/virtio/xsk.c
> +++ b/drivers/net/virtio/xsk.c
> @@ -115,6 +115,60 @@ bool virtnet_xsk_xmit(struct virtnet_sq *sq, struct 
> xsk_buff_pool *pool,
> return sent == budget;
>  }
>
> +static void virtnet_remote_napi_schedule(void *info)
> +{
> +   struct virtnet_sq *sq = info;
> +
> +   virtnet_vq_napi_schedule(&sq->napi, sq->vq);
> +}
> +
> +static void virtnet_remote_raise_napi(struct virtnet_sq *sq)
> +{
> +   u32 last_cpu, cur_cpu;
> +
> +   last_cpu = sq->xsk.last_cpu;
> +   cur_cpu = get_cpu();
> +
> +   /* On remote cpu, softirq will run automatically when ipi irq exit. On
> +* local cpu, smp_call_xxx will not trigger ipi interrupt, then 
> softirq
> +* cannot be triggered automatically. So Call local_bh_enable after to
> +* trigger softIRQ processing.
> +*/
> +   if (last_cpu == cur_cpu) {
> +   local_bh_disable();
> +   virtnet_vq_napi_schedule(&sq->napi, sq->vq);
> +   local_bh_enable();
> +   } else {
> +   if (spin_trylock(&sq->xsk.ipi_lock)) {
> +   smp_call_function_single_async(last_cpu, 
> &sq->xsk.csd);
> +   spin_unlock(&sq->xsk.ipi_lock);
> +   }
> +   }

Is there any number to show whether it's worth it for an IPI here? For
example, GVE doesn't do this.

Thanks


> +
> +   put_cpu();
> +}
> +
> +int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
> +{
> +   struct virtnet_info *vi = netdev_priv(dev);
> +   struct virtnet_sq *sq;
> +
> +   if (!netif_running(dev))
> +   return -ENETDOWN;
> +
> +   if (qid >= vi->curr_queue_pairs)
> +   return -EINVAL;
> +
> +   sq = &vi->sq[qid];
> +
> +   if (napi_if_scheduled_mark_missed(&sq->napi))
> +   return 0;
> +
> +   virtnet_remote_raise_napi(sq);
> +
> +   return 0;
> +}
> +
>  static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct 
> virtnet_rq *rq,
> struct xsk_buff_pool *pool)
>  {
> @@ -240,6 +294,9 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
>
> sq->xsk.hdr_dma_address = hdr_dma;
>
> +   INIT_CSD(&sq->xsk.csd, virtnet_remote_napi_schedule, sq);
> +   spin_lock_init(&sq->xsk.ipi_lock);
> +
> return 0;
>
>  err_sq:
> diff --git a/drivers/net/virtio/xsk.h b/drivers/net/virtio/xsk.h
> index 73ca8cd5308b..1bd19dcda649 100644
> --- a/drivers/net/virtio/xsk.h
> +++ b/drivers/net/virtio/xsk.h
> @@ -17,4 

Re: [PATCH net-next v1 14/19] virtio_net: xsk: tx: virtnet_sq_free_unused_buf() check xsk buffer

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> virtnet_sq_free_unused_buf() check xsk buffer.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks


> ---
>  drivers/net/virtio/main.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> index 1a21352e..58bb38f9b453 100644
> --- a/drivers/net/virtio/main.c
> +++ b/drivers/net/virtio/main.c
> @@ -3876,10 +3876,12 @@ static void free_receive_page_frags(struct 
> virtnet_info *vi)
>
>  void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
>  {
> -   if (!virtnet_is_xdp_frame(buf))
> +   if (virtnet_is_skb_ptr(buf))
> dev_kfree_skb(buf);
> -   else
> +   else if (virtnet_is_xdp_frame(buf))
> xdp_return_frame(virtnet_ptr_to_xdp(buf));
> +
> +   /* xsk buffer do not need handle. */
>  }
>
>  void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> --
> 2.32.0.3.g01195cf9f
>

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH net-next v1 15/19] virtio_net: xsk: rx: introduce add_recvbuf_xsk()

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> Implement the logic of filling vq with XSK buffer.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/net/virtio/main.c   | 13 +++
>  drivers/net/virtio/virtio_net.h |  5 +++
>  drivers/net/virtio/xsk.c| 66 -
>  drivers/net/virtio/xsk.h|  2 +
>  4 files changed, 85 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> index 58bb38f9b453..0e740447b142 100644
> --- a/drivers/net/virtio/main.c
> +++ b/drivers/net/virtio/main.c
> @@ -1787,9 +1787,20 @@ static int add_recvbuf_mergeable(struct virtnet_info 
> *vi,
>  static bool try_fill_recv(struct virtnet_info *vi, struct virtnet_rq *rq,
>   gfp_t gfp)
>  {
> +   struct xsk_buff_pool *pool;
> int err;
> bool oom;
>
> +   rcu_read_lock();

A question here: should we sync with refill work during rx_pause?

> +   pool = rcu_dereference(rq->xsk.pool);
> +   if (pool) {
> +   err = virtnet_add_recvbuf_xsk(vi, rq, pool, gfp);
> +   oom = err == -ENOMEM;
> +   rcu_read_unlock();
> +   goto kick;
> +   }
> +   rcu_read_unlock();

And if we synchronize with that there's probably no need for the rcu
and we can merge the logic with the following ones?

Thanks


> +
> do {
> if (vi->mergeable_rx_bufs)
> err = add_recvbuf_mergeable(vi, rq, gfp);
> @@ -1802,6 +1813,8 @@ static bool try_fill_recv(struct virtnet_info *vi, 
> struct virtnet_rq *rq,
> if (err)
> break;
> } while (rq->vq->num_free);
> +
> +kick:
> if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
> unsigned long flags;
>
> diff --git a/drivers/net/virtio/virtio_net.h b/drivers/net/virtio/virtio_net.h
> index d4e620a084f4..6e71622fca45 100644
> --- a/drivers/net/virtio/virtio_net.h
> +++ b/drivers/net/virtio/virtio_net.h
> @@ -156,6 +156,11 @@ struct virtnet_rq {
>
> /* xdp rxq used by xsk */
> struct xdp_rxq_info xdp_rxq;
> +
> +   struct xdp_buff **xsk_buffs;
> +   u32 nxt_idx;
> +   u32 num;
> +   u32 size;
> } xsk;
>  };
>
> diff --git a/drivers/net/virtio/xsk.c b/drivers/net/virtio/xsk.c
> index 973e783260c3..841fb078882a 100644
> --- a/drivers/net/virtio/xsk.c
> +++ b/drivers/net/virtio/xsk.c
> @@ -37,6 +37,58 @@ static void virtnet_xsk_check_queue(struct virtnet_sq *sq)
> netif_stop_subqueue(dev, qnum);
>  }
>
> +static int virtnet_add_recvbuf_batch(struct virtnet_info *vi, struct 
> virtnet_rq *rq,
> +struct xsk_buff_pool *pool, gfp_t gfp)
> +{
> +   struct xdp_buff **xsk_buffs;
> +   dma_addr_t addr;
> +   u32 len, i;
> +   int err = 0;
> +
> +   xsk_buffs = rq->xsk.xsk_buffs;
> +
> +   if (rq->xsk.nxt_idx >= rq->xsk.num) {
> +   rq->xsk.num = xsk_buff_alloc_batch(pool, xsk_buffs, 
> rq->xsk.size);
> +   if (!rq->xsk.num)
> +   return -ENOMEM;
> +   rq->xsk.nxt_idx = 0;
> +   }
> +
> +   while (rq->xsk.nxt_idx < rq->xsk.num) {
> +   i = rq->xsk.nxt_idx;
> +
> +   /* use the part of XDP_PACKET_HEADROOM as the virtnet hdr 
> space */
> +   addr = xsk_buff_xdp_get_dma(xsk_buffs[i]) - vi->hdr_len;
> +   len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len;
> +
> +   sg_init_table(rq->sg, 1);
> +   sg_fill_dma(rq->sg, addr, len);
> +
> +   err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, xsk_buffs[i], 
> gfp);
> +   if (err)
> +   return err;
> +
> +   rq->xsk.nxt_idx++;
> +   }
> +
> +   return 0;
> +}
> +
> +int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct virtnet_rq *rq,
> +   struct xsk_buff_pool *pool, gfp_t gfp)
> +{
> +   int err;
> +
> +   do {
> +   err = virtnet_add_recvbuf_batch(vi, rq, pool, gfp);
> +   if (err)
> +   return err;
> +
> +   } while (rq->vq->num_free);
> +
> +   return 0;
> +}
> +
>  static int virtnet_xsk_xmit_one(struct virtnet_sq *sq,
> struct xsk_buff_pool *pool,
> struct xdp_desc *desc)
> @@ -244,7 +296,7 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
> struct virtnet_sq *sq;
> struct device *dma_dev;
> dma_addr_t hdr_dma;
> -   int err;
> +   int err, size;
>
> /* In big_packets mode, xdp cannot work, so there is no need to
>  * initialize xsk of rq.
> @@ -276,6 +328,16 @@ static int virtnet_xsk_pool_enable(struct net_device 
> *dev,
> if (!dma_dev)
> return -EPERM;
>
> +   size = vir

Re: [PATCH net-next v1 16/19] virtio_net: xsk: rx: introduce receive_xsk() to recv xsk buffer

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> Implementing the logic of xsk rx. If this packet is not for XSK
> determined in XDP, then we need to copy once to generate a SKB.
> If it is for XSK, it is a zerocopy receive packet process.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/net/virtio/main.c   |  14 ++--
>  drivers/net/virtio/virtio_net.h |   4 ++
>  drivers/net/virtio/xsk.c| 120 
>  drivers/net/virtio/xsk.h|   4 ++
>  4 files changed, 137 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/virtio/main.c b/drivers/net/virtio/main.c
> index 0e740447b142..003dd67ab707 100644
> --- a/drivers/net/virtio/main.c
> +++ b/drivers/net/virtio/main.c
> @@ -822,10 +822,10 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> }
>  }
>
> -static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff 
> *xdp,
> -  struct net_device *dev,
> -  unsigned int *xdp_xmit,
> -  struct virtnet_rq_stats *stats)
> +int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
> +   struct net_device *dev,
> +   unsigned int *xdp_xmit,
> +   struct virtnet_rq_stats *stats)
>  {
> struct xdp_frame *xdpf;
> int err;
> @@ -1589,13 +1589,17 @@ static void receive_buf(struct virtnet_info *vi, 
> struct virtnet_rq *rq,
> return;
> }
>
> -   if (vi->mergeable_rx_bufs)
> +   rcu_read_lock();
> +   if (rcu_dereference(rq->xsk.pool))
> +   skb = virtnet_receive_xsk(dev, vi, rq, buf, len, xdp_xmit, 
> stats);
> +   else if (vi->mergeable_rx_bufs)
> skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
> stats);
> else if (vi->big_packets)
> skb = receive_big(dev, vi, rq, buf, len, stats);
> else
> skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, 
> stats);
> +   rcu_read_unlock();
>
> if (unlikely(!skb))
> return;
> diff --git a/drivers/net/virtio/virtio_net.h b/drivers/net/virtio/virtio_net.h
> index 6e71622fca45..fd7f34703c9b 100644
> --- a/drivers/net/virtio/virtio_net.h
> +++ b/drivers/net/virtio/virtio_net.h
> @@ -346,6 +346,10 @@ static inline bool 
> virtnet_is_xdp_raw_buffer_queue(struct virtnet_info *vi, int
> return false;
>  }
>
> +int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
> +   struct net_device *dev,
> +   unsigned int *xdp_xmit,
> +   struct virtnet_rq_stats *stats);
>  void virtnet_rx_pause(struct virtnet_info *vi, struct virtnet_rq *rq);
>  void virtnet_rx_resume(struct virtnet_info *vi, struct virtnet_rq *rq);
>  void virtnet_tx_pause(struct virtnet_info *vi, struct virtnet_sq *sq);
> diff --git a/drivers/net/virtio/xsk.c b/drivers/net/virtio/xsk.c
> index 841fb078882a..f1c64414fac9 100644
> --- a/drivers/net/virtio/xsk.c
> +++ b/drivers/net/virtio/xsk.c
> @@ -13,6 +13,18 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t 
> addr, u32 len)
> sg->length = len;
>  }
>
> +static unsigned int virtnet_receive_buf_num(struct virtnet_info *vi, char 
> *buf)
> +{
> +   struct virtio_net_hdr_mrg_rxbuf *hdr;
> +
> +   if (vi->mergeable_rx_bufs) {
> +   hdr = (struct virtio_net_hdr_mrg_rxbuf *)buf;
> +   return virtio16_to_cpu(vi->vdev, hdr->num_buffers);
> +   }
> +
> +   return 1;
> +}
> +
>  static void virtnet_xsk_check_queue(struct virtnet_sq *sq)
>  {
> struct virtnet_info *vi = sq->vq->vdev->priv;
> @@ -37,6 +49,114 @@ static void virtnet_xsk_check_queue(struct virtnet_sq *sq)
> netif_stop_subqueue(dev, qnum);
>  }
>
> +static void merge_drop_follow_xdp(struct net_device *dev,
> + struct virtnet_rq *rq,
> + u32 num_buf,
> + struct virtnet_rq_stats *stats)
> +{
> +   struct xdp_buff *xdp;
> +   u32 len;
> +
> +   while (num_buf-- > 1) {
> +   xdp = virtqueue_get_buf(rq->vq, &len);
> +   if (unlikely(!xdp)) {
> +   pr_debug("%s: rx error: %d buffers missing\n",
> +dev->name, num_buf);
> +   dev->stats.rx_length_errors++;
> +   break;
> +   }
> +   stats->bytes += len;
> +   xsk_buff_free(xdp);
> +   }
> +}
> +
> +static struct sk_buff *construct_skb(struct virtnet_rq *rq,
> +struct xdp_buff *xdp)
> +{
> +   unsigned int metasize = xdp->data - xdp->data_meta;
> +   struct sk_buff *skb;
> +   unsigned int size;
> +
> +   size = xdp->data_end - xdp->data_hard_start;
> +   skb 

Re: [PATCH net-next v1 18/19] virtio_net: update tx timeout record

2023-10-19 Thread Jason Wang
On Mon, Oct 16, 2023 at 8:01 PM Xuan Zhuo  wrote:
>
> If send queue sent some packets, we update the tx timeout
> record to prevent the tx timeout.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks


> ---
>  drivers/net/virtio/xsk.c | 10 ++
>  1 file changed, 10 insertions(+)
>
> diff --git a/drivers/net/virtio/xsk.c b/drivers/net/virtio/xsk.c
> index f1c64414fac9..5d3de505c56c 100644
> --- a/drivers/net/virtio/xsk.c
> +++ b/drivers/net/virtio/xsk.c
> @@ -274,6 +274,16 @@ bool virtnet_xsk_xmit(struct virtnet_sq *sq, struct 
> xsk_buff_pool *pool,
>
> virtnet_xsk_check_queue(sq);
>
> +   if (stats.packets) {
> +   struct netdev_queue *txq;
> +   struct virtnet_info *vi;
> +
> +   vi = sq->vq->vdev->priv;
> +
> +   txq = netdev_get_tx_queue(vi->dev, sq - vi->sq);
> +   txq_trans_cond_update(txq);
> +   }
> +
> u64_stats_update_begin(&sq->stats.syncp);
> sq->stats.packets += stats.packets;
> sq->stats.bytes += stats.bytes;
> --
> 2.32.0.3.g01195cf9f
>

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization