Re: [PATCH net-next v4 1/3] vsock: add support for SIOCOUTQ ioctl

2024-07-31 Thread Stefano Garzarella

On Tue, Jul 30, 2024 at 09:43:06PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Add support for ioctl(s) in AF_VSOCK.
The only ioctl implemented is SIOCOUTQ (an alias of TIOCOUTQ), which returns
the number of unsent bytes in the socket. This information is
transport-specific, so it is delegated to the transport via a callback.
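
For context, a userspace caller queries the counter roughly like this
(a minimal sketch; error handling elided):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/sockios.h>

    static void print_unsent(int fd) /* fd: a connected AF_VSOCK socket */
    {
            int unsent;

            if (!ioctl(fd, SIOCOUTQ, &unsent))
                    printf("%d bytes not yet sent\n", unsent);
    }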

Suggested-by: Daan De Meyer 
Signed-off-by: Luigi Leonardi 
---
include/net/af_vsock.h   |  3 +++
net/vmw_vsock/af_vsock.c | 58 +---
2 files changed, 58 insertions(+), 3 deletions(-)


LGTM!

Reviewed-by: Stefano Garzarella 



diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 535701efc1e5..fc504d2da3d0 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -169,6 +169,9 @@ struct vsock_transport {
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);

+   /* SIOCOUTQ ioctl */
+   ssize_t (*unsent_bytes)(struct vsock_sock *vsk);
+
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 4b040285aa78..58e639e82942 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -112,6 +112,7 @@
#include 
#include 
#include 
+#include 

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@@ -1292,6 +1293,57 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
}
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);

+static int vsock_do_ioctl(struct socket *sock, unsigned int cmd,
+ int __user *arg)
+{
+   struct sock *sk = sock->sk;
+   struct vsock_sock *vsk;
+   int ret;
+
+   vsk = vsock_sk(sk);
+
+   switch (cmd) {
+   case SIOCOUTQ: {
+   ssize_t n_bytes;
+
+   if (!vsk->transport || !vsk->transport->unsent_bytes) {
+   ret = -EOPNOTSUPP;
+   break;
+   }
+
+   if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) {
+   ret = -EINVAL;
+   break;
+   }
+
+   n_bytes = vsk->transport->unsent_bytes(vsk);
+   if (n_bytes < 0) {
+   ret = n_bytes;
+   break;
+   }
+
+   ret = put_user(n_bytes, arg);
+   break;
+   }
+   default:
+   ret = -ENOIOCTLCMD;
+   }
+
+   return ret;
+}
+
+static int vsock_ioctl(struct socket *sock, unsigned int cmd,
+  unsigned long arg)
+{
+   int ret;
+
+   lock_sock(sock->sk);
+   ret = vsock_do_ioctl(sock, cmd, (int __user *)arg);
+   release_sock(sock->sk);
+
+   return ret;
+}
+
static const struct proto_ops vsock_dgram_ops = {
.family = PF_VSOCK,
.owner = THIS_MODULE,
@@ -1302,7 +1354,7 @@ static const struct proto_ops vsock_dgram_ops = {
.accept = sock_no_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = sock_no_listen,
.shutdown = vsock_shutdown,
.sendmsg = vsock_dgram_sendmsg,
@@ -2286,7 +2338,7 @@ static const struct proto_ops vsock_stream_ops = {
.accept = vsock_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = vsock_listen,
.shutdown = vsock_shutdown,
.setsockopt = vsock_connectible_setsockopt,
@@ -2308,7 +2360,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
.accept = vsock_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = vsock_listen,
.shutdown = vsock_shutdown,
.setsockopt = vsock_connectible_setsockopt,

--
2.45.2







Re: [PATCH net-next v4 2/3] vsock/virtio: add SIOCOUTQ support for all virtio based transports

2024-07-31 Thread Stefano Garzarella

On Tue, Jul 30, 2024 at 09:43:07PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Introduce the virtio_transport_unsent_bytes() callback, which backs the
SIOCOUTQ ioctl, for virtio_transport, vhost_vsock and vsock_loopback.

For all transports the unsent bytes counter is incremented
in virtio_transport_get_credit.

In virtio_transport (G2H) and in vhost-vsock (H2G) the counter
is decremented when the skbuff is consumed. In vsock_loopback the
same skbuff is passed from the transmitter to the receiver, so
the counter is decremented before queuing the skbuff to the
receiver.
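
For the loopback case this amounts to something like the following
(a sketch consistent with the description above; the skb is handed
over to the receiver rather than freed, hence consume == false):

    /* net/vmw_vsock/vsock_loopback.c (sketch) */
    virtio_transport_consume_skb_sent(skb, false);
    virtio_vsock_skb_queue_tail(&vsock->pkt_queue, skb);
    queue_work(vsock->workqueue, &vsock->pkt_work);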

Signed-off-by: Luigi Leonardi 
---
drivers/vhost/vsock.c   |  4 +++-
include/linux/virtio_vsock.h|  6 ++
net/vmw_vsock/virtio_transport.c|  4 +++-
net/vmw_vsock/virtio_transport_common.c | 35 +
net/vmw_vsock/vsock_loopback.c  |  6 ++
5 files changed, 53 insertions(+), 2 deletions(-)



Reviewed-by: Stefano Garzarella 




diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index bf664ec9341b..802153e23073 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
restart_tx = true;
}

-   consume_skb(skb);
+   virtio_transport_consume_skb_sent(skb, true);
}
} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
if (added)
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
	.notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat,

+   .unsent_bytes = virtio_transport_unsent_bytes,
+
.read_skb = virtio_transport_read_skb,
},

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c82089dee0c8..0387d64e2c66 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -133,6 +133,7 @@ struct virtio_vsock_sock {
u32 tx_cnt;
u32 peer_fwd_cnt;
u32 peer_buf_alloc;
+   size_t bytes_unsent;

/* Protected by rx_lock */
u32 fwd_cnt;
@@ -193,6 +194,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk);

+ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk);
+
+void virtio_transport_consume_skb_sent(struct sk_buff *skb,
+  bool consume);
+
int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 struct vsock_sock *psk);
int
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 64a07acfef12..e0160da4ef43 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -311,7 +311,7 @@ static void virtio_transport_tx_work(struct work_struct *work)

virtqueue_disable_cb(vq);
while ((skb = virtqueue_get_buf(vq, &len)) != NULL) {
-   consume_skb(skb);
+   virtio_transport_consume_skb_sent(skb, true);
added = true;
}
} while (!virtqueue_enable_cb(vq));
@@ -540,6 +540,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
	.notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat,

+   .unsent_bytes = virtio_transport_unsent_bytes,
+
.read_skb = virtio_transport_read_skb,
},

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 16ff976a86e3..884ee128851e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -463,6 +463,26 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *
}
EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);

+void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume)
+{
+   struct sock *s = skb->sk;
+
+   if (s && skb->len) {
+   struct vsock_sock *vs = vsock_sk(s);
+   struct virtio_vsock_sock *vvs;
+
+   vvs = vs->trans;
+
+   spin_lock_bh(&vvs->tx_lock);
+   vvs->bytes_unsent -= skb->len;
+   spin_unlock_bh(&vvs->tx_lock);
+   }
+
+   if (consume)
+   consume_skb(skb);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent);
+
u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
{
u32 ret;
@@ -475,6 +495,7 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
if (ret > credit)
ret = credit;
vvs->tx_cnt += ret;
+	vvs->bytes_unsent += ret;

Re: [PATCH net-next v4 3/3] test/vsock: add ioctl unsent bytes test

2024-07-31 Thread Stefano Garzarella

On Tue, Jul 30, 2024 at 09:43:08PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Introduce two tests, one for SOCK_STREAM and one for SOCK_SEQPACKET,
which use the SIOCOUTQ ioctl to check that the number of unsent bytes
is zero after delivering a packet.

vsock_connect and vsock_accept are no longer static: this is to
create more generic tests, allowing code to be reused for SEQPACKET
and STREAM.


Yeah, good idea. We should use them for other tests as well.
(for the future)



Signed-off-by: Luigi Leonardi 
---
tools/testing/vsock/util.c   |  6 +--
tools/testing/vsock/util.h   |  3 ++
tools/testing/vsock/vsock_test.c | 85 
3 files changed, 91 insertions(+), 3 deletions(-)


LGTM and I ran them. All good :-)

Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 554b290fefdc..a3d448a075e3 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po
}

/* Connect to  and return the file descriptor. */
-static int vsock_connect(unsigned int cid, unsigned int port, int type)
+int vsock_connect(unsigned int cid, unsigned int port, int type)
{
union {
struct sockaddr sa;
@@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int port, int type)
/* Listen on  and return the first incoming connection.  The remote
 * address is stored to clientaddrp.  clientaddrp may be NULL.
 */
-static int vsock_accept(unsigned int cid, unsigned int port,
-   struct sockaddr_vm *clientaddrp, int type)
+int vsock_accept(unsigned int cid, unsigned int port,
+struct sockaddr_vm *clientaddrp, int type)
{
union {
struct sockaddr sa;
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index e95e62485959..fff22d4a14c0 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -39,6 +39,9 @@ struct test_case {
void init_signals(void);
unsigned int parse_cid(const char *str);
unsigned int parse_port(const char *str);
+int vsock_connect(unsigned int cid, unsigned int port, int type);
+int vsock_accept(unsigned int cid, unsigned int port,
+struct sockaddr_vm *clientaddrp, int type);
int vsock_stream_connect(unsigned int cid, unsigned int port);
int vsock_bind_connect(unsigned int cid, unsigned int port,
   unsigned int bind_port, int type);
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index f851f8961247..8d38dbf8f41f 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -20,6 +20,8 @@
#include 
#include 
#include 
+#include 
+#include 

#include "vsock_test_zerocopy.h"
#include "timeout.h"
@@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct test_opts *opts)
}
}

+#define MSG_BUF_IOCTL_LEN 64
+static void test_unsent_bytes_server(const struct test_opts *opts, int type)
+{
+   unsigned char buf[MSG_BUF_IOCTL_LEN];
+   int client_fd;
+
+   client_fd = vsock_accept(VMADDR_CID_ANY, opts->peer_port, NULL, type);
+   if (client_fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf));
+   control_writeln("RECEIVED");
+
+   close(client_fd);
+}
+
+static void test_unsent_bytes_client(const struct test_opts *opts, int type)
+{
+   unsigned char buf[MSG_BUF_IOCTL_LEN];
+   int ret, fd, sock_bytes_unsent;
+
+   fd = vsock_connect(opts->peer_cid, opts->peer_port, type);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   for (int i = 0; i < sizeof(buf); i++)
+   buf[i] = rand() & 0xFF;
+
+   send_buf(fd, buf, sizeof(buf), 0, sizeof(buf));
+   control_expectln("RECEIVED");
+
+   ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent);
+   if (ret < 0) {
+   if (errno == EOPNOTSUPP) {
+   fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n");
+   } else {
+   perror("ioctl");
+   exit(EXIT_FAILURE);
+   }
+   } else if (ret == 0 && sock_bytes_unsent != 0) {
+   fprintf(stderr,
+   "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n",
+   sock_bytes_unsent);
+   exit(EXIT_FAILURE);
+   }
+
+   close(fd);
+}
+
+static void test_stream_unsent_bytes_client(const struct test_opts *opts)
+{
+   test_unsent_bytes_client(opts, SOCK_STREAM);
+}
+
+static void test_stream_unsent_bytes_server(const struct test_opts *opts)
+{
+   test_unsent_bytes_server(opts, SOCK_STREAM);
+}
+
+static void test_seqpacket_unsent_bytes_client(const struct test_opts *opts)
+{
+   test_un

Re: [PATCH net-next v4 2/2] vsock/virtio: avoid queuing packets when intermediate queue is empty

2024-07-31 Thread Stefano Garzarella

On Tue, Jul 30, 2024 at 09:47:32PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

When the driver needs to send new packets to the device, it always
queues the new sk_buffs into an intermediate queue (send_pkt_queue)
and schedules a worker (send_pkt_work) to then queue them into the
virtqueue exposed to the device.

This increases the chance of batching, but also introduces a lot of
latency into the communication. So we can optimize this path by
adding a fast path that is taken when there is no element in the
intermediate queue, there is space available in the virtqueue,
and no other process is sending packets (i.e. the tx_lock is free).

The following benchmarks were run to check improvements in latency and
throughput. The test bed is a host with Intel i7-10700KF CPU @ 3.80GHz
and L1 guest running on QEMU/KVM with vhost process and all vCPUs
pinned individually to pCPUs.

- Latency
  Tool: Fio version 3.37-56
  Mode: pingpong (h-g-h)
  Test runs: 50
  Runtime-per-test: 50s
  Type: SOCK_STREAM

In the following fio benchmark (pingpong mode) the host sends
a payload to the guest and waits for the same payload back.

fio process pinned both inside the host and the guest system.

Before: Linux 6.9.8

Payload 64B:

        1st perc.   overall   99th perc.
Before  12.91       16.78     42.24       us
After    9.77       13.57     39.17       us

Payload 512B:

        1st perc.   overall   99th perc.
Before  13.35       17.35     41.52       us
After   10.25       14.11     39.58       us

Payload 4K:

        1st perc.   overall   99th perc.
Before  14.71       19.87     41.52       us
After   10.51       14.96     40.81       us

- Throughput
  Tool: iperf-vsock

The size represents the buffer length (-l) to read/write
P represents the number of parallel streams
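
For reference, assuming the iperf-vsock fork's --vsock option, the runs
were of this shape (CID, size and stream count illustrative):

    guest$ iperf3 --vsock -s
    host$  iperf3 --vsock -c <guest-cid> -l 64K -P 2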

P=1
        4K     64K    128K
Before  6.87   29.3   29.5   Gb/s
After   10.5   39.4   39.9   Gb/s

P=2
        4K     64K    128K
Before  10.5   32.8   33.2   Gb/s
After   17.8   47.7   48.5   Gb/s

P=4
        4K     64K    128K
Before  12.7   33.6   34.2   Gb/s
After   16.9   48.1   50.5   Gb/s


Great improvement! Thanks again for this work!



The performance improvement comes from this optimization: I used an
eBPF kretprobe on virtio_transport_send_skb to check that each packet
was sent directly to the virtqueue.
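
Something along these lines (a bpftrace sketch, not necessarily the
exact probe used) counts the return values of the send function:

    # bpftrace -e 'kretprobe:virtio_transport_send_skb { @ret[retval] = count(); }'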

Co-developed-by: Marco Pinna 
Signed-off-by: Marco Pinna 
Signed-off-by: Luigi Leonardi 
---
net/vmw_vsock/virtio_transport.c | 39 +++
1 file changed, 35 insertions(+), 4 deletions(-)


All my comments have been resolved. I let iperf run bidirectionally for 
a long time and saw no problems, so:


Reviewed-by: Stefano Garzarella 




diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index f641e906f351..f992f9a216f0 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -208,6 +208,28 @@ virtio_transport_send_pkt_work(struct work_struct *work)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

+/* Caller needs to hold RCU for vsock.
+ * Returns 0 if the packet is successfully put on the vq.
+ */
+static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struct sk_buff *skb)
+{
+   struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];
+   int ret;
+
+   /* Inside RCU, can't sleep! */
+   ret = mutex_trylock(&vsock->tx_lock);
+   if (unlikely(ret == 0))
+   return -EBUSY;
+
+   ret = virtio_transport_send_skb(skb, vq, vsock);
+   if (ret == 0)
+   virtqueue_kick(vq);
+
+   mutex_unlock(&vsock->tx_lock);
+
+   return ret;
+}
+
static int
virtio_transport_send_pkt(struct sk_buff *skb)
{
@@ -231,11 +253,20 @@ virtio_transport_send_pkt(struct sk_buff *skb)
goto out_rcu;
}

-   if (virtio_vsock_skb_reply(skb))
-   atomic_inc(&vsock->queued_replies);
+   /* If send_pkt_queue is empty, we can safely bypass this queue
+    * (packet order is still maintained) and try to put the packet
+    * on the virtqueue using virtio_transport_send_skb_fast_path.
+    * If this fails we simply put the packet on the intermediate
+    * queue and schedule the worker.
+    */
+   if (!skb_queue_empty_lockless(&vsock->send_pkt_queue) ||
+   virtio_transport_send_skb_fast_path(vsock, skb)) {
+   if (virtio_vsock_skb_reply(skb))
+   atomic_inc(&vsock->queued_replies);

-   virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
-   queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+   virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
+   queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+   }

out_rcu:
rcu_read_unlock();

--
2.45.2







Re: [PATCH v2 5/5] uprobes: make uprobe_register() return struct uprobe *

2024-07-31 Thread Oleg Nesterov
On 07/31, Masami Hiramatsu wrote:
>
> On Mon, 29 Jul 2024 15:45:35 +0200
> Oleg Nesterov  wrote:
>
> > This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *"
> > rather than inode + offset. This simplifies the code and allows to avoid
> > the unnecessary find_uprobe() + put_uprobe() in these functions.
> >
> > TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that
> > this uprobe can't be freed before up_write(&uprobe->register_rwsem).
>
> Is this TODO item, or just a note? At this moment, this is natural
> to use get_uprobe() to protect uprobe itself.

3/3 from the next series removes the extra get_uprobe() + put_uprobe().

Initially the change said something like

This patch adds the additional get_uprobe/put_uprobe into _register,
the next patch will remove this.

But then decided to split this "next" patch and send it in another series.

Thanks,

Oleg.




[PATCH] tracing: Replace strncpy() with strscpy() when copying comm

2024-07-31 Thread Jinjie Ruan
Replace the deprecated[1] strncpy() calls with strscpy()
when copying comm.
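
For background, the semantic difference is roughly the following
(illustrative snippet, not part of the patch): strscpy() always
NUL-terminates the destination and reports truncation, while strncpy()
guarantees neither.

    char comm[TASK_COMM_LEN];

    /* strncpy: may leave comm unterminated if src fills the buffer. */
    strncpy(comm, src, TASK_COMM_LEN);

    /* strscpy: always NUL-terminates; returns the number of bytes
     * copied, or -E2BIG if src had to be truncated.
     */
    ssize_t len = strscpy(comm, src, TASK_COMM_LEN);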

Link: https://github.com/KSPP/linux/issues/90 [1]
Signed-off-by: Jinjie Ruan 
---
 kernel/trace/trace.c  | 2 +-
 kernel/trace/trace_events_hist.c  | 4 ++--
 kernel/trace/trace_sched_switch.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d0af984a5337..73cfdc704eec 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1907,7 +1907,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
 
-   strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
+   strscpy(max_data->comm, tsk->comm);
max_data->pid = tsk->pid;
/*
 * If tsk == current, then use current_uid(), as that does not use
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ece1308d36a..4ee0e64719fa 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1599,7 +1599,7 @@ static inline void save_comm(char *comm, struct task_struct *task)
return;
}
 
-   strncpy(comm, task->comm, TASK_COMM_LEN);
+   strscpy(comm, task->comm);
 }
 
 static void hist_elt_data_free(struct hist_elt_data *elt_data)
@@ -3405,7 +3405,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
elt_data = context->elt->private_data;
track_elt_data = track_data->elt.private_data;
if (elt_data->comm)
-   strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
+   strscpy(track_elt_data->comm, elt_data->comm);
 
track_data->updated = true;
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8a407adb0e1c..573b5d8e8a28 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -187,7 +187,7 @@ static inline char *get_saved_cmdlines(int idx)
 
 static inline void set_cmdline(int idx, const char *cmdline)
 {
-   strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+   strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
 }
 
 static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-- 
2.34.1




Re: [PATCH v2 4/5] uprobes: kill uprobe_register_refctr()

2024-07-31 Thread Oleg Nesterov
On 07/31, Masami Hiramatsu wrote:
>
> OK, but it seems
>
> tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
>
> is still using uprobe_register_refctr().
>
> That should be updated too.

OOPS, thanks a lot :/

I'll send v3 with the additional change below in reply to 4/5 in a minute.

Masami, Peter, please let me know if you want me to resend the whole series.

Oleg.

--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -458,8 +458,8 @@ static int testmod_register_uprobe(loff_t offset)
if (err)
goto out;
 
-   err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry),
-offset, 0, &uprobe.consumer);
+   err = uprobe_register(d_real_inode(uprobe.path.dentry),
+ offset, 0, &uprobe.consumer);
if (err)
path_put(&uprobe.path);
else




[PATCH v3 4/5] uprobes: kill uprobe_register_refctr()

2024-07-31 Thread Oleg Nesterov
It doesn't make any sense to have 2 versions of _register(). Note that
trace_uprobe_enable(), the only user of uprobe_register(), doesn't need
to check tu->ref_ctr_offset to decide which one should be used, it could
safely pass ref_ctr_offset == 0 to uprobe_register_refctr().

Add this argument to uprobe_register(), update the callers, and kill
uprobe_register_refctr().

Signed-off-by: Oleg Nesterov 
Acked-by: Andrii Nakryiko 
---
 include/linux/uprobes.h   |  9 ++-
 kernel/events/uprobes.c   | 24 +--
 kernel/trace/bpf_trace.c  |  8 +++
 kernel/trace/trace_uprobe.c   |  7 +-
 .../selftests/bpf/bpf_testmod/bpf_testmod.c   |  4 ++--
 5 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b503fafb7fb3..440316fbf3c6 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -110,8 +110,7 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
 extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
-extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
-extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
+extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
@@ -152,11 +151,7 @@ static inline void uprobes_init(void)
 #define uprobe_get_trap_addr(regs) instruction_pointer(regs)
 
 static inline int
-uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
-   return -ENOSYS;
-}
-static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
 {
return -ENOSYS;
 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dfe6306a63b1..b7f40bad8abc 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1121,25 +1121,26 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 EXPORT_SYMBOL_GPL(uprobe_unregister);
 
 /*
- * __uprobe_register - register a probe
+ * uprobe_register - register a probe
  * @inode: the file in which the probe has to be placed.
  * @offset: offset from the start of the file.
+ * @ref_ctr_offset: offset of SDT marker / reference counter
  * @uc: information on howto handle the probe..
  *
- * Apart from the access refcount, __uprobe_register() takes a creation
+ * Apart from the access refcount, uprobe_register() takes a creation
  * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
  * inserted into the rbtree (i.e first consumer for a @inode:@offset
  * tuple).  Creation refcount stops uprobe_unregister from freeing the
  * @uprobe even before the register operation is complete. Creation
  * refcount is released when the last @uc for the @uprobe
- * unregisters. Caller of __uprobe_register() is required to keep @inode
+ * unregisters. Caller of uprobe_register() is required to keep @inode
  * (and the containing mount) referenced.
  *
  * Return errno if it cannot successully install probes
  * else return 0 (success)
  */
-static int __uprobe_register(struct inode *inode, loff_t offset,
-loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset,
+   struct uprobe_consumer *uc)
 {
struct uprobe *uprobe;
int ret;
@@ -1189,21 +1190,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
goto retry;
return ret;
 }
-
-int uprobe_register(struct inode *inode, loff_t offset,
-   struct uprobe_consumer *uc)
-{
-   return __uprobe_register(inode, offset, 0, uc);
-}
 EXPORT_SYMBOL_GPL(uprobe_register);
 
-int uprobe_register_refctr(struct inode *inode, loff_t offset,
-  loff_t ref_ctr_offset, struct uprobe_consumer *uc)
-{
-   return __uprobe_register(inode, offset, ref_ctr_offset, uc);
-}
-EXPORT_SYMBOL_GPL(uprobe_register_refctr);
-
 /*
  * uprobe_apply - unregister an already registered probe.
  * @inode: the file in which the probe has to be removed.
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cd098846e251..afa909e17824 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ 

[PATCH v2 0/6] rtla: Support idle state disabling via libcpupower in timerlat

2024-07-31 Thread tglozar
From: Tomas Glozar 

rtla-timerlat allows reducing latency on wake up from idle by setting
/dev/cpu_dma_latency during the timerlat measurement. This has an effect on
the idle states of all CPUs, including those which are not used by timerlat.

Add option --deepest-idle-state that allows limiting the idle state only on cpus
where the timerlat measurement is running.

libcpupower is used to do the disabling of idle states via the corresponding
sysfs interface.
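
For example, limiting only the measured CPUs to idle state 0 looks like
this (CPU list illustrative):

    # rtla timerlat top -c 0-3 --deepest-idle-state 0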

v2:
- Split patch adding dependency on libcpupower to two patches, one for
libcpupower detection and one for rtla libcpupower dependency.
- Make building against libcpupower optional. rtla will throw an error
when built without libcpupower and --deepest-idle-state is used.
- Rename option from --disable-idle-states to --deepest-idle-state and
add an argument to choose the deepest idle state the CPU is allowed to
get into. -1 can be used to disable all idle states: this is useful on
non-ACPI platforms, where idle state 0 can be an actual idle state with
an exit latency rather than a representation of an active CPU, as with the
ACPI C0 state.

Note: It is also possible to retrieve the latency for individual idle states
of a cpu by calling cpuidle_state_latency. This could be used to implement
another rtla option that would take the maximum latency, like --dma-latency
does, and which would only take effect on CPUs used by timerlat.
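
Sketched, such an option could do something like this (assuming
libcpupower's cpuidle_state_latency(cpu, state) helper):

    unsigned int cpu = 0, state;
    unsigned long max_lat_us = 100;

    /* Disable every idle state whose exit latency exceeds max_lat_us. */
    for (state = 0; state < cpuidle_state_count(cpu); state++)
            if (cpuidle_state_latency(cpu, state) > max_lat_us)
                    cpuidle_state_disable(cpu, state, 1);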

My opinion is that this proposed feature should not replace either
--dma-latency nor --deepest-idle-state. For the former, there might be
systems which have /dev/cpu_dma_latency but don't have a cpuidle
implementation; for the latter, in many cases the user will want to set
the idle state rather than the latency itself.

Tomas Glozar (6):
  tools/build: Add libcpupower dependency detection
  rtla: Add optional dependency on libcpupower
  rtla/utils: Add idle state disabling via libcpupower
  rtla/timerlat: Add --deepest-idle-state for top
  rtla/timerlat: Add --deepest-idle-state for hist
  rtla: Documentation: Mention --deepest-idle-state

 .../tools/rtla/common_timerlat_options.rst|   8 +
 tools/build/Makefile.feature  |   1 +
 tools/build/feature/Makefile  |   4 +
 tools/tracing/rtla/Makefile   |   2 +
 tools/tracing/rtla/Makefile.config|  10 ++
 tools/tracing/rtla/README.txt |   4 +
 tools/tracing/rtla/src/timerlat_hist.c|  46 +-
 tools/tracing/rtla/src/timerlat_top.c |  46 +-
 tools/tracing/rtla/src/utils.c| 140 ++
 tools/tracing/rtla/src/utils.h|   6 +
 10 files changed, 265 insertions(+), 2 deletions(-)

-- 
2.45.2




[PATCH v2 1/6] tools/build: Add libcpupower dependency detection

2024-07-31 Thread tglozar
From: Tomas Glozar 

Add the ability to detect the presence of libcpupower on a system to
the Makefiles in tools/build.

Signed-off-by: Tomas Glozar 
---
 tools/build/Makefile.feature | 1 +
 tools/build/feature/Makefile | 4 
 2 files changed, 5 insertions(+)

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 1e2ab148d5db..e4fb0a1fbddf 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -53,6 +53,7 @@ FEATURE_TESTS_BASIC :=  \
 libslang-include-subdir \
 libtraceevent   \
 libtracefs  \
+libcpupower \
 libcrypto   \
 libunwind   \
 pthread-attr-setaffinity-np \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 489cbed7e82a..c4a78333660b 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -38,6 +38,7 @@ FILES=  \
  test-libslang.bin  \
  test-libslang-include-subdir.bin   \
  test-libtraceevent.bin \
+ test-libcpupower.bin   \
  test-libtracefs.bin\
  test-libcrypto.bin \
  test-libunwind.bin \
@@ -212,6 +213,9 @@ $(OUTPUT)test-libslang-include-subdir.bin:
 $(OUTPUT)test-libtraceevent.bin:
$(BUILD) -ltraceevent
 
+$(OUTPUT)test-libcpupower.bin:
+   $(BUILD) -lcpupower
+
 $(OUTPUT)test-libtracefs.bin:
 $(BUILD) $(shell $(PKG_CONFIG) --cflags libtracefs 2>/dev/null) -ltracefs
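
Note that $(BUILD) compiles a matching tools/build/feature/test-libcpupower.c;
a minimal probe could look like this (a sketch assuming libcpupower's
cpuidle.h API, not necessarily the file shipped with the series):

    // SPDX-License-Identifier: GPL-2.0
    #include <cpuidle.h>

    int main(void)
    {
            return cpuidle_state_count(0);
    }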
 
-- 
2.45.2




[PATCH v2 2/6] rtla: Add optional dependency on libcpupower

2024-07-31 Thread tglozar
From: Tomas Glozar 

If libcpupower is present, set HAVE_LIBCPUPOWER_SUPPORT macro to allow
features depending on libcpupower in rtla.

Signed-off-by: Tomas Glozar 
---
 tools/tracing/rtla/Makefile|  2 ++
 tools/tracing/rtla/Makefile.config | 10 ++
 2 files changed, 12 insertions(+)

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index b5878be36125..a6a7dee16622 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -32,8 +32,10 @@ DOCSRC   := ../../../Documentation/tools/rtla/
 
 FEATURE_TESTS  := libtraceevent
 FEATURE_TESTS  += libtracefs
+FEATURE_TESTS  += libcpupower
 FEATURE_DISPLAY:= libtraceevent
 FEATURE_DISPLAY+= libtracefs
+FEATURE_DISPLAY+= libcpupower
 
 ifeq ($(V),1)
   Q=
diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config
index 0b7ecfb30d19..5f6f537c9728 100644
--- a/tools/tracing/rtla/Makefile.config
+++ b/tools/tracing/rtla/Makefile.config
@@ -42,6 +42,16 @@ else
   $(info libtracefs is missing. Please install libtracefs-dev/libtracefs-devel)
 endif
 
+$(call feature_check,libcpupower)
+ifeq ($(feature-libcpupower), 1)
+  $(call detected,CONFIG_LIBCPUPOWER)
+  $(call lib_setup,cpupower)
+  CFLAGS += -DHAVE_LIBCPUPOWER_SUPPORT
+else
  $(info libcpupower is missing, building without --deepest-idle-state support.)
+  $(info Please install libcpupower-dev/kernel-tools-libs-devel)
+endif
+
 ifeq ($(STOP_ERROR),1)
   $(error Please, check the errors above.)
 endif
-- 
2.45.2




[PATCH v2 4/6] rtla/timerlat: Add --deepest-idle-state for top

2024-07-31 Thread tglozar
From: Tomas Glozar 

Add option to limit deepest idle state on CPUs where timerlat is running
for the duration of the workload.

Signed-off-by: Tomas Glozar 
---
 tools/tracing/rtla/src/timerlat_top.c | 46 ++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 8c16419fe22a..ef1d3affef95 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -48,6 +48,7 @@ struct timerlat_top_params {
int pretty_output;
int warmup;
int buffer_size;
+   int deepest_idle_state;
cpu_set_t   hk_cpu_set;
struct sched_attr   sched_param;
struct trace_events *events;
@@ -447,7 +448,7 @@ static void timerlat_top_usage(char *usage)
	"",
	"  usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\",
	" [[-t[file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\",
-	" [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u|-k] [--warm-up s]",
+	" [-P priority] [--dma-latency us] [--aa-only us] [-C[=cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]",
	"",
	" -h/--help: print this menu",
	" -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit",
@@ -481,6 +482,7 @@ static void timerlat_top_usage(char *usage)
	" -U/--user-load: enable timerlat for user-defined user-space workload",
	"--warm-up s: let the workload run for s seconds before collecting data",
	"--trace-buffer-size kB: set the per-cpu trace buffer size in kB",
+	"--deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency",
	NULL,
};
 
@@ -518,6 +520,9 @@ static struct timerlat_top_params
/* disabled by default */
params->dma_latency = -1;
 
+   /* disabled by default */
+   params->deepest_idle_state = -2;
+
/* display data in microseconds */
params->output_divisor = 1000;
 
@@ -550,6 +555,7 @@ static struct timerlat_top_params
{"aa-only", required_argument,  0, '5'},
{"warm-up", required_argument,  0, '6'},
{"trace-buffer-size",   required_argument,  0, '7'},
+   {"deepest-idle-state",  required_argument,  0, '8'},
{0, 0, 0, 0}
};
 
@@ -726,6 +732,9 @@ static struct timerlat_top_params
case '7':
params->buffer_size = get_llong_from_str(optarg);
break;
+   case '8':
+   params->deepest_idle_state = get_llong_from_str(optarg);
+   break;
default:
timerlat_top_usage("Invalid option");
}
@@ -922,6 +931,9 @@ int timerlat_top_main(int argc, char *argv[])
int return_value = 1;
char *max_lat;
int retval;
+#ifdef HAVE_LIBCPUPOWER_SUPPORT
+   int i;
+#endif /* HAVE_LIBCPUPOWER_SUPPORT */
 
params = timerlat_top_parse_args(argc, argv);
if (!params)
@@ -971,6 +983,26 @@ int timerlat_top_main(int argc, char *argv[])
}
}
 
+	if (params->deepest_idle_state >= -1) {
+#ifdef HAVE_LIBCPUPOWER_SUPPORT
+		for (i = 0; i < sysconf(_SC_NPROCESSORS_CONF); i++) {
+			if (params->cpus && !CPU_ISSET(i, &params->monitored_cpus))
+				continue;
+			if (save_cpu_idle_disable_state(i) < 0) {
+				err_msg("Could not save cpu idle state.\n");
+				goto out_free;
+			}
+			if (set_deepest_cpu_idle_state(i, params->deepest_idle_state) < 0) {
+				err_msg("Could not set deepest cpu idle state.\n");
+				goto out_free;
+			}
+		}
+#else
+		err_msg("rtla built without libcpupower, --deepest-idle-state is not supported\n");
+		goto out_free;
+#endif /* HAVE_LIBCPUPOWER_SUPPORT */
+	}
+
if (params->trace_output) {
record = osnoise_init_trace_tool("timerlat");
if (!record) {
@@ -1125,6 +1157,15 @@ int timerlat_top_main(int argc, char *argv[])
timerlat_aa_destroy();
if (dma_latency_fd >= 0)
close(dma_latency_fd);
+#ifdef HAVE_LIBCPUPOWE

[PATCH v2 3/6] rtla/utils: Add idle state disabling via libcpupower

2024-07-31 Thread tglozar
From: Tomas Glozar 

Add functions to utils.c to disable idle states through functions of
libcpupower. This will serve as the basis for disabling idle states
per cpu when running timerlat.
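
The intended call pattern (a sketch based on the helpers below; the cpu
number and deepest state are illustrative) is:

    unsigned int cpu = 0;

    /* Remember the current disable setting of every idle state. */
    if (save_cpu_idle_disable_state(cpu) < 0)
            return -1;

    /* Allow nothing deeper than idle state 1 while measuring. */
    if (set_deepest_cpu_idle_state(cpu, 1) < 0)
            return -1;

    /* ... run the measurement ... */

    /* Put the original settings back. */
    restore_cpu_idle_disable_state(cpu);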

Signed-off-by: Tomas Glozar 
---
 tools/tracing/rtla/src/utils.c | 140 +
 tools/tracing/rtla/src/utils.h |   6 ++
 2 files changed, 146 insertions(+)

diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 9ac71a66840c..9279b8ce08c3 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -4,6 +4,9 @@
  */
 
 #define _GNU_SOURCE
+#ifdef HAVE_LIBCPUPOWER_SUPPORT
+#include 
+#endif /* HAVE_LIBCPUPOWER_SUPPORT */
 #include 
 #include 
 #include 
@@ -519,6 +522,143 @@ int set_cpu_dma_latency(int32_t latency)
return fd;
 }
 
+#ifdef HAVE_LIBCPUPOWER_SUPPORT
+static unsigned int **saved_cpu_idle_disable_state;
+static size_t saved_cpu_idle_disable_state_alloc_ctr;
+
+/*
+ * save_cpu_idle_disable_state - save disable for all idle states of a cpu
+ *
+ * Saves the current disable of all idle states of a cpu, to be subsequently
+ * restored via restore_cpu_idle_disable_state.
+ *
+ * Return: idle state count on success, negative on error
+ */
+int save_cpu_idle_disable_state(unsigned int cpu)
+{
+   unsigned int nr_states;
+   unsigned int state;
+   int disabled;
+   int nr_cpus;
+
+   nr_states = cpuidle_state_count(cpu);
+
+   if (nr_states == 0)
+   return 0;
+
+   if (saved_cpu_idle_disable_state == NULL) {
+   nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+   saved_cpu_idle_disable_state = calloc(nr_cpus, sizeof(unsigned int *));
+   }
+
+   saved_cpu_idle_disable_state[cpu] = calloc(nr_states, sizeof(unsigned int));
+   saved_cpu_idle_disable_state_alloc_ctr++;
+
+   for (state = 0; state < nr_states; state++) {
+   disabled = cpuidle_is_state_disabled(cpu, state);
+   if (disabled < 0)
+   return disabled;
+   saved_cpu_idle_disable_state[cpu][state] = disabled;
+   }
+
+   return nr_states;
+}
+
+/*
+ * restore_cpu_idle_disable_state - restore disable for all idle states of a cpu
+ *
+ * Restores the current disable state of all idle states of a cpu that was
+ * previously saved by save_cpu_idle_disable_state.
+ *
+ * Return: idle state count on success, negative on error
+ */
+int restore_cpu_idle_disable_state(unsigned int cpu)
+{
+   unsigned int nr_states;
+   unsigned int state;
+   int disabled;
+   int result;
+
+   nr_states = cpuidle_state_count(cpu);
+
+   if (nr_states == 0)
+   return 0;
+
+   for (state = 0; state < nr_states; state++) {
+   disabled = saved_cpu_idle_disable_state[cpu][state];
+   result = cpuidle_state_disable(cpu, state, disabled);
+   if (result < 0)
+   return result;
+   }
+
+   free(saved_cpu_idle_disable_state[cpu]);
+   saved_cpu_idle_disable_state[cpu] = NULL;
+   saved_cpu_idle_disable_state_alloc_ctr--;
+   if (saved_cpu_idle_disable_state_alloc_ctr == 0) {
+   free(saved_cpu_idle_disable_state);
+   saved_cpu_idle_disable_state = NULL;
+   }
+
+   return nr_states;
+}
+
+/*
+ * free_cpu_idle_disable_states - free saved idle state disable for all cpus
+ *
+ * Frees the memory used for storing cpu idle state disable for all cpus
+ * and states.
+ *
+ * Normally, the memory is freed automatically in
+ * restore_cpu_idle_disable_state; this is mostly for cleaning up after an
+ * error.
+ */
+void free_cpu_idle_disable_states(void)
+{
+   int cpu;
+
+   if (!saved_cpu_idle_disable_state)
+   return;
+
+   for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) {
+   if (!saved_cpu_idle_disable_state[cpu])
+   continue;
+   free(saved_cpu_idle_disable_state[cpu]);
+   saved_cpu_idle_disable_state[cpu] = NULL;
+   }
+
+   free(saved_cpu_idle_disable_state);
+   saved_cpu_idle_disable_state = NULL;
+}
+
+/*
+ * set_deepest_cpu_idle_state - limit idle state of cpu
+ *
+ * Disables all idle states deeper than the one given in
+ * deepest_state (assuming states with higher number are deeper).
+ *
+ * This is used to reduce the exit from idle latency. Unlike
+ * set_cpu_dma_latency, it can disable idle states per cpu.
+ *
+ * Return: idle state count on success, negative on error
+ */
+int set_deepest_cpu_idle_state(unsigned int cpu, unsigned int deepest_state)
+{
+   unsigned int nr_states;
+   unsigned int state;
+   int result;
+
+   nr_states = cpuidle_state_count(cpu);
+
+   for (state = deepest_state + 1; state < nr_states; state++) {
+   result = cpuidle_state_disable(cpu, state, 1);
+   if (result < 0)
+   return result;
+   }
+
+   return nr

[PATCH v2 6/6] rtla: Documentation: Mention --deepest-idle-state

2024-07-31 Thread tglozar
From: Tomas Glozar 

Add --deepest-idle-state to manpage and mention libcpupower dependency
in README.txt.

Signed-off-by: Tomas Glozar 
---
 Documentation/tools/rtla/common_timerlat_options.rst | 8 
 tools/tracing/rtla/README.txt| 4 
 2 files changed, 12 insertions(+)

diff --git a/Documentation/tools/rtla/common_timerlat_options.rst b/Documentation/tools/rtla/common_timerlat_options.rst
index cef6651f1435..10dc802f8d65 100644
--- a/Documentation/tools/rtla/common_timerlat_options.rst
+++ b/Documentation/tools/rtla/common_timerlat_options.rst
@@ -31,6 +31,14 @@
 *cyclictest* sets this value to *0* by default, use **--dma-latency** *0* to have
 similar results.
 
+**--deepest-idle-state** *n*
+        Disable idle states higher than *n* for cpus that are running timerlat threads to
+        reduce exit from idle latencies. If *n* is -1, all idle states are disabled.
+        On exit from timerlat, the idle state setting is restored to its original state
+        before running timerlat.
+
+        Requires rtla to be built with libcpupower.
+
 **-k**, **--kernel-threads**
 
 Use timerlat kernel-space threads, in contrast of **-u**.
diff --git a/tools/tracing/rtla/README.txt b/tools/tracing/rtla/README.txt
index 4af3fd40f171..dd5621038c55 100644
--- a/tools/tracing/rtla/README.txt
+++ b/tools/tracing/rtla/README.txt
@@ -11,6 +11,7 @@ RTLA depends on the following libraries and tools:
 
  - libtracefs
  - libtraceevent
+ - libcpupower (optional, for --deepest-idle-state)
 
 It also depends on python3-docutils to compile man pages.
 
@@ -26,6 +27,9 @@ For development, we suggest the following steps for compiling rtla:
   $ make
   $ sudo make install
   $ cd ..
+  $ cd $libcpupower_src
+  $ make
+  $ sudo make install
   $ cd $rtla_src
   $ make
   $ sudo make install
-- 
2.45.2




[PATCH v2 5/6] rtla/timerlat: Add --deepest-idle-state for hist

2024-07-31 Thread tglozar
From: Tomas Glozar 

Support limiting deepest idle state also for timerlat-hist.

Signed-off-by: Tomas Glozar 
---
 tools/tracing/rtla/src/timerlat_hist.c | 46 +-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index a3907c390d67..41cf6a0535b4 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -55,6 +55,7 @@ struct timerlat_hist_params {
int entries;
int warmup;
int buffer_size;
+   int deepest_idle_state;
 };
 
 struct timerlat_hist_cpu {
@@ -655,7 +656,7 @@ static void timerlat_hist_usage(char *usage)
	" [-t[file]] [-e sys[:event]] [--filter ] [--trigger ] [-c cpu-list] [-H cpu-list]\\",
	" [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\",
	" [--no-index] [--with-zeros] [--dma-latency us] [-C[=cgroup_name]] [--no-aa] [--dump-task] [-u|-k]",
-	" [--warm-up s]",
+	" [--warm-up s] [--deepest-idle-state n]",
	"",
	" -h/--help: print this menu",
	" -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit",
@@ -695,6 +696,7 @@ static void timerlat_hist_usage(char *usage)
	" -U/--user-load: enable timerlat for user-defined user-space workload",
	"--warm-up s: let the workload run for s seconds before collecting data",
	"--trace-buffer-size kB: set the per-cpu trace buffer size in kB",
+	"--deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency",
	NULL,
};
 
@@ -732,6 +734,9 @@ static struct timerlat_hist_params
/* disabled by default */
params->dma_latency = -1;
 
+   /* disabled by default */
+   params->deepest_idle_state = -2;
+
/* display data in microseconds */
params->output_divisor = 1000;
params->bucket_size = 1;
@@ -772,6 +777,7 @@ static struct timerlat_hist_params
	{"dump-task",		no_argument,		0, '\1'},
	{"warm-up",		required_argument,	0, '\2'},
	{"trace-buffer-size",	required_argument,	0, '\3'},
+	{"deepest-idle-state",	required_argument,	0, '\4'},
{0, 0, 0, 0}
};
 
@@ -960,6 +966,9 @@ static struct timerlat_hist_params
case '\3':
params->buffer_size = get_llong_from_str(optarg);
break;
+   case '\4':
+   params->deepest_idle_state = get_llong_from_str(optarg);
+   break;
default:
timerlat_hist_usage("Invalid option");
}
@@ -1152,6 +1161,9 @@ int timerlat_hist_main(int argc, char *argv[])
int return_value = 1;
pthread_t timerlat_u;
int retval;
+#ifdef HAVE_LIBCPUPOWER_SUPPORT
+   int i;
+#endif /* HAVE_LIBCPUPOWER_SUPPORT */
 
params = timerlat_hist_parse_args(argc, argv);
if (!params)
@@ -1201,6 +1213,26 @@ int timerlat_hist_main(int argc, char *argv[])
}
}
 
+	if (params->deepest_idle_state >= -1) {
+#ifdef HAVE_LIBCPUPOWER_SUPPORT
+		for (i = 0; i < sysconf(_SC_NPROCESSORS_CONF); i++) {
+			if (params->cpus && !CPU_ISSET(i, &params->monitored_cpus))
+				continue;
+			if (save_cpu_idle_disable_state(i) < 0) {
+				err_msg("Could not save cpu idle state.\n");
+				goto out_free;
+			}
+			if (set_deepest_cpu_idle_state(i, params->deepest_idle_state) < 0) {
+				err_msg("Could not set deepest cpu idle state.\n");
+				goto out_free;
+			}
+		}
+#else
+		err_msg("rtla built without libcpupower, --deepest-idle-state is not supported\n");
+		goto out_free;
+#endif /* HAVE_LIBCPUPOWER_SUPPORT */
+	}
+
if (params->trace_output) {
record = osnoise_init_trace_tool("timerlat");
if (!record) {
@@ -1332,6 +1364,15 @@ int timerlat_hist_main(int argc, char *argv[])
timerlat_aa_destroy();
if (dma_latency_fd >= 0)
close(dma_latency_fd);
+#ifdef HAVE_LIBCPUPOWER_SUPPORT
+   if (params->deepest_idle_state >= -1) {
+   for (i = 0; i < sysconf(_SC_NPROCESSORS_CONF); i++) {
+

Re: [PATCH 0/3] uprobes: simplify _unregister paths

2024-07-31 Thread Google
On Tue, 30 Jul 2024 14:34:21 +0200
Oleg Nesterov  wrote:

> On top of
> 
>   [PATCH v2 0/5] uprobes: misc cleanups/simplifications
>   https://lore.kernel.org/all/2024072913.ga12...@redhat.com/
> 
> I sent yesterday.
> 

OK, this series looks good to me.

Acked-by: Masami Hiramatsu (Google) 

Thanks,

> Oleg.
> ---
> 
>  kernel/events/uprobes.c | 47 ---
>  1 file changed, 24 insertions(+), 23 deletions(-)
> 


-- 
Masami Hiramatsu (Google) 



[PATCH] uprobes: Remove redundant spinlock in uprobe_deny_signal

2024-07-31 Thread Liao Chang
Since clearing a bit in thread_info is an atomic operation, the spinlock
is redundant and can be removed; reducing lock contention is good for
performance.
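
For reference, the helper resolves to an atomic bit operation (a sketch
of the generic implementations):

    /* include/linux/sched.h */
    static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
    {
            clear_ti_thread_flag(task_thread_info(tsk), flag);
    }

    /* include/linux/thread_info.h */
    static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
    {
            clear_bit(flag, (unsigned long *)&ti->flags);
    }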

Signed-off-by: Liao Chang 
---
 kernel/events/uprobes.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 73cc47708679..76a51a1f51e2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1979,9 +1979,7 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
 
if (task_sigpending(t)) {
-   spin_lock_irq(&t->sighand->siglock);
clear_tsk_thread_flag(t, TIF_SIGPENDING);
-   spin_unlock_irq(&t->sighand->siglock);
 
if (__fatal_signal_pending(t) || 
arch_uprobe_xol_was_trapped(t)) {
utask->state = UTASK_SSTEP_TRAPPED;
-- 
2.34.1




[PATCH] LoongArch: Revert qspinlock to test-and-set on VMs

2024-07-31 Thread Bibo Mao
Similar to x86, when a VM is detected, revert to a simple test-and-set
lock to avoid the horrors of queue preemption.

Tested on a 3C5000 dual-way machine with 32 cores and 2 NUMA nodes;
the test case is kcbench on kernel mainline 6.10, the detailed command
is "kcbench --src /root/src/linux".

Performance on the host machine:
                    kernel compile time    performance impact
   Original         150.29 seconds
   With patch       150.19 seconds         almost no impact

Performance on virtual machines:
1. 1 VM with 32 vCPUs and 2 NUMA nodes, NUMA nodes pinned
                    kernel compile time    performance impact
   Original         170.87 seconds
   With patch       171.73 seconds         almost no impact

2. 2 VMs, each with 32 vCPUs and 2 NUMA nodes, NUMA nodes pinned
                    kernel compile time    performance impact
   Original         2362.04 seconds
   With patch       354.73 seconds         +565%

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/Kbuild  |  1 -
 arch/loongarch/include/asm/paravirt.h  |  3 ++
 arch/loongarch/include/asm/qspinlock.h | 41 ++
 arch/loongarch/kernel/paravirt.c   |  9 ++
 arch/loongarch/kernel/setup.c  |  5 
 arch/loongarch/kernel/smp.c|  2 ++
 6 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 arch/loongarch/include/asm/qspinlock.h

diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 2bb3676429c0..4635b755b2b4 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -6,7 +6,6 @@ generic-y += mcs_spinlock.h
 generic-y += parport.h
 generic-y += early_ioremap.h
 generic-y += qrwlock.h
-generic-y += qspinlock.h
 generic-y += user.h
 generic-y += ioctl.h
 generic-y += statfs.h
diff --git a/arch/loongarch/include/asm/paravirt.h b/arch/loongarch/include/asm/paravirt.h
index dddec49671ae..dcc2b46d31fe 100644
--- a/arch/loongarch/include/asm/paravirt.h
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -19,6 +19,7 @@ static inline u64 paravirt_steal_clock(int cpu)
 
 int __init pv_ipi_init(void);
 int __init pv_time_init(void);
+void __init pv_spinlock_init(void);
 
 #else
 
@@ -31,5 +32,7 @@ static inline int pv_time_init(void)
 {
return 0;
 }
+
+static inline void pv_spinlock_init(void) { }
 #endif // CONFIG_PARAVIRT
 #endif
diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h
new file mode 100644
index ..b2d53b8c6679
--- /dev/null
+++ b/arch/loongarch/include/asm/qspinlock.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_QSPINLOCK_H
+#define _ASM_LOONGARCH_QSPINLOCK_H
+
+#include 
+#include 
+
+#ifdef CONFIG_PARAVIRT
+
+DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
+
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
+{
+   int val;
+
+   if (static_branch_likely(&virt_spin_lock_key))
+   return false;
+
+   /*
+* On hypervisors without PARAVIRT_SPINLOCKS support we fall
+* back to a Test-and-Set spinlock, because fair locks have
+* horrible lock 'holder' preemption issues.
+*/
+
+__retry:
+   val = atomic_read(&lock->val);
+
+   if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+   cpu_relax();
+   goto __retry;
+   }
+
+   return true;
+}
+
+#endif /* CONFIG_PARAVIRT */
+
+#include 
+
+#endif // _ASM_LOONGARCH_QSPINLOCK_H
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
index 9c9b75b76f62..49ebc54bdbcb 100644
--- a/arch/loongarch/kernel/paravirt.c
+++ b/arch/loongarch/kernel/paravirt.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 
+DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
 static int has_steal_clock;
 struct static_key paravirt_steal_enabled;
 struct static_key paravirt_steal_rq_enabled;
@@ -300,3 +301,11 @@ int __init pv_time_init(void)
 
return 0;
 }
+
+void __init pv_spinlock_init(void)
+{
+   if (!cpu_has_hypervisor)
+   return;
+
+   static_branch_disable(&virt_spin_lock_key);
+}
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 0f0740f0be27..70a670efe3cf 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -599,6 +599,11 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
reserve_initrd_mem();
 
+   /*
+* Initialise the static keys early as they may be enabled by the
+* cpufeature code and early parameters.
+*/
+   jump_label_init();
platform_init();
arch_mem_init(cmdline_p);
 
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index ca405ab86aae..f499bff1050b 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -509,6 +509,8 @@ void smp_prepare_boot_cpu(void)
  

Re: [PATCH v3 02/11] arm64: dts: qcom: sm6115-pro1x: Add PCA9534 IO Expander

2024-07-31 Thread Caleb Connolly




On 31/07/2024 08:18, Dang Huynh wrote:

F(x)tec Pro1X comes with PCA9534 IO Expander, it is used for enabling
touch screen VDD/VDDIO and keyboard's caps lock LED.

Reviewed-by: Konrad Dybcio 
Signed-off-by: Dang Huynh 
---
  arch/arm64/boot/dts/qcom/sm6115-fxtec-pro1x.dts | 21 +
  1 file changed, 21 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm6115-fxtec-pro1x.dts b/arch/arm64/boot/dts/qcom/sm6115-fxtec-pro1x.dts
index 70f479a63f2e..47e446249af6 100644
--- a/arch/arm64/boot/dts/qcom/sm6115-fxtec-pro1x.dts
+++ b/arch/arm64/boot/dts/qcom/sm6115-fxtec-pro1x.dts
@@ -70,6 +70,23 @@ &dispcc {
status = "disabled";
  };
  
+&gpi_dma0 {
+   status = "okay";
+};
+
+&i2c1 {
+   status = "okay";
+   /* Clock frequency was not specified downstream, let's park it to 100 KHz */


This is the default, so you can drop this comment.

Reviewed-by: Caleb Connolly 

+   clock-frequency = <100000>;
+
+   pca9534: gpio@21 {
+   compatible = "nxp,pca9534";
+   reg = <0x21>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   };
+};
+
  &pm6125_gpios {
vol_up_n: vol-up-n-state {
pins = "gpio5";
@@ -89,6 +106,10 @@ &pon_resin {
status = "okay";
  };
  
+&qupv3_id_0 {
+   status = "okay";
+};
+
  &rpm_requests {
regulators-0 {
compatible = "qcom,rpm-pm6125-regulators";



--
// Caleb (they/them)



[PATCH v2 0/2] Support multiple reserved memory regions

2024-07-31 Thread Shun-yi Wang
From: "shun-yi.wang" 

Besides the reserved memory region for SCP, there are additional 
reserved memory regions for specific hardware use.
Currently, only a single memory region is supported.
Modifications are made to support multiple memory regions.

Changes in v2:
 - Modify description of memory region in dt-bindings document
 - Address v1 comments: fix the initial value and change '!i' to 'i == 0'
 - Link to v1: 
https://lore.kernel.org/all/20240703115308.17436-1-shun-yi.w...@mediatek.com

shun-yi.wang (2):
  dt-bindings: remoteproc: Support multiple reserved memory regions
  remoteproc: mediatek: Support multiple reserved memory regions

 .../bindings/remoteproc/mtk,scp.yaml  |  8 --
 drivers/remoteproc/mtk_scp.c  | 27 ---
 2 files changed, 24 insertions(+), 11 deletions(-)

-- 
2.18.0




[PATCH v2 1/2] dt-bindings: remoteproc: Support multiple reserved memory regions

2024-07-31 Thread Shun-yi Wang
From: "shun-yi.wang" 

Remove the maximum number of 1 for memory regions.
Instead, add some descriptions to ensure the integrity
of the documentation.

Signed-off-by: shun-yi.wang 
---
 Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml 
b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
index d05d1563ec19..3362c8ffdccc 100644
--- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
+++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
@@ -55,7 +55,9 @@ properties:
   initializing SCP.
 
   memory-region:
-maxItems: 1
+description:
+  List of phandles to the reserved memory nodes used by
+  remoteproc devices.
 
   cros-ec-rpmsg:
 $ref: /schemas/mfd/google,cros-ec.yaml
@@ -123,7 +125,9 @@ patternProperties:
   initializing sub cores of multi-core SCP.
 
   memory-region:
-maxItems: 1
+description:
+  List of phandles to the reserved memory nodes used by
+  remoteproc devices.
 
   cros-ec-rpmsg:
 $ref: /schemas/mfd/google,cros-ec.yaml
-- 
2.18.0




[PATCH v2 2/2] remoteproc: mediatek: Support multiple reserved memory regions

2024-07-31 Thread Shun-yi Wang
From: "shun-yi.wang" 

SCP supports multiple reserved memory regions, intended for
specific hardware.

Signed-off-by: shun-yi.wang 
---
 drivers/remoteproc/mtk_scp.c | 27 ++-
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c
index 857a61760b27..0e799d0bf73d 100644
--- a/drivers/remoteproc/mtk_scp.c
+++ b/drivers/remoteproc/mtk_scp.c
@@ -1006,20 +1006,29 @@ EXPORT_SYMBOL_GPL(scp_mapping_dm_addr);
 
 static int scp_map_memory_region(struct mtk_scp *scp)
 {
-   int ret;
const struct mtk_scp_sizes_data *scp_sizes;
+   struct device_node *node = scp->dev->of_node;
+   struct of_phandle_iterator it;
+   int ret, err;
+   int i = 0;
 
-   ret = of_reserved_mem_device_init(scp->dev);
+   of_for_each_phandle(&it, err, node, "memory-region", NULL, 0) {
+   ret = of_reserved_mem_device_init_by_idx(scp->dev, node, i);
 
-   /* reserved memory is optional. */
-   if (ret == -ENODEV) {
-   dev_info(scp->dev, "skipping reserved memory initialization.");
-   return 0;
+   if (ret) {
+   dev_err(scp->dev, "failed to assign memory-region: %s\n",
+   it.node->name);
+   of_node_put(it.node);
+   return -ENOMEM;
+   }
+
+   i++;
}
 
-   if (ret) {
-   dev_err(scp->dev, "failed to assign memory-region: %d\n", ret);
-   return -ENOMEM;
+   /* reserved memory is optional. */
+   if (i == 0) {
+   dev_dbg(scp->dev, "skipping reserved memory initialization.");
+   return 0;
}
 
/* Reserved SCP code size */
-- 
2.18.0




Re: [PATCH] x86/cpufeatures: SGX: Adjust the error message when BIOS does not support SGX

2024-07-31 Thread Huang, Kai
On Wed, 2024-07-31 at 11:22 +0800, WangYuli wrote:
> On 2024/7/30 19:57, Huang, Kai wrote:
> 
> > +linux-sgx list, Jarkko, Haitao.
> > 
> > This message is only printed when SGX is reported in CPUID but is not
> > enabled in the FEAT_CTL MSR.  I can only recall this can happen when the
> > BIOS actually provides an option for the user to turn on/off SGX, in
> > which case the current message is correct.
> > 
> > I could be wrong, but I don't recall I have met any machine that doesn't
> > have any SGX option in the BIOS but still reports SGX in the CPUID.  Can
> > you confirm this is the case?
> 
> Sure.
> 
> For example, the Lenovo ThinkPad T480s, whose compliance ID is TP00092A.
> 
> 

Fair enough.  I guess the updated message is slightly better:

Acked-by: Kai Huang 

Btw, I think there are some issues in the patch title/changelog.  Please
fix them.

E.g., I think the format of patch title should be:

x86/cpu: ...

In the changelog, please avoid using "we", and please use the imperative
mood to describe the change. For more information, please see:

https://docs.kernel.org/process/maintainer-tip.html
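
For illustration only, a subject along these lines would do (the exact
wording is just an example):

  x86/cpu: Clarify the SGX error message when BIOS support is absent

and a changelog body phrased imperatively, e.g. "Update the message so
that readers of the kernel log can tell ...", without "we".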


[PATCH] nvdimm/pmem: Set dax flag for all 'PFN_MAP' cases

2024-07-31 Thread Zhihao Cheng
Since commit f467fee48da4 ("block: move the dax flag to queue_limits"),
dax is only supported on pfn-type pmem devices. Fix it by also setting
the dax flag for the missed case.

Fixes: f467fee48da4 ("block: move the dax flag to queue_limits")
Signed-off-by: Zhihao Cheng 
---
 drivers/nvdimm/pmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 1ae8b2351654..210fb77f51ba 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -498,7 +498,7 @@ static int pmem_attach_disk(struct device *dev,
}
if (fua)
lim.features |= BLK_FEAT_FUA;
-   if (is_nd_pfn(dev))
+   if (is_nd_pfn(dev) || pmem_should_map_pages(dev))
lim.features |= BLK_FEAT_DAX;
 
if (!devm_request_mem_region(dev, res->start, resource_size(res),
-- 
2.39.2




Re: [PATCH v2 1/2] dt-bindings: remoteproc: Support multiple reserved memory regions

2024-07-31 Thread Krzysztof Kozlowski
On 31/07/2024 14:17, Shun-yi Wang wrote:
> From: "shun-yi.wang" 
> 
> Remove the maximum number of 1 for memory regions.

Why?

> Instead, add some descriptions to ensure the integrity
> of the documentation.

What? How is this related?

> 
> Signed-off-by: shun-yi.wang 
> ---
>  Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml | 8 ++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml 
> b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
> index d05d1563ec19..3362c8ffdccc 100644
> --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
> +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
> @@ -55,7 +55,9 @@ properties:
>initializing SCP.
>  
>memory-region:
> -maxItems: 1

No, no, no. Bindings must be specific/constrainted.

> +description:
> +  List of phandles to the reserved memory nodes used by
> +  remoteproc devices.

No, drop, it's entirely redundant and pointless. You did not add any new
information. This is always a list, always phandles and always reserved
memory regions. So what does it bring?

Please do not upstream random junk from your downstream kernel. :(

Best regards,
Krzysztof




[PATCH net-next v12 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-07-31 Thread Yunsheng Lin
Currently the page_frag API returns a 'virtual address'
or 'va' when allocating and expects a 'virtual address' or
'va' as input when freeing.

As we are about to support new use cases where the caller
needs to deal with 'struct page', or with both 'va' and
'struct page', add a '_va' suffix to the corresponding API,
mirroring the page_pool_alloc_va() API of the page_pool, in
order to differentiate the API handling between 'va' and
'struct page'. Callers expecting to deal with va, page, or
both va and page may then call the page_frag_alloc_va*,
page_frag_alloc_pg*, or page_frag_alloc* API accordingly.
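
I.e., after this patch the va-based entry points look like (illustrative
prototypes only; see the diff below for the full list):

	void *page_frag_alloc_va(struct page_frag_cache *nc,
				 unsigned int fragsz, gfp_t gfp_mask);
	void page_frag_free_va(void *addr);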

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
Reviewed-by: Subbaraya Sundeep 
---
 drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
 .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
 drivers/nvme/host/tcp.c   |  8 +++
 drivers/nvme/target/tcp.c | 22 +--
 drivers/vhost/net.c   |  6 ++---
 include/linux/page_frag_cache.h   | 21 +-
 include/linux/skbuff.h|  2 +-
 kernel/bpf/cpumap.c   |  2 +-
 mm/page_frag_cache.c  | 12 +-
 mm/page_frag_test.c   | 13 ++-
 net/core/skbuff.c | 14 ++--
 net/core/xdp.c|  2 +-
 net/rxrpc/txbuf.c | 15 +++--
 net/sunrpc/svcsock.c  |  6 ++---
 19 files changed, 74 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
index acb73d4d0de6..b6c10100e462 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
 
total_len = headroom + SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-   frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+   frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
if (!frame) {
u64_stats_update_begin(&rx->statss);
rx->xdp_alloc_fails++;
@@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
 
err = xdp_do_redirect(dev, &new, xdp_prog);
if (err)
-   page_frag_free(frame);
+   page_frag_free_va(frame);
 
return err;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8bb743f78fcb..399b317c509d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
dev_kfree_skb_any(tx_buf->skb);
break;
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame(tx_buf->xdpf);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index feba314a3fe4..6379f57d8228 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
  * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
  * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
  * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
- * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
  * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
  * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
  */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 2719f0e20933..a1a41a14df0d 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf *tx_buf,
 
switch (tx_buf->type) {
case ICE_TX_BUF_XDP_TX:
-   page_frag_free(tx_buf->raw_buf);
+   page_frag_free_va(tx_buf->raw_buf);
break;
case ICE_TX_BUF_XDP_XMIT:
xdp_return_frame_bulk(tx_buf->xdpf, bq);
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drive

[PATCH net-next v12 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-07-31 Thread Yunsheng Lin
Use the appropriate page_frag API instead of callers accessing
'page_frag_cache' directly.

CC: Alexander Duyck 
Signed-off-by: Yunsheng Lin 
Reviewed-by: Alexander Duyck 
---
 drivers/vhost/net.c |  2 +-
 include/linux/page_frag_cache.h | 10 ++
 mm/page_frag_test.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/rxrpc/conn_object.c |  4 +---
 net/rxrpc/local_object.c|  4 +---
 net/sunrpc/svcsock.c|  6 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6691fac01e0d..b2737dc0dc50 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
-   n->pf_cache.va = NULL;
+   page_frag_cache_init(&n->pf_cache);
 
return 0;
 }
diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
index ef038a07925c..7c9125a9aed3 100644
--- a/include/linux/page_frag_cache.h
+++ b/include/linux/page_frag_cache.h
@@ -7,6 +7,16 @@
 #include 
 #include 
 
+static inline void page_frag_cache_init(struct page_frag_cache *nc)
+{
+   nc->va = NULL;
+}
+
+static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
+{
+   return !!nc->pfmemalloc;
+}
+
 void page_frag_cache_drain(struct page_frag_cache *nc);
 void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
index 9eaa3ab74b29..6df8d8865afe 100644
--- a/mm/page_frag_test.c
+++ b/mm/page_frag_test.c
@@ -344,7 +344,7 @@ static int __init page_frag_test_init(void)
u64 duration;
int ret;
 
-   test_frag.va = NULL;
+   page_frag_cache_init(&test_frag);
atomic_set(&nthreads, 2);
init_completion(&wait);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4b8acd967793..76a473b1072d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -749,14 +749,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
 
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc_va(nc, len, gfp_mask);
-   pfmemalloc = nc->pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
 
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
@@ -846,7 +846,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
len = SKB_HEAD_ALIGN(len);
 
data = page_frag_alloc_va(&nc->page, len, gfp_mask);
-   pfmemalloc = nc->page.pfmemalloc;
+   pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 1539d315afe7..694c4df7a1a3 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
 */
rxrpc_purge_queue(&conn->rx_queue);
 
-   if (conn->tx_data_alloc.va)
-   __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
-   conn->tx_data_alloc.pagecnt_bias);
+   page_frag_cache_drain(&conn->tx_data_alloc);
call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
 }
 
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 504453c688d7..a8cffe47cf01 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
 #endif
rxrpc_purge_queue(&local->rx_queue);
rxrpc_purge_client_connections(local);
-   if (local->tx_alloc.va)
-   __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
-   local->tx_alloc.pagecnt_bias);
+   page_frag_cache_drain(&local->tx_alloc);
 }
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 42d20412c1c3..4b1e87187614 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1609,7 +1609,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 static void svc_sock_free(struct svc_xprt *xprt)
 {
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-   struct page_frag_cache *pfc = &svsk->sk_frag_cache;
struct socket *sock = svsk->sk_sock;
 
 

Re: [PATCH net-next v12 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-07-31 Thread Chuck Lever
On Wed, Jul 31, 2024 at 08:44:54PM +0800, Yunsheng Lin wrote:
> Currently the page_frag API returns a 'virtual address'
> or 'va' when allocating and expects a 'virtual address' or
> 'va' as input when freeing.
> 
> As we are about to support new use cases where the caller
> needs to deal with 'struct page', or with both 'va' and
> 'struct page', add a '_va' suffix to the corresponding API,
> mirroring the page_pool_alloc_va() API of the page_pool, in
> order to differentiate the API handling between 'va' and
> 'struct page'. Callers expecting to deal with va, page, or
> both va and page may then call the page_frag_alloc_va*,
> page_frag_alloc_pg*, or page_frag_alloc* API accordingly.
> 
> CC: Alexander Duyck 
> Signed-off-by: Yunsheng Lin 
> Reviewed-by: Subbaraya Sundeep 

For the net/sunrpc/svcsock.c hunk:

Acked-by: Chuck Lever 


> ---
>  drivers/net/ethernet/google/gve/gve_rx.c  |  4 ++--
>  drivers/net/ethernet/intel/ice/ice_txrx.c |  2 +-
>  drivers/net/ethernet/intel/ice/ice_txrx.h |  2 +-
>  drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  2 +-
>  .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |  4 ++--
>  .../marvell/octeontx2/nic/otx2_common.c   |  2 +-
>  drivers/net/ethernet/mediatek/mtk_wed_wo.c|  4 ++--
>  drivers/nvme/host/tcp.c   |  8 +++
>  drivers/nvme/target/tcp.c | 22 +--
>  drivers/vhost/net.c   |  6 ++---
>  include/linux/page_frag_cache.h   | 21 +-
>  include/linux/skbuff.h|  2 +-
>  kernel/bpf/cpumap.c   |  2 +-
>  mm/page_frag_cache.c  | 12 +-
>  mm/page_frag_test.c   | 13 ++-
>  net/core/skbuff.c | 14 ++--
>  net/core/xdp.c|  2 +-
>  net/rxrpc/txbuf.c | 15 +++--
>  net/sunrpc/svcsock.c  |  6 ++---
>  19 files changed, 74 insertions(+), 69 deletions(-)
> 
> diff --git a/drivers/net/ethernet/google/gve/gve_rx.c 
> b/drivers/net/ethernet/google/gve/gve_rx.c
> index acb73d4d0de6..b6c10100e462 100644
> --- a/drivers/net/ethernet/google/gve/gve_rx.c
> +++ b/drivers/net/ethernet/google/gve/gve_rx.c
> @@ -729,7 +729,7 @@ static int gve_xdp_redirect(struct net_device *dev, 
> struct gve_rx_ring *rx,
>  
>   total_len = headroom + SKB_DATA_ALIGN(len) +
>   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> - frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
> + frame = page_frag_alloc_va(&rx->page_cache, total_len, GFP_ATOMIC);
>   if (!frame) {
>   u64_stats_update_begin(&rx->statss);
>   rx->xdp_alloc_fails++;
> @@ -742,7 +742,7 @@ static int gve_xdp_redirect(struct net_device *dev, 
> struct gve_rx_ring *rx,
>  
>   err = xdp_do_redirect(dev, &new, xdp_prog);
>   if (err)
> - page_frag_free(frame);
> + page_frag_free_va(frame);
>  
>   return err;
>  }
> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c 
> b/drivers/net/ethernet/intel/ice/ice_txrx.c
> index 8bb743f78fcb..399b317c509d 100644
> --- a/drivers/net/ethernet/intel/ice/ice_txrx.c
> +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
> @@ -126,7 +126,7 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, 
> struct ice_tx_buf *tx_buf)
>   dev_kfree_skb_any(tx_buf->skb);
>   break;
>   case ICE_TX_BUF_XDP_TX:
> - page_frag_free(tx_buf->raw_buf);
> + page_frag_free_va(tx_buf->raw_buf);
>   break;
>   case ICE_TX_BUF_XDP_XMIT:
>   xdp_return_frame(tx_buf->xdpf);
> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h 
> b/drivers/net/ethernet/intel/ice/ice_txrx.h
> index feba314a3fe4..6379f57d8228 100644
> --- a/drivers/net/ethernet/intel/ice/ice_txrx.h
> +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
> @@ -148,7 +148,7 @@ static inline int ice_skb_pad(void)
>   * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
>   * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
>   * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
> - * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
> + * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free_va(), stats
>   * @ICE_TX_BUF_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame(), stats
>   * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
>   */
> diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
> b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> index 2719f0e20933..a1a41a14df0d 100644
> --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
> @@ -250,7 +250,7 @@ ice_clean_xdp_tx_buf(struct device *dev, struct 
> ice_tx_buf *tx_buf,
>  
>   switch (tx_buf->type) {
>   case ICE_TX_BUF_XDP_TX:
> 

Re: [PATCH net-next v12 05/14] mm: page_frag: avoid caller accessing 'page_frag_cache' directly

2024-07-31 Thread Chuck Lever
On Wed, Jul 31, 2024 at 08:44:55PM +0800, Yunsheng Lin wrote:
> Use the appropriate page_frag API instead of callers accessing
> 'page_frag_cache' directly.
> 
> CC: Alexander Duyck 
> Signed-off-by: Yunsheng Lin 
> Reviewed-by: Alexander Duyck 

For the net/sunrpc/svcsock.c hunk:

Acked-by: Chuck Lever 


> ---
>  drivers/vhost/net.c |  2 +-
>  include/linux/page_frag_cache.h | 10 ++
>  mm/page_frag_test.c |  2 +-
>  net/core/skbuff.c   |  6 +++---
>  net/rxrpc/conn_object.c |  4 +---
>  net/rxrpc/local_object.c|  4 +---
>  net/sunrpc/svcsock.c|  6 ++
>  7 files changed, 19 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 6691fac01e0d..b2737dc0dc50 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
> file *f)
>   vqs[VHOST_NET_VQ_RX]);
>  
>   f->private_data = n;
> - n->pf_cache.va = NULL;
> + page_frag_cache_init(&n->pf_cache);
>  
>   return 0;
>  }
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index ef038a07925c..7c9125a9aed3 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -7,6 +7,16 @@
>  #include 
>  #include 
>  
> +static inline void page_frag_cache_init(struct page_frag_cache *nc)
> +{
> + nc->va = NULL;
> +}
> +
> +static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
> +{
> + return !!nc->pfmemalloc;
> +}
> +
>  void page_frag_cache_drain(struct page_frag_cache *nc);
>  void __page_frag_cache_drain(struct page *page, unsigned int count);
>  void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
> diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
> index 9eaa3ab74b29..6df8d8865afe 100644
> --- a/mm/page_frag_test.c
> +++ b/mm/page_frag_test.c
> @@ -344,7 +344,7 @@ static int __init page_frag_test_init(void)
>   u64 duration;
>   int ret;
>  
> - test_frag.va = NULL;
> + page_frag_cache_init(&test_frag);
>   atomic_set(&nthreads, 2);
>   init_completion(&wait);
>  
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 4b8acd967793..76a473b1072d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -749,14 +749,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device 
> *dev, unsigned int len,
>   if (in_hardirq() || irqs_disabled()) {
>   nc = this_cpu_ptr(&netdev_alloc_cache);
>   data = page_frag_alloc_va(nc, len, gfp_mask);
> - pfmemalloc = nc->pfmemalloc;
> + pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
>   } else {
>   local_bh_disable();
>   local_lock_nested_bh(&napi_alloc_cache.bh_lock);
>  
>   nc = this_cpu_ptr(&napi_alloc_cache.page);
>   data = page_frag_alloc_va(nc, len, gfp_mask);
> - pfmemalloc = nc->pfmemalloc;
> + pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
>  
>   local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
>   local_bh_enable();
> @@ -846,7 +846,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, 
> unsigned int len)
>   len = SKB_HEAD_ALIGN(len);
>  
>   data = page_frag_alloc_va(&nc->page, len, gfp_mask);
> - pfmemalloc = nc->page.pfmemalloc;
> + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
>   }
>   local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
>  
> diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
> index 1539d315afe7..694c4df7a1a3 100644
> --- a/net/rxrpc/conn_object.c
> +++ b/net/rxrpc/conn_object.c
> @@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct 
> *work)
>*/
>   rxrpc_purge_queue(&conn->rx_queue);
>  
> - if (conn->tx_data_alloc.va)
> - __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va),
> - conn->tx_data_alloc.pagecnt_bias);
> + page_frag_cache_drain(&conn->tx_data_alloc);
>   call_rcu(&conn->rcu, rxrpc_rcu_free_connection);
>  }
>  
> diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
> index 504453c688d7..a8cffe47cf01 100644
> --- a/net/rxrpc/local_object.c
> +++ b/net/rxrpc/local_object.c
> @@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local)
>  #endif
>   rxrpc_purge_queue(&local->rx_queue);
>   rxrpc_purge_client_connections(local);
> - if (local->tx_alloc.va)
> - __page_frag_cache_drain(virt_to_page(local->tx_alloc.va),
> - local->tx_alloc.pagecnt_bias);
> + page_frag_cache_drain(&local->tx_alloc);
>  }
>  
>  /*
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 42d20412c1c3..4b1e87187614 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -1609,7 +1609,6 @@ st

Re: [PATCH v2 1/2] dt-bindings: remoteproc: Support multiple reserved memory regions

2024-07-31 Thread 王順億
Hi Krzysztof,

Thanks for the reviews.

On Wed, 2024-07-31 at 14:40 +0200, Krzysztof Kozlowski wrote:
>
> External email : Please do not click links or open attachments until
> you have verified the sender or the content.
>  On 31/07/2024 14:17, Shun-yi Wang wrote:
> > From: "shun-yi.wang" 
> > 
> > Remove the maximum number of 1 for memory regions.
> 
> Why?
> 

For future applications, MTK SCP will reserve multiple regions for
specific hardware use.

> > Instead, add some descriptions to ensure the integrity
> > of the documentation.
> 
> What? How is this related?
> 

My original thinking was to keep the memory-region option.
But currently, there is no maximum value limitation, so I
added some description. Should I just drop the description directly?

Best regards,
Shun-yi

> > 
> > Signed-off-by: shun-yi.wang 
> > ---
> >  Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml | 8
> ++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> > 
> > diff --git
> a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
> b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
> > index d05d1563ec19..3362c8ffdccc 100644
> > --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
> > +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml
> > @@ -55,7 +55,9 @@ properties:
> >initializing SCP.
> >  
> >memory-region:
> > -maxItems: 1
> 
> No, no, no. Bindings must be specific/constrainted.
> 
> > +description:
> > +  List of phandles to the reserved memory nodes used by
> > +  remoteproc devices.
> 
> No, drop, it's entirely redundant and pointless. You did not add any
> new
> information. This is always a list, always phandles and always
> reserved
> memory regions. So what does it bring?
> 
> Please do not upstream random junk from your downstream kernel. :(
> 
> Best regards,
> Krzysztof
> 


Re: [PATCH v2 1/2] dt-bindings: remoteproc: Support multiple reserved memory regions

2024-07-31 Thread Krzysztof Kozlowski
On 31/07/2024 15:41, Shun-Yi Wang (王順億) wrote:
> Hi Krzysztof,
> 
> Thanks for the reviews.
> 
> On Wed, 2024-07-31 at 14:40 +0200, Krzysztof Kozlowski wrote:
>>   
>> External email : Please do not click links or open attachments until
>> you have verified the sender or the content.
>>  On 31/07/2024 14:17, Shun-yi Wang wrote:
>>> From: "shun-yi.wang" 
>>>
>>> Remove the maximum number of 1 for memory regions.
>>
>> Why?
>>
> 
> For future applications, MTK SCP will reserve multiple regions for
> specific hardware use.

That's not a reason to drop the constraint on an entry.

> 
>>> Instead, add some descriptions to ensure the integrity
>>> of the documentation.
>>
>> What? How is this related?
>>
> 
> My original thinking was to keep the memory-region option.
> But currently, there is no maximum value limitation, so I
> added some description. Should I just drop the description directly?

Read all comments.

Best regards,
Krzysztof




Re: [PATCH] uprobes: Remove redundant spinlock in uprobe_deny_signal

2024-07-31 Thread Oleg Nesterov
On 07/31, Liao Chang wrote:
>
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -1979,9 +1979,7 @@ bool uprobe_deny_signal(void)
>   WARN_ON_ONCE(utask->state != UTASK_SSTEP);
>  
>   if (task_sigpending(t)) {
> - spin_lock_irq(&t->sighand->siglock);
>   clear_tsk_thread_flag(t, TIF_SIGPENDING);
> - spin_unlock_irq(&t->sighand->siglock);

Agreed, in this case ->siglock buys nothing, another signal can come
right after spin_unlock().

Acked-by: Oleg Nesterov 




Re: [PATCH v11 00/12] Initial Marvell PXA1908 support

2024-07-31 Thread Rob Herring (Arm)


On Tue, 30 Jul 2024 12:25:08 +0200, Duje Mihanović wrote:
> Hello,
> 
> This series adds initial support for the Marvell PXA1908 SoC and
> "samsung,coreprimevelte", a smartphone using the SoC.
> 
> USB works and the phone can boot a rootfs from an SD card, but there are
> some warnings in the dmesg:
> 
> During SMP initialization:
> [0.006519] CPU features: SANITY CHECK: Unexpected variation in 
> SYS_CNTFRQ_EL0. Boot CPU: 0x00018cba80, CPU1: 0x00
> [0.006542] CPU features: Unsupported CPU feature variation detected.
> [0.006589] CPU1: Booted secondary processor 0x01 [0x410fd032]
> [0.010710] Detected VIPT I-cache on CPU2
> [0.010716] CPU features: SANITY CHECK: Unexpected variation in 
> SYS_CNTFRQ_EL0. Boot CPU: 0x00018cba80, CPU2: 0x00
> [0.010758] CPU2: Booted secondary processor 0x02 [0x410fd032]
> [0.014849] Detected VIPT I-cache on CPU3
> [0.014855] CPU features: SANITY CHECK: Unexpected variation in 
> SYS_CNTFRQ_EL0. Boot CPU: 0x00018cba80, CPU3: 0x00
> [0.014895] CPU3: Booted secondary processor 0x03 [0x410fd032]
> 
> SMMU probing fails:
> [0.101798] arm-smmu c001.iommu: probing hardware configuration...
> [0.101809] arm-smmu c001.iommu: SMMUv1 with:
> [0.101816] arm-smmu c001.iommu: no translation support!
> 
> A 3.14 based Marvell tree is available on GitHub
> acorn-marvell/brillo_pxa_kernel, and a Samsung one on GitHub
> CoderCharmander/g361f-kernel.
> 
> Andreas Färber attempted to upstream support for this SoC in 2017:
> https://lore.kernel.org/lkml/20170222022929.10540-1-afaer...@suse.de/
> 
> Signed-off-by: Duje Mihanović 
> 
> Changes in v11:
> - Rebase on v6.11-rc1 (conflict with DTS Makefile), no changes
> - Link to v10: 
> https://lore.kernel.org/r/20240424-pxa1908-lkml-v10-0-36cdfb584...@skole.hr
> 
> Changes in v10:
> - Update trailers
> - Rebase on v6.9-rc5
> - Clock driver changes:
>   - Add a couple of forgotten clocks in APBC
> - The clocks are thermal_clk, ipc_clk, ssp0_clk, ssp2_clk and swjtag
> - The IDs and register offsets were already present, but I forgot to
>   actually register them
>   - Split each controller block into own file
>   - Drop unneeded -of in clock driver filenames
>   - Simplify struct pxa1908_clk_unit
>   - Convert to platform driver
>   - Add module metadata
> - DTS changes:
>   - Properly name pinctrl nodes
>   - Drop pinctrl #size-cells, #address-cells, ranges and #gpio-size-cells
>   - Fix pinctrl input-schmitt configuration
> - Link to v9: 
> https://lore.kernel.org/20240402-pxa1908-lkml-v9-0-25a003e83...@skole.hr
> 
> Changes in v9:
> - Update trailers and rebase on v6.9-rc2, no changes
> - Link to v8: 
> https://lore.kernel.org/20240110-pxa1908-lkml-v8-0-fea768a59...@skole.hr
> 
> Changes in v8:
> - Drop SSPA patch
> - Drop broken-cd from eMMC node
> - Specify S-Boot hardcoded initramfs location in device tree
> - Add ARM PMU node
> - Correct inverted modem memory base and size
> - Update trailers
> - Rebase on next-20240110
> - Link to v7: 
> https://lore.kernel.org/20231102-pxa1908-lkml-v7-0-cabb1a0cb...@skole.hr
>   and https://lore.kernel.org/20231102152033.5511-1-duje.mihano...@skole.hr
> 
> Changes in v7:
> - Suppress SND_MMP_SOC_SSPA on ARM64
> - Update trailers
> - Rebase on v6.6-rc7
> - Link to v6: 
> https://lore.kernel.org/r/20231010-pxa1908-lkml-v6-0-b2fe09240...@skole.hr
> 
> Changes in v6:
> - Address maintainer comments:
>   - Add "marvell,pxa1908-padconf" binding to pinctrl-single driver
> - Drop GPIO patch as it's been pulled
> - Update trailers
> - Rebase on v6.6-rc5
> - Link to v5: 
> https://lore.kernel.org/r/20230812-pxa1908-lkml-v5-0-a5d51937e...@skole.hr
> 
> Changes in v5:
> - Address maintainer comments:
>   - Move *_NR_CLKS to clock driver from dt binding file
> - Allocate correct number of clocks for each block instead of blindly
>   allocating 50 for each
> - Link to v4: 
> https://lore.kernel.org/r/20230807-pxa1908-lkml-v4-0-cb387d73b...@skole.hr
> 
> Changes in v4:
> - Address maintainer comments:
>   - Relicense clock binding file to BSD-2
> - Add pinctrl-names to SD card node
> - Add vgic registers to GIC node
> - Rebase on v6.5-rc5
> - Link to v3: 
> https://lore.kernel.org/r/20230804-pxa1908-lkml-v3-0-8e48fca37...@skole.hr
> 
> Changes in v3:
> - Address maintainer comments:
>   - Drop GPIO dynamic allocation patch
>   - Move clock register offsets into driver (instead of bindings file)
>   - Add missing Tested-by trailer to u32_fract patch
>   - Move SoC binding to arm/mrvl/mrvl.yaml
> - Add serial0 alias and stdout-path to board dts to enable UART
>   debugging
> - Rebase on v6.5-rc4
> - Link to v2: 
> https://lore.kernel.org/r/20230727162909.6031-1-duje.mihano...@skole.hr
> 
> Changes in v2:
> - Remove earlycon patch as it's been merged into tty-next
> - Address maintainer comments:
>   - Clarify GPIO regressions on older PXA platforms
>   - Add Fixes tag to co

[PATCH v2] x86/cpu: Adjust the error message when BIOS does not support SGX

2024-07-31 Thread WangYuli
When SGX is not supported by the BIOS, the kernel log still outputs
the error 'SGX disabled by BIOS', which can be confusing since
there might not be an SGX-related option in the BIOS settings.

From the kernel's point of view, it's difficult to distinguish between
the BIOS not supporting SGX at all and the BIOS supporting SGX but
having it disabled.

Therefore, update the error message to
'SGX disabled or unsupported by BIOS' to make it easier for those
reading kernel logs to understand what's happening.

Reported-by: Bo Wu 
Link: https://github.com/linuxdeepin/developer-center/issues/10032
Reviewed-by: Kai Huang 
Link: https://lore.kernel.org/all/a30f7700c7817b3e7e2f2bdb37d5c10a318b2c3b.ca...@intel.com/
Signed-off-by: Zelong Xiang 
Signed-off-by: WangYuli 
---
 arch/x86/kernel/cpu/feat_ctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 1640ae76548f..4a4118784c13 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -188,7 +188,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 update_sgx:
if (!(msr & FEAT_CTL_SGX_ENABLED)) {
if (enable_sgx_kvm || enable_sgx_driver)
-   pr_err_once("SGX disabled by BIOS.\n");
+   pr_err_once("SGX disabled or unsupported by BIOS.\n");
clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
-- 
2.43.4




Re: [PATCH v2 3/6] rtla/utils: Add idle state disabling via libcpupower

2024-07-31 Thread Steven Rostedt
On Wed, 31 Jul 2024 10:36:52 +0200
tglo...@redhat.com wrote:

> From: Tomas Glozar 
> 
> Add functions to utils.c to disable idle states through functions of
> libcpupower. This will serve as the basis for disabling idle states
> per cpu when running timerlat.
> 
> Signed-off-by: Tomas Glozar 
> ---
>  tools/tracing/rtla/src/utils.c | 140 +
>  tools/tracing/rtla/src/utils.h |   6 ++
>  2 files changed, 146 insertions(+)
> 
> diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
> index 9ac71a66840c..9279b8ce08c3 100644
> --- a/tools/tracing/rtla/src/utils.c
> +++ b/tools/tracing/rtla/src/utils.c
> @@ -4,6 +4,9 @@
>   */
>  
>  #define _GNU_SOURCE
> +#ifdef HAVE_LIBCPUPOWER_SUPPORT
> +#include 
> +#endif /* HAVE_LIBCPUPOWER_SUPPORT */
>  #include 
>  #include 
>  #include 
> @@ -519,6 +522,143 @@ int set_cpu_dma_latency(int32_t latency)
>   return fd;
>  }
>  
> +#ifdef HAVE_LIBCPUPOWER_SUPPORT
> +static unsigned int **saved_cpu_idle_disable_state;
> +static size_t saved_cpu_idle_disable_state_alloc_ctr;
> +
> +/*
> + * save_cpu_idle_state_disable - save disable for all idle states of a cpu
> + *
> + * Saves the current disable of all idle states of a cpu, to be subsequently
> + * restored via restore_cpu_idle_disable_state.
> + *
> + * Return: idle state count on success, negative on error
> + */
> +int save_cpu_idle_disable_state(unsigned int cpu)
> +{
> + unsigned int nr_states;
> + unsigned int state;
> + int disabled;
> + int nr_cpus;
> +
> + nr_states = cpuidle_state_count(cpu);
> +
> + if (nr_states == 0)
> + return 0;
> +
> + if (saved_cpu_idle_disable_state == NULL) {
> + nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
> + saved_cpu_idle_disable_state = calloc(nr_cpus, sizeof(unsigned 
> int *));

Need to check if the calloc failed and return an error if it did.
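
Something like this (untested):

	saved_cpu_idle_disable_state = calloc(nr_cpus,
					      sizeof(unsigned int *));
	if (!saved_cpu_idle_disable_state)
		return -ENOMEM;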

> + }
> +
> + saved_cpu_idle_disable_state[cpu] = calloc(nr_states, sizeof(unsigned 
> int));

Here too.

> + saved_cpu_idle_disable_state_alloc_ctr++;
> +
> + for (state = 0; state < nr_states; state++) {
> + disabled = cpuidle_is_state_disabled(cpu, state);
> + if (disabled < 0)
> + return disabled;

Hmm, should this warn if state is not zero and disabled is negative?

> + saved_cpu_idle_disable_state[cpu][state] = disabled;
> + }
> +
> + return nr_states;
> +}
> +
> +/*
> + * restore_cpu_idle_disable_state - restore disable for all idle states of a 
> cpu
> + *
> + * Restores the current disable state of all idle states of a cpu that was
> + * previously saved by save_cpu_idle_disable_state.
> + *
> + * Return: idle state count on success, negative on error
> + */
> +int restore_cpu_idle_disable_state(unsigned int cpu)
> +{
> + unsigned int nr_states;
> + unsigned int state;
> + int disabled;
> + int result;
> +

Should probably have a check to see if saved_cpu_idle_disable exists.

> + nr_states = cpuidle_state_count(cpu);
> +
> + if (nr_states == 0)
> + return 0;
> +

As well as saved_cpu_idle_disable_state[cpu] exists.

Just for robustness.
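
Something like this (untested; the exact error code is just an example):

	if (!saved_cpu_idle_disable_state ||
	    !saved_cpu_idle_disable_state[cpu])
		return -EINVAL;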

> + for (state = 0; state < nr_states; state++) {
> + disabled = saved_cpu_idle_disable_state[cpu][state];
> + result = cpuidle_state_disable(cpu, state, disabled);
> + if (result < 0)
> + return result;
> + }
> +
> + free(saved_cpu_idle_disable_state[cpu]);
> + saved_cpu_idle_disable_state[cpu] = NULL;
> + saved_cpu_idle_disable_state_alloc_ctr--;
> + if (saved_cpu_idle_disable_state_alloc_ctr == 0) {
> + free(saved_cpu_idle_disable_state);
> + saved_cpu_idle_disable_state = NULL;
> + }
> +
> + return nr_states;
> +}
> +
> +/*
> + * free_cpu_idle_disable_states - free saved idle state disable for all cpus
> + *
> + * Frees the memory used for storing cpu idle state disable for all cpus
> + * and states.
> + *
> + * Normally, the memory is freed automatically in
> + * restore_cpu_idle_disable_state; this is mostly for cleaning up after an
> + * error.
> + */
> +void free_cpu_idle_disable_states(void)
> +{
> + int cpu;
> +
> + if (!saved_cpu_idle_disable_state)
> + return;
> +
> + for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) {

> + if (!saved_cpu_idle_disable_state[cpu])
> + continue;

No need to check here. free() works fine with passing NULL to it.

-- Steve

> + free(saved_cpu_idle_disable_state[cpu]);
> + saved_cpu_idle_disable_state[cpu] = NULL;
> + }
> +
> + free(saved_cpu_idle_disable_state);
> + saved_cpu_idle_disable_state = NULL;
> +}
> +
> +/*
> + * set_deepest_cpu_idle_state - limit idle state of cpu
> + *
> + * Disables all idle states deeper than the one given in
> + * deepest_state (assuming states with higher number are deeper).
> + *
> + * 

Re: [PATCH v2 4/6] rtla/timerlat: Add --deepest-idle-state for top

2024-07-31 Thread Steven Rostedt
On Wed, 31 Jul 2024 10:36:53 +0200
tglo...@redhat.com wrote:

> From: Tomas Glozar 
> 
> Add option to limit deepest idle state on CPUs where timerlat is running
> for the duration of the workload.
> 
> Signed-off-by: Tomas Glozar 
> ---
>  tools/tracing/rtla/src/timerlat_top.c | 46 ++-
>  1 file changed, 45 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/tracing/rtla/src/timerlat_top.c 
> b/tools/tracing/rtla/src/timerlat_top.c
> index 8c16419fe22a..ef1d3affef95 100644
> --- a/tools/tracing/rtla/src/timerlat_top.c
> +++ b/tools/tracing/rtla/src/timerlat_top.c
> @@ -48,6 +48,7 @@ struct timerlat_top_params {
>   int pretty_output;
>   int warmup;
>   int buffer_size;
> + int deepest_idle_state;
>   cpu_set_t   hk_cpu_set;
>   struct sched_attr   sched_param;
>   struct trace_events *events;
> @@ -447,7 +448,7 @@ static void timerlat_top_usage(char *usage)
>   "",
>   "  usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] 
> [-n] [-p us] [-i us] [-T us] [-s us] \\",
>   " [[-t[file]] [-e sys[:event]] [--filter ] 
> [--trigger ] [-c cpu-list] [-H cpu-list]\\",
> - " [-P priority] [--dma-latency us] [--aa-only us] 
> [-C[=cgroup_name]] [-u|-k] [--warm-up s]",
> + " [-P priority] [--dma-latency us] [--aa-only us] 
> [-C[=cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]",
>   "",
>   " -h/--help: print this menu",
>   " -a/--auto: set automatic trace mode, stopping the 
> session if argument in us latency is hit",
> @@ -481,6 +482,7 @@ static void timerlat_top_usage(char *usage)
>   " -U/--user-load: enable timerlat for user-defined 
> user-space workload",
>   "--warm-up s: let the workload run for s seconds 
> before collecting data",
>   "--trace-buffer-size kB: set the per-cpu trace 
> buffer size in kB",

Could probably do:

#ifdef HAVE_LIBCPUPOWER_SUPPORT
> + "--deepest-idle-state n: only go down to idle state 
> n on cpus used by timerlat to reduce exit from idle latency",
#else
+   "--deepest-idle-state n: [rtla built without 
libcpupower, --deepest-idle-state is not supported]",
#endif

>   NULL,
>   };
>  
> @@ -518,6 +520,9 @@ static struct timerlat_top_params
>   /* disabled by default */
>   params->dma_latency = -1;
>  
> + /* disabled by default */
> + params->deepest_idle_state = -2;
> +
>   /* display data in microseconds */
>   params->output_divisor = 1000;
>  
> @@ -550,6 +555,7 @@ static struct timerlat_top_params
>   {"aa-only", required_argument,  0, '5'},
>   {"warm-up", required_argument,  0, '6'},
>   {"trace-buffer-size",   required_argument,  0, '7'},
> + {"deepest-idle-state",  required_argument,  0, '8'},
>   {0, 0, 0, 0}
>   };
>  
> @@ -726,6 +732,9 @@ static struct timerlat_top_params
>   case '7':
>   params->buffer_size = get_llong_from_str(optarg);
>   break;
> + case '8':
> + params->deepest_idle_state = get_llong_from_str(optarg);
> + break;
>   default:
>   timerlat_top_usage("Invalid option");
>   }
> @@ -922,6 +931,9 @@ int timerlat_top_main(int argc, char *argv[])
>   int return_value = 1;
>   char *max_lat;
>   int retval;
> +#ifdef HAVE_LIBCPUPOWER_SUPPORT
> + int i;
> +#endif /* HAVE_LIBCPUPOWER_SUPPORT */
>  
>   params = timerlat_top_parse_args(argc, argv);
>   if (!params)
> @@ -971,6 +983,26 @@ int timerlat_top_main(int argc, char *argv[])
>   }
>   }
>  
> + if (params->deepest_idle_state >= -1) {
> +#ifdef HAVE_LIBCPUPOWER_SUPPORT
> + for (i = 0; i < sysconf(_SC_NPROCESSORS_CONF); i++) {
> + if (params->cpus && !CPU_ISSET(i, 
> ¶ms->monitored_cpus))
> + continue;
> + if (save_cpu_idle_disable_state(i) < 0) {
> + err_msg("Could not save cpu idle state.\n");
> + goto out_free;
> + }
> + if (set_deepest_cpu_idle_state(i, 
> params->deepest_idle_state) < 0) {
> + err_msg("Could not set deepest cpu idle 
> state.\n");
> + goto out_free;
> + }
> + }
> +#else
> + err_msg("rtla built without libcpupower, --deepest-idle-state 
> is not supported\n");
> + goto out_free;
> +#endif /* HAVE_LIBCPU

Re: [PATCH] nvdimm/pmem: Set dax flag for all 'PFN_MAP' cases

2024-07-31 Thread Christoph Hellwig
Looks good:

Reviewed-by: Christoph Hellwig 



Re: [PATCH v4 2/2] rust: add tracepoint support

2024-07-31 Thread Steven Rostedt
On Tue, 30 Jul 2024 11:35:27 +0100
Gary Guo  wrote:

> > +/*
> > + * Declare an exported function that Rust code can call to trigger this
> > + * tracepoint. This function does not include the static branch; that is 
> > done
> > + * in Rust to avoid a function call when the tracepoint is disabled.
> > + */
> > +#define DEFINE_RUST_DO_TRACE(name, proto, args)
> > +#define DEFINE_RUST_DO_TRACE_REAL(name, proto, args)   
> > \
> > +   notrace void rust_do_trace_##name(proto)\
> > +   {   \
> > +   __DO_TRACE(name,\
> > +   TP_ARGS(args),  \
> > +   cpu_online(raw_smp_processor_id()), 0); \  
> 
> I guess this doesn't support conditions. Currently the conditions are
> specified during declaration and not during definition.
> 
> Would it make sense to have
> 
>   static inline void do_trace_##name(proto)
>   {
>   __DO_TRACE(name, TP_ARGS(args), TP_CONDITION(cond), 0);

But where is the "cond" passed in from?

I guess in the future if you want to add conditions, you would then just
add:

#define DEFINE_RUST_DO_TRACE_REAL_CONDITION(name, proto, args, cond)
\
notrace void rust_do_trace_##name(proto)\
{   \
__DO_TRACE(name,\
TP_ARGS(args),  \
cpu_online(raw_smp_processor_id()) &&   \
TP_CONDITION(cond), 0); \
}

-- Steve

>   }
> 
> in `__DECLARE_TRACE` and then simply call it in `rust_do_trace_##name`?
> 
> > +   }
> > +
> >  /*



Re: [PATCH v2 5/5] uprobes: make uprobe_register() return struct uprobe *

2024-07-31 Thread Andrii Nakryiko
On Mon, Jul 29, 2024 at 6:45 AM Oleg Nesterov  wrote:
>
> This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *"
> rather than inode + offset. This simplifies the code and allows to avoid
> the unnecessary find_uprobe() + put_uprobe() in these functions.
>
> TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that
> this uprobe can't be freed before up_write(&uprobe->register_rwsem).
>
> Signed-off-by: Oleg Nesterov 
> Acked-by: Andrii Nakryiko 
> ---
>  include/linux/uprobes.h | 15 +-
>  kernel/events/uprobes.c | 56 +++--
>  kernel/trace/bpf_trace.c| 25 -
>  kernel/trace/trace_uprobe.c | 26 -
>  4 files changed, 55 insertions(+), 67 deletions(-)
>

You'll need something like below to not break our bpf_testmod. And
please send full patch sets, not individually updated patches, it's a
PITA to deal with. Thanks!

commit 9f739a9997ab833394196459fa7e6dd4d13dd48b (HEAD -> uprobes-oleg-cleanups)
Author: Andrii Nakryiko 
Date:   Wed Jul 31 09:15:46 2024 -0700

uprobes: fix bpf_testmod after uprobe_register/uprobe_unregister API change

Signed-off-by: Andrii Nakryiko 

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 5f152afdec2f..73a6b041bcce 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -431,6 +431,7 @@ uprobe_ret_handler(struct uprobe_consumer *self,
unsigned long func,
 }

 struct testmod_uprobe {
+   struct uprobe *uprobe;
struct path path;
loff_t offset;
struct uprobe_consumer consumer;
@@ -458,12 +459,14 @@ static int testmod_register_uprobe(loff_t offset)
if (err)
goto out;

-   err = uprobe_register(d_real_inode(uprobe.path.dentry),
- offset, 0, &uprobe.consumer);
-   if (err)
+   uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry),
+   offset, 0, &uprobe.consumer);
+   if (IS_ERR(uprobe.uprobe)) {
path_put(&uprobe.path);
-   else
+   uprobe.uprobe = NULL;
+   } else {
uprobe.offset = offset;
+   }

 out:
mutex_unlock(&testmod_uprobe_mutex);
@@ -474,10 +477,10 @@ static void testmod_unregister_uprobe(void)
 {
mutex_lock(&testmod_uprobe_mutex);

-   if (uprobe.offset) {
-   uprobe_unregister(d_real_inode(uprobe.path.dentry),
- uprobe.offset, &uprobe.consumer);
+   if (uprobe.uprobe) {
+   uprobe_unregister(uprobe.uprobe, &uprobe.consumer);
uprobe.offset = 0;
+   uprobe.uprobe = NULL;
}

mutex_unlock(&testmod_uprobe_mutex);


[...]



Re: [PATCH v2 1/2] dt-bindings: remoteproc: Support multiple reserved memory regions

2024-07-31 Thread Krzysztof Kozlowski
On 31/07/2024 15:54, Krzysztof Kozlowski wrote:
> On 31/07/2024 15:41, Shun-Yi Wang (王順億) wrote:
>> Hi Krzysztof,
>>
>> Thanks for the reviews.
>>
>> On Wed, 2024-07-31 at 14:40 +0200, Krzysztof Kozlowski wrote:
>>>  
>>> External email : Please do not click links or open attachments until
>>> you have verified the sender or the content.
>>>  On 31/07/2024 14:17, Shun-yi Wang wrote:
 From: "shun-yi.wang" 

 Remove the maximum number of 1 for memory regions.
>>>
>>> Why?
>>>
>>
>> For future applications, MTK SCP will reserve multiple regions for
>> specific hardware use.
> 
> That's not a reason to drop the constraint on an entry.

Maybe I was not that clear, so to explain more: entries must be
constrained, so provide the widest constraints in the top-level
properties and narrow the constraints per variant in allOf:if:then:, like:

https://elixir.bootlin.com/linux/v6.8/source/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml#L132

Best regards,
Krzysztof




Re: [PATCH v2 5/5] uprobes: make uprobe_register() return struct uprobe *

2024-07-31 Thread Peter Zijlstra
On Wed, Jul 31, 2024 at 09:18:00AM -0700, Andrii Nakryiko wrote:
> On Mon, Jul 29, 2024 at 6:45 AM Oleg Nesterov  wrote:
> >
> > This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *"
> > rather than inode + offset. This simplifies the code and allows to avoid
> > the unnecessary find_uprobe() + put_uprobe() in these functions.
> >
> > TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that
> > this uprobe can't be freed before up_write(&uprobe->register_rwsem).
> >
> > Signed-off-by: Oleg Nesterov 
> > Acked-by: Andrii Nakryiko 
> > ---
> >  include/linux/uprobes.h | 15 +-
> >  kernel/events/uprobes.c | 56 +++--
> >  kernel/trace/bpf_trace.c| 25 -
> >  kernel/trace/trace_uprobe.c | 26 -
> >  4 files changed, 55 insertions(+), 67 deletions(-)
> >
> 
> You'll need something like below to not break our bpf_testmod. And
> > please send full patch sets, not individually updated patches, it's a
> PITA to deal with. Thanks!

Do I stuff this on top of Oleg's patch or do you want me to fold it in
one of them?

> commit 9f739a9997ab833394196459fa7e6dd4d13dd48b (HEAD -> 
> uprobes-oleg-cleanups)
> Author: Andrii Nakryiko 
> Date:   Wed Jul 31 09:15:46 2024 -0700
> 
> uprobes: fix bpf_testmod after uprobe_register/uprobe_unregister API 
> change
> 
> Signed-off-by: Andrii Nakryiko 
> 
> diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
> b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
> index 5f152afdec2f..73a6b041bcce 100644
> --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
> +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
> @@ -431,6 +431,7 @@ uprobe_ret_handler(struct uprobe_consumer *self,
> unsigned long func,
>  }
> 
>  struct testmod_uprobe {
> +   struct uprobe *uprobe;
> struct path path;
> loff_t offset;
> struct uprobe_consumer consumer;
> @@ -458,12 +459,14 @@ static int testmod_register_uprobe(loff_t offset)
> if (err)
> goto out;
> 
> -   err = uprobe_register(d_real_inode(uprobe.path.dentry),
> - offset, 0, &uprobe.consumer);
> -   if (err)
> +   uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry),
> +   offset, 0, &uprobe.consumer);
> +   if (IS_ERR(uprobe.uprobe)) {
> path_put(&uprobe.path);
> -   else
> +   uprobe.uprobe = NULL;
> +   } else {
> uprobe.offset = offset;
> +   }
> 
>  out:
> mutex_unlock(&testmod_uprobe_mutex);
> @@ -474,10 +477,10 @@ static void testmod_unregister_uprobe(void)
>  {
> mutex_lock(&testmod_uprobe_mutex);
> 
> -   if (uprobe.offset) {
> -   uprobe_unregister(d_real_inode(uprobe.path.dentry),
> - uprobe.offset, &uprobe.consumer);
> +   if (uprobe.uprobe) {
> +   uprobe_unregister(uprobe.uprobe, &uprobe.consumer);
> uprobe.offset = 0;
> +   uprobe.uprobe = NULL;
> }
> 
> mutex_unlock(&testmod_uprobe_mutex);
> 
> 
> [...]



Re: [PATCH v2 5/5] uprobes: make uprobe_register() return struct uprobe *

2024-07-31 Thread Andrii Nakryiko
On Wed, Jul 31, 2024 at 9:56 AM Peter Zijlstra  wrote:
>
> On Wed, Jul 31, 2024 at 09:18:00AM -0700, Andrii Nakryiko wrote:
> > On Mon, Jul 29, 2024 at 6:45 AM Oleg Nesterov  wrote:
> > >
> > > This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *"
> > > rather than inode + offset. This simplifies the code and allows to avoid
> > > the unnecessary find_uprobe() + put_uprobe() in these functions.
> > >
> > > TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that
> > > this uprobe can't be freed before up_write(&uprobe->register_rwsem).
> > >
> > > Signed-off-by: Oleg Nesterov 
> > > Acked-by: Andrii Nakryiko 
> > > ---
> > >  include/linux/uprobes.h | 15 +-
> > >  kernel/events/uprobes.c | 56 +++--
> > >  kernel/trace/bpf_trace.c| 25 -
> > >  kernel/trace/trace_uprobe.c | 26 -
> > >  4 files changed, 55 insertions(+), 67 deletions(-)
> > >
> >
> > You'll need something like below to not break our bpf_testmod. And
> > please send pull patch sets, not individually updated patches, it's a
> > PITA to deal with. Thanks!
>
> Do I stuff this on top of Oleg's patch or do you want me to fold it in
> one of them?

Please fold so we have better (potential) bisectability of BPF
selftests, thanks!

>
> > commit 9f739a9997ab833394196459fa7e6dd4d13dd48b (HEAD -> 
> > uprobes-oleg-cleanups)
> > Author: Andrii Nakryiko 
> > Date:   Wed Jul 31 09:15:46 2024 -0700
> >
> > uprobes: fix bpf_testmod after uprobe_register/uprobe_unregister API 
> > change
> >
> > Signed-off-by: Andrii Nakryiko 
> >
> > diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
> > b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
> > index 5f152afdec2f..73a6b041bcce 100644
> > --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
> > +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
> > @@ -431,6 +431,7 @@ uprobe_ret_handler(struct uprobe_consumer *self,
> > unsigned long func,
> >  }
> >
> >  struct testmod_uprobe {
> > +   struct uprobe *uprobe;
> > struct path path;
> > loff_t offset;
> > struct uprobe_consumer consumer;
> > @@ -458,12 +459,14 @@ static int testmod_register_uprobe(loff_t offset)
> > if (err)
> > goto out;
> >
> > -   err = uprobe_register(d_real_inode(uprobe.path.dentry),
> > - offset, 0, &uprobe.consumer);
> > -   if (err)
> > +   uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry),
> > +   offset, 0, &uprobe.consumer);
> > +   if (IS_ERR(uprobe.uprobe)) {
> > path_put(&uprobe.path);
> > -   else
> > +   uprobe.uprobe = NULL;
> > +   } else {
> > uprobe.offset = offset;
> > +   }
> >
> >  out:
> > mutex_unlock(&testmod_uprobe_mutex);
> > @@ -474,10 +477,10 @@ static void testmod_unregister_uprobe(void)
> >  {
> > mutex_lock(&testmod_uprobe_mutex);
> >
> > -   if (uprobe.offset) {
> > -   uprobe_unregister(d_real_inode(uprobe.path.dentry),
> > - uprobe.offset, &uprobe.consumer);
> > +   if (uprobe.uprobe) {
> > +   uprobe_unregister(uprobe.uprobe, &uprobe.consumer);
> > uprobe.offset = 0;
> > +   uprobe.uprobe = NULL;
> > }
> >
> > mutex_unlock(&testmod_uprobe_mutex);
> >
> >
> > [...]



Re: [PATCH v4 1/2] rust: add static_key_false

2024-07-31 Thread Peter Zijlstra
On Fri, Jun 28, 2024 at 01:23:31PM +, Alice Ryhl wrote:

>  rust/kernel/arch/arm64/jump_label.rs | 34 
>  rust/kernel/arch/loongarch/jump_label.rs | 35 +
>  rust/kernel/arch/mod.rs  | 24 
>  rust/kernel/arch/riscv/jump_label.rs | 38 
> 
>  rust/kernel/arch/x86/jump_label.rs   | 35 +
>  rust/kernel/lib.rs   |  2 ++
>  rust/kernel/static_key.rs| 32 +++
>  scripts/Makefile.build   |  2 +-
>  8 files changed, 201 insertions(+), 1 deletion(-)

So I really find the amount of duplicated asm offensive. It is far too
easy for any of this to get out of sync.

> diff --git a/rust/kernel/arch/x86/jump_label.rs 
> b/rust/kernel/arch/x86/jump_label.rs
> new file mode 100644
> index ..383bed273c50
> --- /dev/null
> +++ b/rust/kernel/arch/x86/jump_label.rs
> @@ -0,0 +1,35 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +// Copyright (C) 2024 Google LLC.
> +
> +//! X86 Rust implementation of jump_label.h
> +
> +/// x86 implementation of arch_static_branch
> +#[macro_export]
> +#[cfg(target_arch = "x86_64")]
> +macro_rules! arch_static_branch {
> +($key:path, $keytyp:ty, $field:ident, $branch:expr) => {'my_label: {
> +core::arch::asm!(
> +r#"
> +1: .byte 0x0f,0x1f,0x44,0x00,0x00
> +
> +.pushsection __jump_table,  "aw"
> +.balign 8
> +.long 1b - .
> +.long {0} - .
> +.quad {1} + {2} + {3} - .
> +.popsection
> +"#,
> +label {
> +break 'my_label true;
> +},
> +sym $key,
> +const ::core::mem::offset_of!($keytyp, $field),
> +const $crate::arch::bool_to_int($branch),
> +);
> +
> +break 'my_label false;
> +}};
> +}

Note that this uses the forced 5-byte version, and not the dynamically
sized one. On top of that it hard-codes the nop5 string :/

Please work harder to not have to duplicate stuff like this.

NAK.




Re: [PATCH v2 5/5] uprobes: make uprobe_register() return struct uprobe *

2024-07-31 Thread Peter Zijlstra
On Wed, Jul 31, 2024 at 10:01:47AM -0700, Andrii Nakryiko wrote:
> On Wed, Jul 31, 2024 at 9:56 AM Peter Zijlstra  wrote:
> >
> > On Wed, Jul 31, 2024 at 09:18:00AM -0700, Andrii Nakryiko wrote:
> > > On Mon, Jul 29, 2024 at 6:45 AM Oleg Nesterov  wrote:
> > > >
> > > > This way uprobe_unregister() and uprobe_apply() can use "struct uprobe 
> > > > *"
> > > > rather than inode + offset. This simplifies the code and allows to avoid
> > > > the unnecessary find_uprobe() + put_uprobe() in these functions.
> > > >
> > > > TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure 
> > > > that
> > > > this uprobe can't be freed before up_write(&uprobe->register_rwsem).
> > > >
> > > > Signed-off-by: Oleg Nesterov 
> > > > Acked-by: Andrii Nakryiko 
> > > > ---
> > > >  include/linux/uprobes.h | 15 +-
> > > >  kernel/events/uprobes.c | 56 +++--
> > > >  kernel/trace/bpf_trace.c| 25 -
> > > >  kernel/trace/trace_uprobe.c | 26 -
> > > >  4 files changed, 55 insertions(+), 67 deletions(-)
> > > >
> > >
> > > You'll need something like below to not break our bpf_testmod. And
> > > please send pull patch sets, not individually updated patches, it's a
> > > PITA to deal with. Thanks!
> >
> > Do I stuff this on top of Oleg's patch or do you want me to fold it in
> > one of them?
> 
> Please fold so we have better (potential) bisectability of BPF
> selftests, thanks!

Fold where, patch 5?



Re: [PATCH v2 5/5] uprobes: make uprobe_register() return struct uprobe *

2024-07-31 Thread Andrii Nakryiko
On Wed, Jul 31, 2024 at 10:05 AM Peter Zijlstra  wrote:
>
> On Wed, Jul 31, 2024 at 10:01:47AM -0700, Andrii Nakryiko wrote:
> > On Wed, Jul 31, 2024 at 9:56 AM Peter Zijlstra  wrote:
> > >
> > > On Wed, Jul 31, 2024 at 09:18:00AM -0700, Andrii Nakryiko wrote:
> > > > On Mon, Jul 29, 2024 at 6:45 AM Oleg Nesterov  wrote:
> > > > >
> > > > > This way uprobe_unregister() and uprobe_apply() can use "struct 
> > > > > uprobe *"
> > > > > rather than inode + offset. This simplifies the code and allows to 
> > > > > avoid
> > > > > the unnecessary find_uprobe() + put_uprobe() in these functions.
> > > > >
> > > > > TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure 
> > > > > that
> > > > > this uprobe can't be freed before up_write(&uprobe->register_rwsem).
> > > > >
> > > > > Signed-off-by: Oleg Nesterov 
> > > > > Acked-by: Andrii Nakryiko 
> > > > > ---
> > > > >  include/linux/uprobes.h | 15 +-
> > > > >  kernel/events/uprobes.c | 56 
> > > > > +++--
> > > > >  kernel/trace/bpf_trace.c| 25 -
> > > > >  kernel/trace/trace_uprobe.c | 26 -
> > > > >  4 files changed, 55 insertions(+), 67 deletions(-)
> > > > >
> > > >
> > > > You'll need something like below to not break our bpf_testmod. And
> > > > please send full patch sets, not individually updated patches, it's a
> > > > PITA to deal with. Thanks!
> > >
> > > Do I stuff this on top of Oleg's patch or do you want me to fold it in
> > > one of them?
> >
> > Please fold so we have better (potential) bisectability of BPF
> > selftests, thanks!
>
> Fold where, patch 5?

Yep, this one, where Oleg changed uprobe_register/uprobe_unregister API.

But note, I did the lazy thing and just copy/pasted `git show` output
into Gmail, so all the whitespace is butchered and it's unlikely you'll
be able to apply that patch as is. My expectation was that Oleg would
just incorporate it by hand and send the final v4 patch set.



Re: [PATCH v2 5/5] uprobes: make uprobe_register() return struct uprobe *

2024-07-31 Thread Oleg Nesterov
On 07/31, Peter Zijlstra wrote:
>
> On Wed, Jul 31, 2024 at 10:01:47AM -0700, Andrii Nakryiko wrote:
> > > Do I stuff this on top of Oleg's patch or do you want me to fold it in
> > > one of them?
> >
> > Please fold so we have better (potential) bisectability of BPF
> > selftests, thanks!
>
> Fold where, patch 5?

Yes...

Or I can resend these 2 series as v3 1-8 with 5/5 updated (thanks Andrii !!!),
whatever is more convenient for you.

Oleg.




Re: [PATCH v2 5/5] uprobes: make uprobe_register() return struct uprobe *

2024-07-31 Thread Oleg Nesterov
On 07/31, Andrii Nakryiko wrote:
>
> My expectation was that Oleg will
> just incorporate that by hand and will send the final v4 patch set.

Will do tomorrow, thanks.

Oleg.




Re: [PATCH net-next v12 04/14] mm: page_frag: add '_va' suffix to page_frag API

2024-07-31 Thread Alexander Duyck
On Wed, Jul 31, 2024 at 5:50 AM Yunsheng Lin  wrote:
>
> Currently the page_frag API returns a 'virtual address'
> or 'va' when allocating, and expects a 'virtual address' or
> 'va' as input when freeing.
> 
> We are about to support new use cases where the caller
> needs to deal with 'struct page', or with both
> 'va' and 'struct page'. In order to differentiate the API
> handling between 'va' and 'struct page', add a '_va' suffix
> to the corresponding API, mirroring the page_pool_alloc_va()
> API of the page_pool, so that callers expecting to deal with
> va, page, or both va and page may call the page_frag_alloc_va*,
> page_frag_alloc_pg*, or page_frag_alloc* API accordingly.
>
> CC: Alexander Duyck 
> Signed-off-by: Yunsheng Lin 
> Reviewed-by: Subbaraya Sundeep 

I am NAKing this patch. It is a pointless rename that is just going to
obfuscate the git history for these callers.

As I believe I said before, I would prefer to see this work more like
the handling of __get_free_pages() and __free_pages() in terms of the use
of pages versus pointers and/or longs. Pushing this API aside because
you want to reuse the name for something different isn't a valid
reason to rename an existing API and will just lead to confusion.
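
For reference, the long-standing pattern being pointed to here is the split
between address-based and page-based allocators; a minimal sketch (these are
the existing kernel APIs, shown purely for illustration):

static void page_vs_va_example(void)
{
        /* address-based view: allocate and free by virtual address */
        unsigned long addr = __get_free_pages(GFP_KERNEL, 0);

        if (addr)
                free_pages(addr, 0);

        /* page-based view: allocate and free by struct page */
        struct page *page = alloc_pages(GFP_KERNEL, 0);

        if (page)
                __free_pages(page, 0);
}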



Re: [PATCH v4 2/2] rust: add tracepoint support

2024-07-31 Thread Benno Lossin
On 28.06.24 15:23, Alice Ryhl wrote:
> diff --git a/rust/kernel/tracepoint.rs b/rust/kernel/tracepoint.rs
> new file mode 100644
> index ..1005f09e0330
> --- /dev/null
> +++ b/rust/kernel/tracepoint.rs
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +// Copyright (C) 2024 Google LLC.
> +
> +//! Logic for tracepoints.
> +
> +/// Declare the Rust entry point for a tracepoint.
> +#[macro_export]
> +macro_rules! declare_trace {
> +($($(#[$attr:meta])* $pub:vis fn $name:ident($($argname:ident : 
> $argtyp:ty),* $(,)?);)*) => {$(
> +$( #[$attr] )*
> +#[inline(always)]
> +$pub unsafe fn $name($($argname : $argtyp),*) {
> +#[cfg(CONFIG_TRACEPOINTS)]
> +{
> +use $crate::bindings::*;

Why is this needed, can't you put this into the invocation of `paste!`?
ie `[< $crate::bindings::__tracepoint_ $name >]`?

> +
> +// SAFETY: It's always okay to query the static key for a 
> tracepoint.
> +let should_trace = unsafe {
> +$crate::macros::paste! {
> +$crate::static_key::static_key_false!(
> +[< __tracepoint_ $name >],
> +$crate::bindings::tracepoint,
> +key
> +)
> +}
> +};
> +
> +if should_trace {
> +$crate::macros::paste! {
> +// SAFETY: The caller guarantees that it is okay to 
> call this tracepoint.

Can you add this on the docs of `$name`? ie add a Safety section.
The docs should still appear when creating them/when LSPs show them to
you.

---
Cheers,
Benno

> +unsafe { [< rust_do_trace_ $name >]($($argname),*) };
> +}
> +}
> +}
> +
> +#[cfg(not(CONFIG_TRACEPOINTS))]
> +{
> +// If tracepoints are disabled, insert a trivial use of each 
> argument
> +// to avoid unused argument warnings.
> +$( let _unused = $argname; )*
> +}
> +}
> +)*}
> +}
> +
> +pub use declare_trace;
> 
> --
> 2.45.2.803.g4e1b14247a-goog
> 




[PATCH] remoteproc: Use of_property_present()

2024-07-31 Thread Rob Herring (Arm)
Use of_property_present() to test for property presence rather than
of_(find|get)_property(). This is part of a larger effort to remove
callers of of_find_property() and similar functions. of_find_property()
leaks the DT struct property and data pointers which is a problem for
dynamically allocated nodes which may be freed.

Signed-off-by: Rob Herring (Arm) 
---
 drivers/remoteproc/imx_dsp_rproc.c  | 2 +-
 drivers/remoteproc/imx_rproc.c  | 2 +-
 drivers/remoteproc/xlnx_r5_remoteproc.c | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/remoteproc/imx_dsp_rproc.c 
b/drivers/remoteproc/imx_dsp_rproc.c
index 087506e21508..376187ad5754 100644
--- a/drivers/remoteproc/imx_dsp_rproc.c
+++ b/drivers/remoteproc/imx_dsp_rproc.c
@@ -509,7 +509,7 @@ static int imx_dsp_rproc_mbox_alloc(struct imx_dsp_rproc 
*priv)
struct mbox_client *cl;
int ret;
 
-   if (!of_get_property(dev->of_node, "mbox-names", NULL))
+   if (!of_property_present(dev->of_node, "mbox-names"))
return 0;
 
cl = &priv->cl;
diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c
index 144c8e9a642e..8d7ecc809c67 100644
--- a/drivers/remoteproc/imx_rproc.c
+++ b/drivers/remoteproc/imx_rproc.c
@@ -807,7 +807,7 @@ static int imx_rproc_xtr_mbox_init(struct rproc *rproc)
if (priv->tx_ch && priv->rx_ch)
return 0;
 
-   if (!of_get_property(dev->of_node, "mbox-names", NULL))
+   if (!of_property_present(dev->of_node, "mbox-names"))
return 0;
 
cl = &priv->cl;
diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c 
b/drivers/remoteproc/xlnx_r5_remoteproc.c
index 596f3ffb8935..2cea97c746fd 100644
--- a/drivers/remoteproc/xlnx_r5_remoteproc.c
+++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
@@ -1059,7 +1059,7 @@ static int zynqmp_r5_core_init(struct zynqmp_r5_cluster 
*cluster,
r5_core = cluster->r5_cores[0];
 
/* Maintain backward compatibility for zynqmp by using hardcode TCM 
address. */
-   if (of_find_property(r5_core->np, "reg", NULL))
+   if (of_property_present(r5_core->np, "reg"))
ret = zynqmp_r5_get_tcm_node_from_dt(cluster);
else if (device_is_compatible(dev, "xlnx,zynqmp-r5fss"))
ret = zynqmp_r5_get_tcm_node(cluster);
@@ -1086,7 +1086,7 @@ static int zynqmp_r5_core_init(struct zynqmp_r5_cluster 
*cluster,
return ret;
}
 
-   if (of_find_property(dev_of_node(dev), "xlnx,tcm-mode", NULL) ||
+   if (of_property_present(dev_of_node(dev), "xlnx,tcm-mode") ||
device_is_compatible(dev, "xlnx,zynqmp-r5fss")) {
ret = zynqmp_pm_set_tcm_config(r5_core->pm_domain_id,
   tcm_mode);
@@ -1147,7 +1147,7 @@ static int zynqmp_r5_cluster_init(struct 
zynqmp_r5_cluster *cluster)
return -EINVAL;
}
 
-   if (of_find_property(dev_node, "xlnx,tcm-mode", NULL)) {
+   if (of_property_present(dev_node, "xlnx,tcm-mode")) {
ret = of_property_read_u32(dev_node, "xlnx,tcm-mode", (u32 
*)&tcm_mode);
if (ret)
return ret;
-- 
2.43.0




[PATCH] nvdimm: Use of_property_present() and of_property_read_bool()

2024-07-31 Thread Rob Herring (Arm)
Use of_property_present() and of_property_read_bool() to test
property presence and read boolean properties rather than
of_(find|get)_property(). This is part of a larger effort to remove
callers of of_find_property() and similar functions.
of_(find|get)_property() leak the DT struct property and data pointers
which is a problem for dynamically allocated nodes which may be freed.

Signed-off-by: Rob Herring (Arm) 
---
 drivers/nvdimm/of_pmem.c | 2 +-
 drivers/nvmem/layouts.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c
index 403384f25ce3..b4a1cf70e8b7 100644
--- a/drivers/nvdimm/of_pmem.c
+++ b/drivers/nvdimm/of_pmem.c
@@ -47,7 +47,7 @@ static int of_pmem_region_probe(struct platform_device *pdev)
}
platform_set_drvdata(pdev, priv);
 
-   is_volatile = !!of_find_property(np, "volatile", NULL);
+   is_volatile = of_property_read_bool(np, "volatile");
dev_dbg(&pdev->dev, "Registering %s regions from %pOF\n",
is_volatile ? "volatile" : "non-volatile",  np);
 
diff --git a/drivers/nvmem/layouts.c b/drivers/nvmem/layouts.c
index 77a4119efea8..65d39e19f6ec 100644
--- a/drivers/nvmem/layouts.c
+++ b/drivers/nvmem/layouts.c
@@ -123,7 +123,7 @@ static int nvmem_layout_bus_populate(struct nvmem_device 
*nvmem,
int ret;
 
/* Make sure it has a compatible property */
-   if (!of_get_property(layout_dn, "compatible", NULL)) {
+   if (!of_property_present(layout_dn, "compatible")) {
pr_debug("%s() - skipping %pOF, no compatible prop\n",
 __func__, layout_dn);
return 0;
-- 
2.43.0




Re: [PATCH v2] x86/cpu: Adjust the error message when BIOS does not support SGX

2024-07-31 Thread Thomas Gleixner
On Wed, Jul 31 2024 at 23:30, wangy...@uniontech.com wrote:
> When SGX is not supported by the BIOS, the kernel log still outputs
> the error 'SGX disabled by BIOS', which can be confusing since
> there might not be an SGX-related option in the BIOS settings.
>
> From the kernel's point of view, it's difficult to distinguish between
> the BIOS not supporting SGX and the BIOS supporting SGX but having it
> disabled.
>
> Therefore, update the error message to
> 'SGX disabled or unsupported by BIOS' to make it easier for those
> reading kernel logs to understand what's happening.
>
> Reported-by: Bo Wu 
> Link: https://github.com/linuxdeepin/developer-center/issues/10032
> Reviewed-by: Kai Huang 
> Link: 
> https://lore.kernel.org/all/a30f7700c7817b3e7e2f2bdb37d5c10a318b2c3b.ca...@intel.com/
> Signed-off-by: Zelong Xiang 
> Signed-off-by: WangYuli 

This Signed-off-by chain is invalid. See:

https://www.kernel.org/doc/html/latest/process/submitting-patches.html#sign-your-work-the-developer-s-certificate-of-origin

Thanks,

tglx



Re: [PATCH 00/15] Implement MODVERSIONS for Rust

2024-07-31 Thread Neal Gompa
On Friday, July 26, 2024 5:05:22 PM EDT Sami Tolvanen wrote:
> Hi Petr,
> 
> On Mon, Jul 22, 2024 at 8:20 AM Petr Pavlu  wrote:
> > From my perspective, I'm okay if gendwarfksyms doesn't provide
> > functionality to compare a new object file with its reference symtypes
> > file.
> > 
> > As mentioned, genksyms has this functionality but I actually think the
> > way it works is not ideal. Its design is to operate on one compilation
> > unit at a time. This has the advantage that a comparison of each file
> > is performed in parallel during the build, simply because of the make
> > job system. On the other hand, it has two problems.
> > 
> > The first one is that genksyms doesn't provide a comparison of the
> > kernel as a whole. This means that the tool gives rather scattered and
> > duplicated output about changed structs in the build log. Ideally, one
> > would like to see a single compact report about what changed at the end
> > of the build.
> 
> Sure, that makes sense. Android uses STG for this, which might be
> useful to other folks too:
> 
> https://android.googlesource.com/platform/external/stg/
> https://android.googlesource.com/platform/external/stg/+/refs/heads/main/doc
> /stgdiff.md#output-formats
> > A few months ago, I also started working on a tool inspired by this
> > script. The goal is to have similar functionality but hopefully with
> > a much faster implementation. Hence, this tool is written in a compiled
> > language (Rust at the moment) and should also become multi-threaded. I'm
> > hoping to find some time to make progress on it and make the code
> > public. It could later be added to the upstream kernel to replace the
> > comparison functionality implemented by genksyms, if there is interest.
> > 
> > So as mentioned, I'm fine if gendwarfksyms doesn't have this
> > functionality. However, for distributions that rely on the symtypes
> > format, I'd be interested in having gendwarfksyms output its dump data
> > in this format as well.
> 
> We can definitely tweak the output format, but I'm not sure if making
> it fully compatible with the genksyms symtypes format is feasible,
> especially for Rust code. I also intentionally decided to use DWARF
> tag names in the output instead of shorthands like s# etc. to make it
> a bit more readable.
> 
> > For example, instead of producing:
> > 
> > gendwarfksyms: process_exported_symbols: _some_mangled_func_name (@ XYZ)
> > subprogram(
> >     [formal parameters...]
> > )
> > -> structure_type core::result::Result<(), core::fmt::Error> {
> >     [a description of the structure...]
> > };
> > 
> > .. the output could be something like this:
> > 
> > S#'core::result::Result<(), core::fmt::Error>' structure_type
> > core::result::Result<(), core::fmt::Error> { [a description of the
> > structure...] }
> >
> > _some_mangled_func_name subprogram _some_mangled_func_name
> > ( [formal parameters...] ) -> S#'core::result::Result<(), core::fmt::Error>'
> This wouldn't be enough to make the output format compatible with
> symtypes though. genksyms basically produces a simple key-value pair
> database while gendwarfksyms currently outputs the fully expanded type
> string for each symbol. If you need the tool to produce a type
> database, it might also be worth discussing if we should use a bit
> less ad hoc format in that case.
> 
> One more thing to note about the current --debug output is that it
> directly correlates with the debugging information and thus may not
> contain all aliases. For example, the Rust compiler deduplicates
> identical function implementations (e.g. Deref::deref and
> DerefMut::deref_mut etc.), but only one of the symbol names appears in
> DWARF. We use symbol addresses to print out #SYMVERs also for the
> aliases, but they don't show up in the debugging output right now.
> 
> > > If using unions here is acceptable to everyone, a simple solution
> > > would be to use a known name prefix for the reserved members and teach
> > > gendwarfksyms to only print out the original type for the replaced
> > > ones. For example:
> > > 
> > > The initial placeholder:
> > >
> > >         u8 __kabi_reserved_1[8];
> > >
> > > After replacement:
> > >
> > >         union {
> > >                 u64 new_member;
> > >                 struct {
> > >                         u8 __kabi_reserved_1[8];
> > >                 };
> > >         }
> > > 
> > > Here gendwarfksyms would see the __kabi_reserved prefix and only use
> > > u8 [8] for the CRC calculation. Does this sound reasonable?
> > 
> > I like this idea. I think it's good that the necessary kABI information
> > about an updated member can be expressed at the source code level in
> > place of the actual change, and it isn't needed to feed additional input
> > to the tool.
> 
> OK, cool. I agree that being able to specify these details in source
> code is much cleaner. I'll add an implementation for this, and for the
> definition visibility issue Greg mentioned

Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Michael S. Tsirkin
On Wed, Jul 31, 2024 at 10:59:47AM +0800, Jason Wang wrote:
> This patch synchronize operstate with admin state per RFC2863.
> 
> This is done by trying to toggle the carrier upon open/close and
> synchronize with the config change work. This allows propagate status
> correctly to stacked devices like:
> 
> ip link add link enp0s3 macvlan0 type macvlan
> ip link set link enp0s3 down
> ip link show
> 
> Before this patch:
> 
> 3: enp0s3:  mtu 1500 qdisc pfifo_fast state DOWN mode 
> DEFAULT group default qlen 1000
> link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> ..
> 5: macvlan0@enp0s3:  mtu 1500 qdisc 
> noqueue state UP mode DEFAULT group default qlen 1000
> link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> 
> After this patch:
> 
> 3: enp0s3:  mtu 1500 qdisc pfifo_fast state DOWN mode 
> DEFAULT group default qlen 1000
> link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> ...
> 5: macvlan0@enp0s3:  mtu 1500 qdisc 
> noqueue state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
> link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> 
> Cc: Venkat Venkatsubra 
> Cc: Gia-Khanh Nguyen 
> Signed-off-by: Jason Wang 

Changelog?

> ---
>  drivers/net/virtio_net.c | 84 ++--
>  1 file changed, 54 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0383a3e136d6..0cb93261eba1 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2878,6 +2878,7 @@ static int virtnet_enable_queue_pair(struct 
> virtnet_info *vi, int qp_index)
>   return err;
>  }
>  
> +
>  static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim)
>  {
>   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))

hmm

> @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct virtnet_info 
> *vi, struct dim *dim)
>   net_dim_work_cancel(dim);
>  }
>  
> +static void virtnet_update_settings(struct virtnet_info *vi)
> +{
> + u32 speed;
> + u8 duplex;
> +
> + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> + return;
> +
> + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> +
> + if (ethtool_validate_speed(speed))
> + vi->speed = speed;
> +
> + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
> +
> + if (ethtool_validate_duplex(duplex))
> + vi->duplex = duplex;
> +}
> +

I already commented on this approach.  This is now invoked on each open,
lots of extra VM exits. No bueno, people are working hard to keep setup
overhead under control. Handle this in the config change interrupt -
your new infrastructure is perfect for this.
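
A minimal sketch of that suggestion (illustrative only; it reuses the shape of
the driver's existing config-change worker, so treat the exact body as an
assumption rather than the final patch):

static void virtnet_config_changed_work(struct work_struct *work)
{
        struct virtnet_info *vi =
                container_of(work, struct virtnet_info, config_work);
        u16 status;

        /* one config-space read per config-change event, not per open */
        if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
                                 struct virtio_net_config, status,
                                 &status) < 0)
                return;

        if (status & VIRTIO_NET_S_LINK_UP) {
                virtnet_update_settings(vi);
                netif_carrier_on(vi->dev);
        } else {
                netif_carrier_off(vi->dev);
        }
}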


>  static int virtnet_open(struct net_device *dev)
>  {
>   struct virtnet_info *vi = netdev_priv(dev);
> @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
>   goto err_enable_qp;
>   }
>  
> + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> + if (vi->status & VIRTIO_NET_S_LINK_UP)
> + netif_carrier_on(vi->dev);
> + virtio_config_driver_enable(vi->vdev);
> + } else {
> + vi->status = VIRTIO_NET_S_LINK_UP;
> + netif_carrier_on(dev);
> + virtnet_update_settings(vi);
> + }
> +
>   return 0;
>  
>  err_enable_qp:
> @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device *dev)
>   disable_delayed_refill(vi);
>   /* Make sure refill_work doesn't re-enable napi! */
>   cancel_delayed_work_sync(&vi->refill);
> + /* Make sure config notification doesn't schedule config work */

it's clear what this does even without a comment.
what you should comment on, and do not, is *why*.

> + virtio_config_driver_disable(vi->vdev);
> + /* Make sure status updating is cancelled */

same

also what "status updating"? confuses more than this clarifies.

> + cancel_work_sync(&vi->config_work);
>  
>   for (i = 0; i < vi->max_queue_pairs; i++) {
>   virtnet_disable_queue_pair(vi, i);
>   virtnet_cancel_dim(vi, &vi->rq[i].dim);
>   }
>  
> + netif_carrier_off(dev);
> +
>   return 0;
>  }
>  
> @@ -5085,25 +5121,6 @@ static void virtnet_init_settings(struct net_device 
> *dev)
>   vi->duplex = DUPLEX_UNKNOWN;
>  }
>  
> -static void virtnet_update_settings(struct virtnet_info *vi)
> -{
> - u32 speed;
> - u8 duplex;
> -
> - if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> - return;
> -
> - virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> -
> - if (ethtool_validate_speed(speed))
> - vi->speed = speed;
> -
> - virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
> -
> - if (ethtool_validate_duplex(duplex))
> - vi->duplex = duplex;
> -}
> -
>  static u32 virtnet_get_rxfh_key_size(struct net_device *dev)
>  {
>  

Re: [PATCH v4 1/2] rust: add static_key_false

2024-07-31 Thread Alice Ryhl
On Wed, Jul 31, 2024 at 7:05 PM Peter Zijlstra  wrote:
>
> On Fri, Jun 28, 2024 at 01:23:31PM +, Alice Ryhl wrote:
>
> >  rust/kernel/arch/arm64/jump_label.rs | 34 
> >  rust/kernel/arch/loongarch/jump_label.rs | 35 +
> >  rust/kernel/arch/mod.rs  | 24 
> >  rust/kernel/arch/riscv/jump_label.rs | 38 
> > 
> >  rust/kernel/arch/x86/jump_label.rs   | 35 +
> >  rust/kernel/lib.rs   |  2 ++
> >  rust/kernel/static_key.rs| 32 +++
> >  scripts/Makefile.build   |  2 +-
> >  8 files changed, 201 insertions(+), 1 deletion(-)
>
> So I really find the amount of duplicated asm offensive. Is is far too
> easy for any of this to get out of sync.
>
> > diff --git a/rust/kernel/arch/x86/jump_label.rs 
> > b/rust/kernel/arch/x86/jump_label.rs
> > new file mode 100644
> > index ..383bed273c50
> > --- /dev/null
> > +++ b/rust/kernel/arch/x86/jump_label.rs
> > @@ -0,0 +1,35 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +// Copyright (C) 2024 Google LLC.
> > +
> > +//! X86 Rust implementation of jump_label.h
> > +
> > +/// x86 implementation of arch_static_branch
> > +#[macro_export]
> > +#[cfg(target_arch = "x86_64")]
> > +macro_rules! arch_static_branch {
> > +($key:path, $keytyp:ty, $field:ident, $branch:expr) => {'my_label: {
> > +core::arch::asm!(
> > +r#"
> > +1: .byte 0x0f,0x1f,0x44,0x00,0x00
> > +
> > +.pushsection __jump_table,  "aw"
> > +.balign 8
> > +.long 1b - .
> > +.long {0} - .
> > +.quad {1} + {2} + {3} - .
> > +.popsection
> > +"#,
> > +label {
> > +break 'my_label true;
> > +},
> > +sym $key,
> > +const ::core::mem::offset_of!($keytyp, $field),
> > +const $crate::arch::bool_to_int($branch),
> > +);
> > +
> > +break 'my_label false;
> > +}};
> > +}
>
> Note that this uses the forced 5 byte version, and not the dynamic sized
> one. On top of that it hard-codes the nop5 string :/
>
> Please work harder to not have to duplicate stuff like this.

I really didn't want to duplicate it, but it's very hard to find a
performant alternative. Is there any way we could accept duplication
only in the cases where an 'i' parameter is used? I don't have the
choice of using a Rust helper for 'i' parameters.

Perhaps one option could be to put the Rust code inside jump_label.h
and have the header file evaluate to either C or Rust depending on the
value of some #ifdefs?

#ifndef RUST_ASM
/* existing C code goes here */
#endif
#ifdef RUST_ASM
// rust code goes here
#endif

That way the duplication is all in a single file. It would also avoid
the need for duplicating the nop5 string, as the Rust case is still
going through the C preprocessor and can use the existing #define.
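
A layout sketch of that idea (RUST_ASM is a hypothetical guard; BYTES_NOP5 is
the existing x86 define the Rust side could reuse):

/* jump_label.h -- layout sketch only */
#ifndef RUST_ASM
/* existing C asm-goto implementation, emitting the nop via ASM_NOP5 */
#else
/* Rust macro_rules! body goes here; since it still passes through the
 * C preprocessor, it can reuse BYTES_NOP5 rather than hard-coding the
 * 0x0f,0x1f,0x44,0x00,0x00 byte string. */
#endif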

I'm also open to other alternatives. But I don't have infinite
resources to drive major language changes.

Alice



[PATCH 0/8] uprobes: RCU-protected hot path optimizations

2024-07-31 Thread Andrii Nakryiko
This patch set is heavily inspired by Peter Zijlstra's uprobe optimization
patches ([0]) and continues that work, albeit trying to keep complexity to a
minimum and attempting to reuse existing primitives as much as possible. The
goal here is to optimize the obvious uprobe-triggering hot path, while
keeping the rest of the locking mostly intact.

I've reused the rb_find_rcu()/rb_find_add_rcu() patches as-is, and the "split
uprobe_unregister()" one is mostly intact, but I've added
uprobe_unregister_sync() into the error handling code path inside
uprobe_unregister(). This is due to recent refactorings from Oleg Nesterov
([1]), which necessitate this addition. I'm not sure I got the
Co-Developed-by/SOB pieces right, for which I apologize in advance.

Except for the refcounting change patch (which I strongly believe is a good
improvement we should do, forgetting about the quasi-refcounting scheme of
the uprobe->consumers list), the rest of the changes are similar to Peter's
initial changes in [0].

Main differences would be:
  - no special RCU protection for mmap and fork handling; we just stick to
    refcounts there, as those are infrequent and not performance-sensitive
    code paths, while being complex and thus benefiting from proper locking;
  - the above means we don't need to do any custom SRCU additions to handle
    the forking code path;
  - I handled the UPROBE_HANDLER_REMOVE problem in handler_chain()
    differently, again leveraging the existing locking scheme;
  - I kept refcount usage for uretprobes and single-stepping uprobes; I plan
    to address that in separate follow-up patches. The plan is to avoid
    task_work, but I need to sit down to write and test the code.
  - finally, I was dutifully using SRCU throughout all the changes, and only
    the last patch switches SRCU to RCU Tasks Trace and demonstrates
    significant performance and scalability gains from this.

The changes in this patch set were tested using BPF selftests and the
uprobe-stress ([2]) tool. One recent BPF selftest (uprobe_multi/consumers),
added only recently by Jiri Olsa, will need a single-line adjustment to the
counting logic, but the patch itself is in bpf-next/master, so we'll have to
address that once linux-trace or tip and bpf-next trees merge. I'll take care
of that when this happens.

Now, for the benchmarking results. I've used the following script (which
utilizes the BPF selftests-based bench tool). The CPU used was an 80-core
Intel Xeon Gold 6138 CPU @ 2.00GHz running a kernel with a production-like
config. I minimized background noise by stopping any service I could
identify and stop, so the results are pretty stable and the variability is
pretty small, overall.

Benchmark script:

#!/bin/bash

set -eufo pipefail

for i in uprobe-nop uretprobe-nop; do
for p in 1 2 4 8 16 32 64; do
summary=$(sudo ./bench -w3 -d5 -p$p -a trig-$i | tail -n1)
total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-)
percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1)
printf "%-15s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu"
done
echo
done

With all the lock-avoiding changes done in this patch set, we get a pretty
decent improvement in performance and scalability of uprobes with the number
of CPUs, even though we are still nowhere near linear scalability. This is
due to the remaining mmap_lock, which is currently taken to resolve the
interrupt address to inode+offset and then to the uprobe instance. And, of
course, uretprobes still need similar RCU to avoid refcounting in the hot
path, which will be addressed in follow-up patches.

BASELINE (on top of Oleg's clean up patches)

uprobe-nop  ( 1 cpus):3.032 ± 0.023M/s  (  3.032M/s/cpu)
uprobe-nop  ( 2 cpus):3.452 ± 0.005M/s  (  1.726M/s/cpu)
uprobe-nop  ( 4 cpus):3.663 ± 0.005M/s  (  0.916M/s/cpu)
uprobe-nop  ( 8 cpus):3.718 ± 0.038M/s  (  0.465M/s/cpu)
uprobe-nop  (16 cpus):3.344 ± 0.008M/s  (  0.209M/s/cpu)
uprobe-nop  (32 cpus):2.288 ± 0.021M/s  (  0.071M/s/cpu)
uprobe-nop  (64 cpus):3.205 ± 0.004M/s  (  0.050M/s/cpu)

uretprobe-nop   ( 1 cpus):1.979 ± 0.005M/s  (  1.979M/s/cpu)
uretprobe-nop   ( 2 cpus):2.361 ± 0.005M/s  (  1.180M/s/cpu)
uretprobe-nop   ( 4 cpus):2.309 ± 0.002M/s  (  0.577M/s/cpu)
uretprobe-nop   ( 8 cpus):2.253 ± 0.001M/s  (  0.282M/s/cpu)
uretprobe-nop   (16 cpus):2.007 ± 0.000M/s  (  0.125M/s/cpu)
uretprobe-nop   (32 cpus):1.624 ± 0.003M/s  (  0.051M/s/cpu)
uretprobe-nop   (64 cpus):2.149 ± 0.001M/s  (  0.034M/s/cpu)

Up to second-to-last patch (i.e., SRCU-based optimizations)
===
uprobe-nop  ( 1 cpus):3.276 ± 0.005M/s  (  3.276M/s/cpu)
uprobe-nop  ( 2 cpus):4.125 ± 0.002M/s  (  2.063M/s/cpu)
uprobe-nop  ( 4 cpus):7.713 ± 0.002M/s  (  1.928M/s/cpu)
uprobe-nop  ( 8 cpus):8.097 ± 0.006M/s  (  1.012M/s/cpu)
uprobe-nop  (16 cpus):6.501 ± 0.056M/s  (  0.406M/s/cpu)

[PATCH 1/8] rbtree: provide rb_find_rcu() / rb_find_add_rcu()

2024-07-31 Thread Andrii Nakryiko
From: Peter Zijlstra 

Much like latch_tree, add two RCU methods for the regular RB-tree,
which can be used in conjunction with a seqcount to provide lockless
lookups.

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Masami Hiramatsu (Google) 
---
 include/linux/rbtree.h | 67 ++
 1 file changed, 67 insertions(+)

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index f7edca369eda..7c173aa64e1e 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -244,6 +244,42 @@ rb_find_add(struct rb_node *node, struct rb_root *tree,
return NULL;
 }
 
+/**
+ * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
+ * @node: node to look-for / insert
+ * @tree: tree to search / modify
+ * @cmp: operator defining the node order
+ *
+ * Adds a Store-Release for link_node.
+ *
+ * Returns the rb_node matching @node, or NULL when no match is found and @node
+ * is inserted.
+ */
+static __always_inline struct rb_node *
+rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
+   int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+   struct rb_node **link = &tree->rb_node;
+   struct rb_node *parent = NULL;
+   int c;
+
+   while (*link) {
+   parent = *link;
+   c = cmp(node, parent);
+
+   if (c < 0)
+   link = &parent->rb_left;
+   else if (c > 0)
+   link = &parent->rb_right;
+   else
+   return parent;
+   }
+
+   rb_link_node_rcu(node, parent, link);
+   rb_insert_color(node, tree);
+   return NULL;
+}
+
 /**
  * rb_find() - find @key in tree @tree
  * @key: key to match
@@ -272,6 +308,37 @@ rb_find(const void *key, const struct rb_root *tree,
return NULL;
 }
 
+/**
+ * rb_find_rcu() - find @key in tree @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining the node order
+ *
+ * Notably, tree descent vs concurrent tree rotations is unsound and can result
+ * in false-negatives.
+ *
+ * Returns the rb_node matching @key or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find_rcu(const void *key, const struct rb_root *tree,
+   int (*cmp)(const void *key, const struct rb_node *))
+{
+   struct rb_node *node = tree->rb_node;
+
+   while (node) {
+   int c = cmp(key, node);
+
+   if (c < 0)
+   node = rcu_dereference_raw(node->rb_left);
+   else if (c > 0)
+   node = rcu_dereference_raw(node->rb_right);
+   else
+   return node;
+   }
+
+   return NULL;
+}
+
 /**
  * rb_find_first() - find the first @key in @tree
  * @key: key to match
-- 
2.43.0




[PATCH 2/8] uprobes: revamp uprobe refcounting and lifetime management

2024-07-31 Thread Andrii Nakryiko
Revamp how struct uprobe is refcounted, and thus how its lifetime is
managed.

Right now, there are a few possible "owners" of the uprobe refcount:
  - uprobes_tree RB tree assumes one refcount when the uprobe is registered
    and added to the lookup tree;
  - while a uprobe is triggered and the kernel is handling it in the
    breakpoint handler code, a temporary refcount bump is done to keep the
    uprobe from being freed;
  - if a uretprobe is requested on a given struct uprobe instance, we
    take another refcount to keep the uprobe alive until user space code
    returns from the function and triggers the return handler.

The uprobes_tree's extra refcount of 1 is confusing and problematic. No
matter how many actual consumers are attached, they all share the same
refcount, and we have extra logic to drop the "last" (which might not
really be the last) refcount once the uprobe's consumer list becomes empty.

This is unconventional and has to be kept in mind as a special case all
the time. Further, because of this design we have situations where
find_uprobe() will find the uprobe, bump its refcount, and return it to the
caller, but that uprobe will still need a uprobe_is_active() check, after
which the caller is required to drop the refcount and try again. This is
just too many details leaking into the higher-level logic.

This patch changes the refcounting scheme so that uprobes_tree no longer
keeps an extra refcount for struct uprobe. Instead, each uprobe_consumer
assumes its own refcount, which is dropped when the consumer is
unregistered. Other than that, all the active users of a uprobe (entry and
return uprobe handling code) keep exactly the same refcounting approach.

With the above setup, once a uprobe's refcount drops to zero, we need to
make sure that the uprobe's "destructor" removes it from uprobes_tree,
of course. This, though, races with the uprobe entry handling code in
handle_swbp(), which, through the find_active_uprobe()->find_uprobe()
lookup, can race with the uprobe being destroyed after its refcount drops
to zero (e.g., due to a uprobe_consumer unregistering). So we add
try_get_uprobe(), which will attempt to bump the refcount unless it is
already zero. The caller needs to guarantee that the uprobe instance won't
be freed in parallel, which is the case while we hold uprobes_treelock
(for read or write, it doesn't matter).
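
A sketch of the resulting lookup (this mirrors the __find_uprobe() shape
introduced by this patch, with try_get_uprobe() doing the conditional bump):

static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
        struct __uprobe_key key = {
                .inode = inode,
                .offset = offset,
        };
        struct uprobe *uprobe = NULL;
        struct rb_node *node;

        read_lock(&uprobes_treelock);
        node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
        if (node)
                /* the tree lock keeps the node alive while we attempt the
                 * bump; NULL here means the uprobe is already dying */
                uprobe = try_get_uprobe(__node_2_uprobe(node));
        read_unlock(&uprobes_treelock);

        return uprobe;
}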

Note also that we no longer leak the race between registration and
unregistration, so we remove the retry logic completely. If
find_uprobe() returns a valid uprobe, it's guaranteed to remain in
uprobes_tree with a properly incremented refcount. The race is handled
by __insert_uprobe() and put_uprobe() working together:
__insert_uprobe() will remove the existing uprobe from the RB-tree if it
can't bump its refcount, and will retry inserting the new uprobe instance;
put_uprobe() won't attempt to remove the uprobe from the RB-tree if it's
already not there. All of that is protected by uprobes_treelock, which
keeps things simple.

Signed-off-by: Andrii Nakryiko 
---
 kernel/events/uprobes.c | 163 +++-
 1 file changed, 93 insertions(+), 70 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f88b7ff20587..23dde3ec5b09 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -587,25 +587,51 @@ set_orig_insn(struct arch_uprobe *auprobe, struct 
mm_struct *mm, unsigned long v
*(uprobe_opcode_t *)&auprobe->insn);
 }
 
+/* uprobe should have guaranteed positive refcount */
 static struct uprobe *get_uprobe(struct uprobe *uprobe)
 {
refcount_inc(&uprobe->ref);
return uprobe;
 }
 
+/*
+ * uprobe should have guaranteed lifetime, which can be either of:
+ *   - caller already has refcount taken (and wants an extra one);
+ *   - uprobe is RCU protected and won't be freed until after grace period;
+ *   - we are holding uprobes_treelock (for read or write, doesn't matter).
+ */
+static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
+{
+   if (refcount_inc_not_zero(&uprobe->ref))
+   return uprobe;
+   return NULL;
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+   return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+
 static void put_uprobe(struct uprobe *uprobe)
 {
-   if (refcount_dec_and_test(&uprobe->ref)) {
-   /*
-* If application munmap(exec_vma) before uprobe_unregister()
-* gets called, we don't get a chance to remove uprobe from
-* delayed_uprobe_list from remove_breakpoint(). Do it here.
-*/
-   mutex_lock(&delayed_uprobe_lock);
-   delayed_uprobe_remove(uprobe, NULL);
-   mutex_unlock(&delayed_uprobe_lock);
-   kfree(uprobe);
-   }
+   if (!refcount_dec_and_test(&uprobe->ref))
+   return;
+
+   write_lock(&uprobes_treelock);
+
+   if (uprobe_is_active(uprobe))
+   rb_erase(&uprobe->rb_node, &uprobes_tree);
+
write_unlock(&uprobes_treelock);

[PATCH 3/8] uprobes: protected uprobe lifetime with SRCU

2024-07-31 Thread Andrii Nakryiko
To avoid unnecessarily taking a (brief) refcount on the uprobe during
breakpoint handling in handle_swbp() for entry uprobes, make find_uprobe()
not take a refcount, and instead protect the lifetime of a uprobe instance
with RCU. This improves scalability, as refcounting gets quite expensive
due to cache line bouncing between multiple CPUs.

Specifically, we utilize our own uprobe-specific SRCU instance for this
RCU protection. put_uprobe() will delay actual kfree() using call_srcu().

For now, uretprobe and single-stepping handling will still acquire a
refcount as necessary. We'll address these issues in follow-up patches
by making them use SRCU with timeout.
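
The entry path then looks roughly like this (a simplified sketch of the
handle_swbp() change in this patch):

        int srcu_idx = srcu_read_lock(&uprobes_srcu);

        uprobe = find_uprobe_rcu(inode, offset);   /* no refcount taken */
        if (uprobe)
                handler_chain(uprobe, regs);       /* can't be kfree'd under SRCU */

        srcu_read_unlock(&uprobes_srcu, srcu_idx);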

Signed-off-by: Andrii Nakryiko 
---
 kernel/events/uprobes.c | 93 -
 1 file changed, 55 insertions(+), 38 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 23dde3ec5b09..6d5c3f4b210f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -41,6 +41,8 @@ static struct rb_root uprobes_tree = RB_ROOT;
 
 static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */
 
+DEFINE_STATIC_SRCU(uprobes_srcu);
+
 #define UPROBES_HASH_SZ13
 /* serialize uprobe->pending_list */
 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
@@ -59,6 +61,7 @@ struct uprobe {
struct list_headpending_list;
struct uprobe_consumer  *consumers;
struct inode*inode; /* Also hold a ref to inode */
+   struct rcu_head rcu;
loff_t  offset;
loff_t  ref_ctr_offset;
unsigned long   flags;
@@ -612,6 +615,13 @@ static inline bool uprobe_is_active(struct uprobe *uprobe)
return !RB_EMPTY_NODE(&uprobe->rb_node);
 }
 
+static void uprobe_free_rcu(struct rcu_head *rcu)
+{
+   struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+   kfree(uprobe);
+}
+
 static void put_uprobe(struct uprobe *uprobe)
 {
if (!refcount_dec_and_test(&uprobe->ref))
@@ -632,6 +642,8 @@ static void put_uprobe(struct uprobe *uprobe)
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
+
+   call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
 }
 
 static __always_inline
@@ -673,33 +685,25 @@ static inline int __uprobe_cmp(struct rb_node *a, const 
struct rb_node *b)
return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
 }
 
-static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+/*
+ * Assumes being inside RCU protected region.
+ * No refcount is taken on returned uprobe.
+ */
+static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
 {
struct __uprobe_key key = {
.inode = inode,
.offset = offset,
};
-   struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
-
-   if (node)
-   return try_get_uprobe(__node_2_uprobe(node));
+   struct rb_node *node;
 
-   return NULL;
-}
-
-/*
- * Find a uprobe corresponding to a given inode:offset
- * Acquires uprobes_treelock
- */
-static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
-{
-   struct uprobe *uprobe;
+   lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
 
read_lock(&uprobes_treelock);
-   uprobe = __find_uprobe(inode, offset);
+   node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
read_unlock(&uprobes_treelock);
 
-   return uprobe;
+   return node ? __node_2_uprobe(node) : NULL;
 }
 
 /*
@@ -1073,10 +1077,10 @@ register_for_each_vma(struct uprobe *uprobe, struct 
uprobe_consumer *new)
goto free;
/*
 * We take mmap_lock for writing to avoid the race with
-* find_active_uprobe() which takes mmap_lock for reading.
+* find_active_uprobe_rcu() which takes mmap_lock for reading.
 * Thus this install_breakpoint() can not make
-* is_trap_at_addr() true right after find_uprobe()
-* returns NULL in find_active_uprobe().
+* is_trap_at_addr() true right after find_uprobe_rcu()
+* returns NULL in find_active_uprobe_rcu().
 */
mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
@@ -1885,9 +1889,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
struct pt_regs *regs)
return;
}
 
+   /* we need to bump refcount to store uprobe in utask */
+   if (!try_get_uprobe(uprobe))
+   return;
+
ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
-   return;
+   goto fail;
 
trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, 
regs);
@@ -1914,11 +1922,7 @@ sta

[PATCH 4/8] uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks

2024-07-31 Thread Andrii Nakryiko
It serves no purpose beyond adding an unnecessary argument that is passed to
the filter callback. Just get rid of it; no one is actually using it.

Signed-off-by: Andrii Nakryiko 
---
 include/linux/uprobes.h | 10 +-
 kernel/events/uprobes.c | 18 +++---
 kernel/trace/bpf_trace.c|  3 +--
 kernel/trace/trace_uprobe.c |  9 +++--
 4 files changed, 12 insertions(+), 28 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 137ddfc0b2f8..8d5bbad2048c 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -28,20 +28,12 @@ struct page;
 
 #define MAX_URETPROBE_DEPTH64
 
-enum uprobe_filter_ctx {
-   UPROBE_FILTER_REGISTER,
-   UPROBE_FILTER_UNREGISTER,
-   UPROBE_FILTER_MMAP,
-};
-
 struct uprobe_consumer {
int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
int (*ret_handler)(struct uprobe_consumer *self,
unsigned long func,
struct pt_regs *regs);
-   bool (*filter)(struct uprobe_consumer *self,
-   enum uprobe_filter_ctx ctx,
-   struct mm_struct *mm);
+   bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
struct uprobe_consumer *next;
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6d5c3f4b210f..71a8886608b1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -913,21 +913,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct 
file *file,
return ret;
 }
 
-static inline bool consumer_filter(struct uprobe_consumer *uc,
-  enum uprobe_filter_ctx ctx, struct mm_struct 
*mm)
+static inline bool consumer_filter(struct uprobe_consumer *uc, struct 
mm_struct *mm)
 {
-   return !uc->filter || uc->filter(uc, ctx, mm);
+   return !uc->filter || uc->filter(uc, mm);
 }
 
-static bool filter_chain(struct uprobe *uprobe,
-enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
 {
struct uprobe_consumer *uc;
bool ret = false;
 
down_read(&uprobe->consumer_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
-   ret = consumer_filter(uc, ctx, mm);
+   ret = consumer_filter(uc, mm);
if (ret)
break;
}
@@ -1094,12 +1092,10 @@ register_for_each_vma(struct uprobe *uprobe, struct 
uprobe_consumer *new)
 
if (is_register) {
/* consult only the "caller", new consumer. */
-   if (consumer_filter(new,
-   UPROBE_FILTER_REGISTER, mm))
+   if (consumer_filter(new, mm))
err = install_breakpoint(uprobe, mm, vma, 
info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
-   if (!filter_chain(uprobe,
-   UPROBE_FILTER_UNREGISTER, mm))
+   if (!filter_chain(uprobe, mm))
err |= remove_breakpoint(uprobe, mm, 
info->vaddr);
}
 
@@ -1383,7 +1379,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
 */
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
-   filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+   filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, 
uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4e391daafa64..73c570b5988b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3320,8 +3320,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
 }
 
 static bool
-uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx 
ctx,
-struct mm_struct *mm)
+uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
 {
struct bpf_uprobe *uprobe;
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 52e76a73fa7c..7eb79e0a5352 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1078,9 +1078,7 @@ print_uprobe_event(struct trace_iterator *iter, int 
flags, struct trace_event *e
return trace_handle_return(s);
 }
 
-typedef bool (*filter_func_t)(struct uprobe_consumer *self,
-   enum uprobe_filter_ctx ctx,
-   struct mm_struct *mm);
+typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct 
*mm);
 
 static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
 {
@

[PATCH 5/8] uprobes: traverse uprobe's consumer list locklessly under SRCU protection

2024-07-31 Thread Andrii Nakryiko
uprobe->register_rwsem is one of a few big bottlenecks to scalability of
uprobes, so we need to get rid of it to improve uprobe performance and
multi-CPU scalability.

First, we turn the uprobe's consumer list into a typical doubly-linked list
and utilize the existing RCU-aware helpers for traversing such lists, as
well as for adding and removing elements.

For entry uprobes we already have SRCU protection active since before
uprobe lookup. For uretprobes we keep the refcount, guaranteeing that the
uprobe won't go away from under us, but we add SRCU protection around the
consumer list traversal.

Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
we remember whether any removal was requested during the handler calls, but
then double-check the decision under a proper register_rwsem using the
consumers' filter callbacks. Handler removal is very rare, so this extra
lock won't hurt performance overall, and we also avoid the need for any
extra protection (e.g., seqcount locks).
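
A sketch of that handler_chain() logic (the relevant hunk is truncated in
this archive, so the body below is illustrative; filter_chain() and
unapply_uprobe() are the existing helpers):

        struct uprobe_consumer *uc;
        bool remove = false;

        list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
                                 srcu_read_lock_held(&uprobes_srcu)) {
                if (uc->handler &&
                    uc->handler(uc, regs) == UPROBE_HANDLER_REMOVE)
                        remove = true;
        }

        if (unlikely(remove)) {
                /* removal is rare: double-check under register_rwsem using
                 * the consumers' filter callbacks before unapplying */
                down_write(&uprobe->register_rwsem);
                if (!filter_chain(uprobe, current->mm))
                        unapply_uprobe(uprobe, current->mm);
                up_write(&uprobe->register_rwsem);
        }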

Signed-off-by: Andrii Nakryiko 
---
 include/linux/uprobes.h |  2 +-
 kernel/events/uprobes.c | 97 -
 2 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 8d5bbad2048c..a1686c1ebcb6 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -35,7 +35,7 @@ struct uprobe_consumer {
struct pt_regs *regs);
bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
-   struct uprobe_consumer *next;
+   struct list_head cons_node;
 };
 
 #ifdef CONFIG_UPROBES
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 71a8886608b1..3b42fd355256 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -59,7 +59,7 @@ struct uprobe {
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_headpending_list;
-   struct uprobe_consumer  *consumers;
+   struct list_headconsumers;
struct inode*inode; /* Also hold a ref to inode */
struct rcu_head rcu;
loff_t  offset;
@@ -778,6 +778,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, 
loff_t offset,
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
+   INIT_LIST_HEAD(&uprobe->consumers);
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
RB_CLEAR_NODE(&uprobe->rb_node);
@@ -803,34 +804,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, 
loff_t offset,
 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
down_write(&uprobe->consumer_rwsem);
-   uc->next = uprobe->consumers;
-   uprobe->consumers = uc;
+   list_add_rcu(&uc->cons_node, &uprobe->consumers);
up_write(&uprobe->consumer_rwsem);
 }
 
-/*
- * For uprobe @uprobe, delete the consumer @uc.
- * Return true if the @uc is deleted successfully
- * or return false.
- */
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
-{
-   struct uprobe_consumer **con;
-   bool ret = false;
-
-   down_write(&uprobe->consumer_rwsem);
-   for (con = &uprobe->consumers; *con; con = &(*con)->next) {
-   if (*con == uc) {
-   *con = uc->next;
-   ret = true;
-   break;
-   }
-   }
-   up_write(&uprobe->consumer_rwsem);
-
-   return ret;
-}
-
 static int __copy_insn(struct address_space *mapping, struct file *filp,
void *insn, int nbytes, loff_t offset)
 {
@@ -924,7 +901,8 @@ static bool filter_chain(struct uprobe *uprobe, struct 
mm_struct *mm)
bool ret = false;
 
down_read(&uprobe->consumer_rwsem);
-   for (uc = uprobe->consumers; uc; uc = uc->next) {
+   list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+srcu_read_lock_held(&uprobes_srcu)) {
ret = consumer_filter(uc, mm);
if (ret)
break;
@@ -1120,17 +1098,19 @@ void uprobe_unregister(struct uprobe *uprobe, struct 
uprobe_consumer *uc)
int err;
 
down_write(&uprobe->register_rwsem);
-   if (WARN_ON(!consumer_del(uprobe, uc))) {
-   err = -ENOENT;
-   } else {
-   err = register_for_each_vma(uprobe, NULL);
-   /* TODO : cant unregister? schedule a worker thread */
-   WARN(err, "leaking uprobe due to failed unregistration");
-   }
+
+   list_del_rcu(&uc->cons_node);
+   err = register_for_each_vma(uprobe, NULL);
+
up_write(&uprobe->register_rwsem);
 
-   if (!err)
-   put_uprobe(uprobe);
+   /* TODO : cant unregister? schedule a worker thread */
+   if (WARN(err, "leaking uprobe due to failed unregistration"))

[PATCH 6/8] perf/uprobe: split uprobe_unregister()

2024-07-31 Thread Andrii Nakryiko
From: Peter Zijlstra 

With uprobe_unregister() having grown a synchronize_srcu(), it becomes
fairly slow to call. Esp. since both users of this API call it in a
loop.

Peel off the sync_srcu() and do it once, after the loop.

With recent uprobe_register() error handling reusing the full
uprobe_unregister() call, we need to be careful not to return to the
caller before we have a guarantee that a partially attached consumer won't
be called anymore. So add uprobe_unregister_sync() in the error handling
path. This is an unlikely, slow path, and it should be totally fine for it
to be slow in the case of a failed attach.

Signed-off-by: Peter Zijlstra (Intel) 
Co-developed-by: Andrii Nakryiko 
Signed-off-by: Andrii Nakryiko 
---
 include/linux/uprobes.h|  8 ++--
 kernel/events/uprobes.c| 18 ++
 kernel/trace/bpf_trace.c   |  5 -
 kernel/trace/trace_uprobe.c|  6 +-
 .../selftests/bpf/bpf_testmod/bpf_testmod.c|  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index a1686c1ebcb6..8f1999eb9d9f 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -105,7 +105,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs 
*regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct 
*mm, unsigned long vaddr, uprobe_opcode_t);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, 
loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, 
bool);
-extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer 
*uc);
+extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct 
uprobe_consumer *uc);
+extern void uprobe_unregister_sync(void);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, 
unsigned long end);
 extern void uprobe_start_dup_mmap(void);
@@ -154,7 +155,10 @@ uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer 
*uc, bool add)
return -ENOSYS;
 }
 static inline void
-uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+}
+static inline void uprobe_unregister_sync(void)
 {
 }
 static inline int uprobe_mmap(struct vm_area_struct *vma)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 3b42fd355256..b0488d356399 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1089,11 +1089,11 @@ register_for_each_vma(struct uprobe *uprobe, struct 
uprobe_consumer *new)
 }
 
 /**
- * uprobe_unregister - unregister an already registered probe.
+ * uprobe_unregister_nosync - unregister an already registered probe.
  * @uprobe: uprobe to remove
  * @uc: identify which probe if multiple probes are colocated.
  */
-void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer 
*uc)
 {
int err;
 
@@ -1109,10 +1109,14 @@ void uprobe_unregister(struct uprobe *uprobe, struct 
uprobe_consumer *uc)
return;
 
put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
 
+void uprobe_unregister_sync(void)
+{
synchronize_srcu(&uprobes_srcu);
 }
-EXPORT_SYMBOL_GPL(uprobe_unregister);
+EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
 
 /**
  * uprobe_register - register a probe
@@ -1170,7 +1174,13 @@ struct uprobe *uprobe_register(struct inode *inode,
up_write(&uprobe->register_rwsem);
 
if (ret) {
-   uprobe_unregister(uprobe, uc);
+   uprobe_unregister_nosync(uprobe, uc);
+   /*
+* Registration might have partially succeeded, so we can have
+* this consumer being called right at this time. We need to
+* sync here. It's ok, it's unlikely slow path.
+*/
+   uprobe_unregister_sync();
return ERR_PTR(ret);
}
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 73c570b5988b..6b632710c98e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3184,7 +3184,10 @@ static void bpf_uprobe_unregister(struct bpf_uprobe 
*uprobes, u32 cnt)
u32 i;
 
for (i = 0; i < cnt; i++)
-   uprobe_unregister(uprobes[i].uprobe, &uprobes[i].consumer);
+   uprobe_unregister_nosync(uprobes[i].uprobe, 
&uprobes[i].consumer);
+
+   if (cnt)
+   uprobe_unregister_sync();
 }
 
 static void bpf_uprobe_multi_link_release(struct bpf_link *link)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7eb79e0a5352..f7443e996b1b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1097,6 +1097,7 @@ static in

[PATCH 7/8] uprobes: perform lockless SRCU-protected uprobes_tree lookup

2024-07-31 Thread Andrii Nakryiko
Another big bottleneck to scalability is uprobes_treelock, which is taken in
a very hot path in handle_swbp(). Now that uprobes are SRCU-protected,
take advantage of that and make the uprobes_tree RB-tree lookup lockless.

To make the RCU-protected lockless RB-tree lookup correct, we need to take
into account that such a lookup can return false negatives if there
are parallel RB-tree modifications (rotations) going on. We use a seqcount
lock to detect whether the RB-tree changed, and if we find nothing while
the RB-tree got modified in between, we just retry. If the uprobe was found,
then it's guaranteed to be a correct lookup.
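
In code, the described lookup becomes roughly the following (the matching
hunk is cut off at the end of this message, so this is a sketch of
find_uprobe_rcu() under the seqcount):

        do {
                seq = read_seqcount_begin(&uprobes_seqcount);
                node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
                /* a hit is always valid; only a miss can be a false
                 * negative caused by a concurrent rotation, so retry
                 * if the tree changed in between */
                if (node)
                        return __node_2_uprobe(node);
        } while (read_seqcount_retry(&uprobes_seqcount, seq));

        return NULL;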

With all the lock-avoiding changes done, we get a pretty decent
improvement in performance and scalability of uprobes with the number of
CPUs, even though we are still nowhere near linear scalability. This is
due to SRCU not really scaling very well with the number of CPUs on
the particular hardware used for testing (80-core Intel Xeon Gold
6138 CPU @ 2.00GHz), but also due to the remaining mmap_lock, which is
currently taken to resolve the interrupt address to inode+offset and then
to the uprobe instance. And, of course, uretprobes still need similar RCU
to avoid refcounting in the hot path, which will be addressed in follow-up
patches.

Nevertheless, the improvement is good. We used BPF selftest-based
uprobe-nop and uretprobe-nop benchmarks to get the below numbers,
varying number of CPUs on which uprobes and uretprobes are triggered.

BASELINE

uprobe-nop  ( 1 cpus):3.032 ± 0.023M/s  (  3.032M/s/cpu)
uprobe-nop  ( 2 cpus):3.452 ± 0.005M/s  (  1.726M/s/cpu)
uprobe-nop  ( 4 cpus):3.663 ± 0.005M/s  (  0.916M/s/cpu)
uprobe-nop  ( 8 cpus):3.718 ± 0.038M/s  (  0.465M/s/cpu)
uprobe-nop  (16 cpus):3.344 ± 0.008M/s  (  0.209M/s/cpu)
uprobe-nop  (32 cpus):2.288 ± 0.021M/s  (  0.071M/s/cpu)
uprobe-nop  (64 cpus):3.205 ± 0.004M/s  (  0.050M/s/cpu)

uretprobe-nop   ( 1 cpus):1.979 ± 0.005M/s  (  1.979M/s/cpu)
uretprobe-nop   ( 2 cpus):2.361 ± 0.005M/s  (  1.180M/s/cpu)
uretprobe-nop   ( 4 cpus):2.309 ± 0.002M/s  (  0.577M/s/cpu)
uretprobe-nop   ( 8 cpus):2.253 ± 0.001M/s  (  0.282M/s/cpu)
uretprobe-nop   (16 cpus):2.007 ± 0.000M/s  (  0.125M/s/cpu)
uretprobe-nop   (32 cpus):1.624 ± 0.003M/s  (  0.051M/s/cpu)
uretprobe-nop   (64 cpus):2.149 ± 0.001M/s  (  0.034M/s/cpu)

SRCU CHANGES

uprobe-nop  ( 1 cpus):3.276 ± 0.005M/s  (  3.276M/s/cpu)
uprobe-nop  ( 2 cpus):4.125 ± 0.002M/s  (  2.063M/s/cpu)
uprobe-nop  ( 4 cpus):7.713 ± 0.002M/s  (  1.928M/s/cpu)
uprobe-nop  ( 8 cpus):8.097 ± 0.006M/s  (  1.012M/s/cpu)
uprobe-nop  (16 cpus):6.501 ± 0.056M/s  (  0.406M/s/cpu)
uprobe-nop  (32 cpus):4.398 ± 0.084M/s  (  0.137M/s/cpu)
uprobe-nop  (64 cpus):6.452 ± 0.000M/s  (  0.101M/s/cpu)

uretprobe-nop   ( 1 cpus):2.055 ± 0.001M/s  (  2.055M/s/cpu)
uretprobe-nop   ( 2 cpus):2.677 ± 0.000M/s  (  1.339M/s/cpu)
uretprobe-nop   ( 4 cpus):4.561 ± 0.003M/s  (  1.140M/s/cpu)
uretprobe-nop   ( 8 cpus):5.291 ± 0.002M/s  (  0.661M/s/cpu)
uretprobe-nop   (16 cpus):5.065 ± 0.019M/s  (  0.317M/s/cpu)
uretprobe-nop   (32 cpus):3.622 ± 0.003M/s  (  0.113M/s/cpu)
uretprobe-nop   (64 cpus):3.723 ± 0.002M/s  (  0.058M/s/cpu)

Peak throughput increased from 3.7 mln/s (uprobe triggerings) up to about
8 mln/s. For uretprobes it's a bit more modest, with a bump from 2.4 mln/s
to 5 mln/s.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Andrii Nakryiko 
---
 kernel/events/uprobes.c | 30 --
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b0488d356399..d03962cc96de 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -40,6 +40,7 @@ static struct rb_root uprobes_tree = RB_ROOT;
 #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
 
 static DEFINE_RWLOCK(uprobes_treelock);/* serialize rbtree access */
+static seqcount_rwlock_t uprobes_seqcount = 
SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
 
 DEFINE_STATIC_SRCU(uprobes_srcu);
 
@@ -629,8 +630,11 @@ static void put_uprobe(struct uprobe *uprobe)
 
write_lock(&uprobes_treelock);
 
-   if (uprobe_is_active(uprobe))
+   if (uprobe_is_active(uprobe)) {
+   write_seqcount_begin(&uprobes_seqcount);
rb_erase(&uprobe->rb_node, &uprobes_tree);
+   write_seqcount_end(&uprobes_seqcount);
+   }
 
write_unlock(&uprobes_treelock);
 
@@ -696,14 +700,26 @@ static struct uprobe *find_uprobe_rcu(struct inode 
*inode, loff_t offset)
.offset = offset,
};
struct rb_node *node;
+   unsigned int seq;
 
lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
 
-   read_lock(&uprobes_treelock);
-   node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
-   read_unlock(&uprobes_treelock);
+  

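The diff above is truncated in this archive. For reference, here is a
minimal sketch of the lockless lookup the seqcount enables, reconstructed
from the visible hunks; rb_find_rcu(), struct __uprobe_key and
__node_2_uprobe() are assumed helpers, so treat this as an illustration
rather than the exact patch body:

static struct uprobe *find_uprobe_rcu_sketch(struct inode *inode, loff_t offset)
{
	struct __uprobe_key key = {
		.inode = inode,
		.offset = offset,
	};
	struct rb_node *node;
	unsigned int seq;

	lockdep_assert(srcu_read_lock_held(&uprobes_srcu));

	do {
		seq = read_seqcount_begin(&uprobes_seqcount);
		node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
		/* A hit is stable: uprobe freeing is RCU-delayed. */
		if (node)
			return __node_2_uprobe(node);
		/*
		 * A miss must be retried if a writer touched the tree
		 * meanwhile, since the walk may have raced with a rotation.
		 */
	} while (read_seqcount_retry(&uprobes_seqcount, seq));

	return NULL;
}
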
[PATCH 8/8] uprobes: switch to RCU Tasks Trace flavor for better performance

2024-07-31 Thread Andrii Nakryiko
This patch switches uprobes' SRCU usage to the RCU Tasks Trace flavor, which
is optimized for lightweight and quick readers (at the expense of
slower writers, which for uprobes is a fine tradeoff) and has better
performance and scalability with the number of CPUs.
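
For context, the read side that the asserts in the diff below pair with
needs only the very cheap RCU Tasks Trace primitives. A minimal,
illustrative sketch (the breakpoint-handler hunk is not shown in this
excerpt, and the function below is not from the patch):

#include <linux/rcupdate_trace.h>

static void handle_bp_sketch(struct inode *inode, loff_t offset)
{
	struct uprobe *uprobe;

	rcu_read_lock_trace();	/* no locks or atomics on the fast path */
	uprobe = find_uprobe_rcu(inode, offset);
	if (uprobe) {
		/* Safe to use: freeing goes through call_rcu_tasks_trace(). */
		/* ... invoke consumer handlers here ... */
	}
	rcu_read_unlock_trace();
}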

Similarly to baseline vs SRCU, we've benchmarked SRCU-based
implementation vs RCU Tasks Trace implementation.

SRCU

uprobe-nop  ( 1 cpus):3.276 ± 0.005M/s  (  3.276M/s/cpu)
uprobe-nop  ( 2 cpus):4.125 ± 0.002M/s  (  2.063M/s/cpu)
uprobe-nop  ( 4 cpus):7.713 ± 0.002M/s  (  1.928M/s/cpu)
uprobe-nop  ( 8 cpus):8.097 ± 0.006M/s  (  1.012M/s/cpu)
uprobe-nop  (16 cpus):6.501 ± 0.056M/s  (  0.406M/s/cpu)
uprobe-nop  (32 cpus):4.398 ± 0.084M/s  (  0.137M/s/cpu)
uprobe-nop  (64 cpus):6.452 ± 0.000M/s  (  0.101M/s/cpu)

uretprobe-nop   ( 1 cpus):2.055 ± 0.001M/s  (  2.055M/s/cpu)
uretprobe-nop   ( 2 cpus):2.677 ± 0.000M/s  (  1.339M/s/cpu)
uretprobe-nop   ( 4 cpus):4.561 ± 0.003M/s  (  1.140M/s/cpu)
uretprobe-nop   ( 8 cpus):5.291 ± 0.002M/s  (  0.661M/s/cpu)
uretprobe-nop   (16 cpus):5.065 ± 0.019M/s  (  0.317M/s/cpu)
uretprobe-nop   (32 cpus):3.622 ± 0.003M/s  (  0.113M/s/cpu)
uretprobe-nop   (64 cpus):3.723 ± 0.002M/s  (  0.058M/s/cpu)

RCU Tasks Trace
===
uprobe-nop  ( 1 cpus):3.396 ± 0.002M/s  (  3.396M/s/cpu)
uprobe-nop  ( 2 cpus):4.271 ± 0.006M/s  (  2.135M/s/cpu)
uprobe-nop  ( 4 cpus):8.499 ± 0.015M/s  (  2.125M/s/cpu)
uprobe-nop  ( 8 cpus):   10.355 ± 0.028M/s  (  1.294M/s/cpu)
uprobe-nop  (16 cpus):7.615 ± 0.099M/s  (  0.476M/s/cpu)
uprobe-nop  (32 cpus):4.430 ± 0.007M/s  (  0.138M/s/cpu)
uprobe-nop  (64 cpus):6.887 ± 0.020M/s  (  0.108M/s/cpu)

uretprobe-nop   ( 1 cpus):2.174 ± 0.001M/s  (  2.174M/s/cpu)
uretprobe-nop   ( 2 cpus):2.853 ± 0.001M/s  (  1.426M/s/cpu)
uretprobe-nop   ( 4 cpus):4.913 ± 0.002M/s  (  1.228M/s/cpu)
uretprobe-nop   ( 8 cpus):5.883 ± 0.002M/s  (  0.735M/s/cpu)
uretprobe-nop   (16 cpus):5.147 ± 0.001M/s  (  0.322M/s/cpu)
uretprobe-nop   (32 cpus):3.738 ± 0.008M/s  (  0.117M/s/cpu)
uretprobe-nop   (64 cpus):4.397 ± 0.002M/s  (  0.069M/s/cpu)

Peak throughput for uprobes increases from 8 mln/s to 10.3 mln/s
(+28%!), and for uretprobes from 5.3 mln/s to 5.8 mln/s (+11%), as there
is more work to do on the uretprobe side.

Even single-thread (no contention) performance is slightly better: 3.276
mln/s to 3.396 mln/s (+3.5%) for uprobes, and 2.055 mln/s to 2.174 mln/s
(+5.8%) for uretprobes.

Signed-off-by: Andrii Nakryiko 
---
 kernel/events/uprobes.c | 40 +---
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d03962cc96de..ef915f87d27f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -42,8 +42,6 @@ static struct rb_root uprobes_tree = RB_ROOT;
 static DEFINE_RWLOCK(uprobes_treelock);	/* serialize rbtree access */
 static seqcount_rwlock_t uprobes_seqcount = 
SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
 
-DEFINE_STATIC_SRCU(uprobes_srcu);
-
 #define UPROBES_HASH_SZ	13
 /* serialize uprobe->pending_list */
 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
@@ -647,7 +645,7 @@ static void put_uprobe(struct uprobe *uprobe)
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
 
-   call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
+   call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
 }
 
 static __always_inline
@@ -702,7 +700,7 @@ static struct uprobe *find_uprobe_rcu(struct inode *inode, 
loff_t offset)
struct rb_node *node;
unsigned int seq;
 
-   lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
+   lockdep_assert(rcu_read_lock_trace_held());
 
do {
seq = read_seqcount_begin(&uprobes_seqcount);
@@ -919,8 +917,7 @@ static bool filter_chain(struct uprobe *uprobe, struct 
mm_struct *mm)
bool ret = false;
 
down_read(&uprobe->consumer_rwsem);
-   list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
-srcu_read_lock_held(&uprobes_srcu)) {
+   list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, 
rcu_read_lock_trace_held()) {
ret = consumer_filter(uc, mm);
if (ret)
break;
@@ -1132,7 +1129,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
 
 void uprobe_unregister_sync(void)
 {
-   synchronize_srcu(&uprobes_srcu);
+   synchronize_rcu_tasks_trace();
 }
 EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
 
@@ -1216,19 +1213,18 @@ EXPORT_SYMBOL_GPL(uprobe_register);
 int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
 {
struct uprobe_consumer *con;
-   int ret = -ENOENT, srcu_idx;
+   int ret = -ENOENT;
 
down_write(&uprobe->register_r

Re: [PATCH v2 2/5] dt-bindings: soc: qcom: smd-rpm: add generic compatibles

2024-07-31 Thread Stephen Boyd
Quoting Dmitry Baryshkov (2024-07-29 12:52:15)
> Add two generic compatibles to all smd-rpm devices, they follow the same
> RPMSG protocol and are either accessed through the smd-edge or through
> the glink-edge.
> 
> Signed-off-by: Dmitry Baryshkov 
> ---

Acked-by: Stephen Boyd 



[PATCH AUTOSEL 6.10 057/121] remoteproc: mediatek: Zero out only remaining bytes of IPI buffer

2024-07-31 Thread Sasha Levin
From: AngeloGioacchino Del Regno 

[ Upstream commit 9dbd9962cfe56d210be5232349851420b5f9c8f6 ]

In scp_ipi_handler(), instead of zeroing out the entire shared
buffer, which may be as large as 600 bytes, overwrite it with the
received data, then zero out only the remaining bytes.
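
For illustration, the resulting copy-then-zero-tail pattern as a
standalone kernel-context sketch (function and parameter names are made
up; the caller is expected to have validated len against the buffer size,
as the real handler does):

static void scp_fill_share_buf(u8 *dst, size_t dst_size,
			       const void __iomem *src, size_t len)
{
	/* Copy the payload first ... */
	memcpy_fromio(dst, src, len);
	/* ... then clear only the bytes the payload did not cover. */
	memset(dst + len, 0, dst_size - len);
}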

Signed-off-by: AngeloGioacchino Del Regno 

Link: 
https://lore.kernel.org/r/20240520112724.139945-1-angelogioacchino.delre...@collabora.com
Signed-off-by: Mathieu Poirier 
Signed-off-by: Sasha Levin 
---
 drivers/remoteproc/mtk_scp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c
index b8498772dba17..b885a9a041e48 100644
--- a/drivers/remoteproc/mtk_scp.c
+++ b/drivers/remoteproc/mtk_scp.c
@@ -117,8 +117,8 @@ static void scp_ipi_handler(struct mtk_scp *scp)
return;
}
 
-   memset(scp->share_buf, 0, scp_sizes->ipi_share_buffer_size);
memcpy_fromio(scp->share_buf, &rcv_obj->share_buf, len);
+   memset(&scp->share_buf[len], 0, scp_sizes->ipi_share_buffer_size - len);
handler(scp->share_buf, len, ipi_desc[id].priv);
scp_ipi_unlock(scp, id);
 
-- 
2.43.0




[PATCH AUTOSEL 6.10 061/121] remoteproc: qcom_q6v5_pas: Add hwspinlock bust on stop

2024-07-31 Thread Sasha Levin
From: Richard Maina 

[ Upstream commit 568b13b65078e2b557ccf47674a354cecd1db641 ]

When remoteproc goes down unexpectedly, this results in a state where any
acquired hwspinlocks will remain locked, possibly resulting in deadlock.
In order to ensure all locks are freed, we include a call to
qcom_smem_bust_hwspin_lock_by_host() during remoteproc shutdown.

For qcom_q6v5_pas remoteprocs, each remoteproc has an assigned smem
host_id. Remoteproc can pass this id to smem to try and bust the lock on
remoteproc stop.

This edge case only occurs with q6v5_pas watchdog crashes. The error
fatal case has handling to clear the hwspinlock before the error fatal
interrupt is triggered.

Signed-off-by: Richard Maina 
Reviewed-by: Bjorn Andersson 
Signed-off-by: Chris Lew 
Link: 
https://lore.kernel.org/r/20240529-hwspinlock-bust-v3-4-c8b924ffa...@quicinc.com
Signed-off-by: Bjorn Andersson 
Signed-off-by: Sasha Levin 
---
 drivers/remoteproc/qcom_q6v5_pas.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/remoteproc/qcom_q6v5_pas.c 
b/drivers/remoteproc/qcom_q6v5_pas.c
index 54d8005d40a34..8458bcfe9e19e 100644
--- a/drivers/remoteproc/qcom_q6v5_pas.c
+++ b/drivers/remoteproc/qcom_q6v5_pas.c
@@ -52,6 +52,7 @@ struct adsp_data {
const char *ssr_name;
const char *sysmon_name;
int ssctl_id;
+   unsigned int smem_host_id;
 
int region_assign_idx;
int region_assign_count;
@@ -81,6 +82,7 @@ struct qcom_adsp {
int lite_pas_id;
unsigned int minidump_id;
int crash_reason_smem;
+   unsigned int smem_host_id;
bool decrypt_shutdown;
const char *info_name;
 
@@ -399,6 +401,9 @@ static int adsp_stop(struct rproc *rproc)
if (handover)
qcom_pas_handover(&adsp->q6v5);
 
+   if (adsp->smem_host_id)
+   ret = qcom_smem_bust_hwspin_lock_by_host(adsp->smem_host_id);
+
return ret;
 }
 
@@ -727,6 +732,7 @@ static int adsp_probe(struct platform_device *pdev)
adsp->pas_id = desc->pas_id;
adsp->lite_pas_id = desc->lite_pas_id;
adsp->info_name = desc->sysmon_name;
+   adsp->smem_host_id = desc->smem_host_id;
adsp->decrypt_shutdown = desc->decrypt_shutdown;
adsp->region_assign_idx = desc->region_assign_idx;
adsp->region_assign_count = min_t(int, MAX_ASSIGN_COUNT, 
desc->region_assign_count);
@@ -1196,6 +1202,7 @@ static const struct adsp_data sm8550_adsp_resource = {
.ssr_name = "lpass",
.sysmon_name = "adsp",
.ssctl_id = 0x14,
+   .smem_host_id = 2,
 };
 
 static const struct adsp_data sm8550_cdsp_resource = {
@@ -1216,6 +1223,7 @@ static const struct adsp_data sm8550_cdsp_resource = {
.ssr_name = "cdsp",
.sysmon_name = "cdsp",
.ssctl_id = 0x17,
+   .smem_host_id = 5,
 };
 
 static const struct adsp_data sm8550_mpss_resource = {
@@ -1236,6 +1244,7 @@ static const struct adsp_data sm8550_mpss_resource = {
.ssr_name = "mpss",
.sysmon_name = "modem",
.ssctl_id = 0x12,
+   .smem_host_id = 1,
.region_assign_idx = 2,
.region_assign_count = 1,
.region_assign_vmid = QCOM_SCM_VMID_MSS_MSA,
@@ -1275,6 +1284,7 @@ static const struct adsp_data sm8650_cdsp_resource = {
.ssr_name = "cdsp",
.sysmon_name = "cdsp",
.ssctl_id = 0x17,
+   .smem_host_id = 5,
.region_assign_idx = 2,
.region_assign_count = 1,
.region_assign_shared = true,
@@ -1299,6 +1309,7 @@ static const struct adsp_data sm8650_mpss_resource = {
.ssr_name = "mpss",
.sysmon_name = "modem",
.ssctl_id = 0x12,
+   .smem_host_id = 1,
.region_assign_idx = 2,
.region_assign_count = 3,
.region_assign_vmid = QCOM_SCM_VMID_MSS_MSA,
-- 
2.43.0




[PATCH AUTOSEL 6.10 111/121] virtio_ring: fix KMSAN error for premapped mode

2024-07-31 Thread Sasha Levin
From: Xuan Zhuo 

[ Upstream commit 840b2d39a2dc1b96deb3f5c7fef76c9b24f08f51 ]

Add kmsan for virtqueue_dma_map_single_attrs to fix:

BUG: KMSAN: uninit-value in receive_buf+0x45ca/0x6990
 receive_buf+0x45ca/0x6990
 virtnet_poll+0x17e0/0x3130
 net_rx_action+0x832/0x26e0
 handle_softirqs+0x330/0x10f0
 [...]

Uninit was created at:
 __alloc_pages_noprof+0x62a/0xe60
 alloc_pages_noprof+0x392/0x830
 skb_page_frag_refill+0x21a/0x5c0
 virtnet_rq_alloc+0x50/0x1500
 try_fill_recv+0x372/0x54c0
 virtnet_open+0x210/0xbe0
 __dev_open+0x56e/0x920
 __dev_change_flags+0x39c/0x2000
 dev_change_flags+0xaa/0x200
 do_setlink+0x197a/0x7420
 rtnl_setlink+0x77c/0x860
 [...]

Signed-off-by: Xuan Zhuo 
Tested-by: Alexander Potapenko 
Message-Id: <20240606111345.93600-1-xuanz...@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin 
Tested-by: Ilya Leoshkevich   # s390x
Acked-by: Jason Wang 
Signed-off-by: Sasha Levin 
---
 drivers/virtio/virtio_ring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 2a972752ff1bc..9d3a9942c8c82 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -3121,8 +3121,10 @@ dma_addr_t virtqueue_dma_map_single_attrs(struct 
virtqueue *_vq, void *ptr,
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
-   if (!vq->use_dma_api)
+   if (!vq->use_dma_api) {
+   kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, 
dir);
return (dma_addr_t)virt_to_phys(ptr);
+   }
 
return dma_map_single_attrs(vring_dma_dev(vq), ptr, size, dir, attrs);
 }
-- 
2.43.0




[PATCH AUTOSEL 6.6 75/83] virtio_ring: fix KMSAN error for premapped mode

2024-07-31 Thread Sasha Levin
From: Xuan Zhuo 

[ Upstream commit 840b2d39a2dc1b96deb3f5c7fef76c9b24f08f51 ]

Add kmsan for virtqueue_dma_map_single_attrs to fix:

BUG: KMSAN: uninit-value in receive_buf+0x45ca/0x6990
 receive_buf+0x45ca/0x6990
 virtnet_poll+0x17e0/0x3130
 net_rx_action+0x832/0x26e0
 handle_softirqs+0x330/0x10f0
 [...]

Uninit was created at:
 __alloc_pages_noprof+0x62a/0xe60
 alloc_pages_noprof+0x392/0x830
 skb_page_frag_refill+0x21a/0x5c0
 virtnet_rq_alloc+0x50/0x1500
 try_fill_recv+0x372/0x54c0
 virtnet_open+0x210/0xbe0
 __dev_open+0x56e/0x920
 __dev_change_flags+0x39c/0x2000
 dev_change_flags+0xaa/0x200
 do_setlink+0x197a/0x7420
 rtnl_setlink+0x77c/0x860
 [...]

Signed-off-by: Xuan Zhuo 
Tested-by: Alexander Potapenko 
Message-Id: <20240606111345.93600-1-xuanz...@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin 
Tested-by: Ilya Leoshkevich   # s390x
Acked-by: Jason Wang 
Signed-off-by: Sasha Levin 
---
 drivers/virtio/virtio_ring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 6f7e5010a6735..80669e05bf0ee 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -3126,8 +3126,10 @@ dma_addr_t virtqueue_dma_map_single_attrs(struct 
virtqueue *_vq, void *ptr,
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
-   if (!vq->use_dma_api)
+   if (!vq->use_dma_api) {
+   kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, 
dir);
return (dma_addr_t)virt_to_phys(ptr);
+   }
 
return dma_map_single_attrs(vring_dma_dev(vq), ptr, size, dir, attrs);
 }
-- 
2.43.0




Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Jason Wang
On Thu, Aug 1, 2024 at 5:26 AM Michael S. Tsirkin  wrote:
>
> On Wed, Jul 31, 2024 at 10:59:47AM +0800, Jason Wang wrote:
> > This patch synchronize operstate with admin state per RFC2863.
> >
> > This is done by trying to toggle the carrier upon open/close and
> > synchronize with the config change work. This allows propagate status
> > correctly to stacked devices like:
> >
> > ip link add link enp0s3 macvlan0 type macvlan
> > ip link set link enp0s3 down
> > ip link show
> >
> > Before this patch:
> >
> > 3: enp0s3:  mtu 1500 qdisc pfifo_fast state DOWN mode 
> > DEFAULT group default qlen 1000
> > link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> > ..
> > 5: macvlan0@enp0s3:  mtu 1500 qdisc 
> > noqueue state UP mode DEFAULT group default qlen 1000
> > link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> >
> > After this patch:
> >
> > 3: enp0s3:  mtu 1500 qdisc pfifo_fast state DOWN mode 
> > DEFAULT group default qlen 1000
> > link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> > ...
> > 5: macvlan0@enp0s3:  mtu 1500 
> > qdisc noqueue state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
> > link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> >
> > Cc: Venkat Venkatsubra 
> > Cc: Gia-Khanh Nguyen 
> > Signed-off-by: Jason Wang 
>
> Changelog?

In the cover letter actually.

>
> > ---
> >  drivers/net/virtio_net.c | 84 ++--
> >  1 file changed, 54 insertions(+), 30 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 0383a3e136d6..0cb93261eba1 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -2878,6 +2878,7 @@ static int virtnet_enable_queue_pair(struct 
> > virtnet_info *vi, int qp_index)
> >   return err;
> >  }
> >
> > +
> >  static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim)
> >  {
> >   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
>
> hmm
>
> > @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct virtnet_info 
> > *vi, struct dim *dim)
> >   net_dim_work_cancel(dim);
> >  }
> >
> > +static void virtnet_update_settings(struct virtnet_info *vi)
> > +{
> > + u32 speed;
> > + u8 duplex;
> > +
> > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > + return;
> > +
> > + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> > +
> > + if (ethtool_validate_speed(speed))
> > + vi->speed = speed;
> > +
> > + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
> > +
> > + if (ethtool_validate_duplex(duplex))
> > + vi->duplex = duplex;
> > +}
> > +
>
> I already commented on this approach.  This is now invoked on each open,
> lots of extra VM exits. No bueno, people are working hard to keep setup
> overhead under control. Handle this in the config change interrupt -
> your new infrastructure is perfect for this.

No, in this version it doesn't. Config space read only happens if
there's a pending config interrupt during ndo_open:

+   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
+   if (vi->status & VIRTIO_NET_S_LINK_UP)
+   netif_carrier_on(vi->dev);
+   virtio_config_driver_enable(vi->vdev);
+   } else {
+   vi->status = VIRTIO_NET_S_LINK_UP;
+   netif_carrier_on(dev);
+   virtnet_update_settings(vi);
+   }

>
>
> >  static int virtnet_open(struct net_device *dev)
> >  {
> >   struct virtnet_info *vi = netdev_priv(dev);
> > @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
> >   goto err_enable_qp;
> >   }
> >
> > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > + if (vi->status & VIRTIO_NET_S_LINK_UP)
> > + netif_carrier_on(vi->dev);
> > + virtio_config_driver_enable(vi->vdev);
> > + } else {
> > + vi->status = VIRTIO_NET_S_LINK_UP;
> > + netif_carrier_on(dev);
> > + virtnet_update_settings(vi);
> > + }
> > +
> >   return 0;
> >
> >  err_enable_qp:
> > @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device *dev)
> >   disable_delayed_refill(vi);
> >   /* Make sure refill_work doesn't re-enable napi! */
> >   cancel_delayed_work_sync(&vi->refill);
> > + /* Make sure config notification doesn't schedule config work */
>
> it's clear what this does even without a comment.
> what you should comment on, and do not, is *why*.

Well, it just follows the existing style, for example the above said

"/* Make sure refill_work doesn't re-enable napi! */"

>
> > + virtio_config_driver_disable(vi->vdev);
> > + /* Make sure status updating is cancelled */
>
> same
>
> also what "status updating"? confuses more than this clarifies.

Does "Make sure the config changed work is cancelled" sounds better?

>
> > + canc

RE: [PATCH] remoteproc: Use of_property_present()

2024-07-31 Thread Peng Fan
> Subject: [PATCH] remoteproc: Use of_property_present()
> 
> Use of_property_present() to test for property presence rather than
> of_(find|get)_property(). This is part of a larger effort to remove callers
> of of_find_property() and similar functions. of_find_property() leaks the
> DT struct property and data pointers which is a problem for
> dynamically allocated nodes which may be freed.
> 
> Signed-off-by: Rob Herring (Arm) 
> ---
>  drivers/remoteproc/imx_dsp_rproc.c  | 2 +-
>  drivers/remoteproc/imx_rproc.c  | 2 +-
>  drivers/remoteproc/xlnx_r5_remoteproc.c | 6 +++---
>  3 files changed, 5 insertions(+), 5 deletions(-)

For i.MX:

Acked-by: Peng Fan 



Re: [PATCH v3 00/11] F(x)tec Pro1X feature expansion

2024-07-31 Thread Bjorn Andersson


On Wed, 31 Jul 2024 13:18:41 +0700, Dang Huynh wrote:
> This patch series expand F(x)tec Pro1X (QX1050) device tree to support
> various components of the device.
> 
> Most notably:
> + SD Card slot
> + Touchscreen
> + MDSS, DRM display panel
> + WLAN (ATH10K)
> + Hall sensor and camera button
> 
> [...]

Applied, thanks!

[01/11] arm64: dts: qcom: sm6115-pro1x: Add Hall Switch and Camera Button
commit: ff5affd17bde4ea78d153122a601d69b3f302326
[02/11] arm64: dts: qcom: sm6115-pro1x: Add PCA9534 IO Expander
commit: 4686161eb87168ec746eb54d7b84c5d022073a33
[03/11] arm64: dts: qcom: sm6115-pro1x: Add Goodix Touchscreen
commit: e46b455e67f836361a94512ca187442a8b699f25
[04/11] arm64: dts: qcom: sm6115-pro1x: Add Caps Lock LED
commit: 17c98581155e88d3f118cd879ba263e952b83946
[05/11] arm64: dts: qcom: sm6115-pro1x: Enable SD card slot
commit: 95b19afd734d0a278088456b052a2fb94c4ade55
[06/11] arm64: dts: qcom: sm6115-pro1x: Enable MDSS and GPU
commit: 8b9f76a6f8fbc81fdc44b5c4b134d20095c38a6a
[07/11] arm64: dts: qcom: sm6115-pro1x: Hook up USB3 SS
commit: e0674d85c80456782fdc44c36c4884fa64bb3a58
[08/11] arm64: dts: qcom: sm6115-pro1x: Add PMI632 Type-C property
commit: 79f8d127c46a1311de49db7c175fee84ce827d3a
[09/11] arm64: dts: qcom: sm6115-pro1x: Enable RGB LED
commit: b5c63330a7ef026c21da5eed4669a790b22ea642
[10/11] arm64: dts: qcom: sm6115-pro1x: Enable remoteprocs
commit: e055924159df6ec2ffa0f221aa84c49429cfe6db
[11/11] arm64: dts: qcom: sm6115-pro1x: Enable ATH10K WLAN
commit: 84c1711f27509a6a5841b13ac08fc58b1d091ae8

Best regards,
-- 
Bjorn Andersson 



Re: [PATCH] arm64: dts: qcom: msm8916-samsung-fortuna: Add touch keys

2024-07-31 Thread Bjorn Andersson


On Wed, 24 Jul 2024 14:32:51 +, Raymond Hackley wrote:
> Touch keys feature on fortuna phones are provided by Zinitix touchscreen.
> Add property linux,keycodes to enable touch keys.
> 
> 

Applied, thanks!

[1/1] arm64: dts: qcom: msm8916-samsung-fortuna: Add touch keys
  commit: ccf683fa0c9b5c53534030ddc9dd8a8603f715a0

Best regards,
-- 
Bjorn Andersson 



[PATCH v4 0/4] Add SBAF test to IFS

2024-07-31 Thread Kuppuswamy Sathyanarayanan
This patch series adds support for Structural Based Functional Test at
Field (SBAF) in the IFS driver. SBAF is a new type of testing that
provides comprehensive core test coverage, complementing existing IFS
tests like Scan at Field (SAF) and ArrayBist. Granite Rapids (GNR) is
the first platform that supports SBAF.

SBAF mimics the manufacturing screening environment and leverages the
same test suite. It makes use of Design For Test (DFT) observation
sites and features to maximize coverage in minimum time.

Similar to the SAF test, SBAF isolates the core under test from the
rest of the system during execution. Upon completion, the core
seamlessly resets to its pre-test state and resumes normal operation.
Any machine checks or hangs encountered during the test are confined to
the isolated core, preventing disruption to the overall system. Like
SAF test, the SBAF test is also divided into multiple batches, and each
batch test can take hundreds of milliseconds (100-200 ms) to complete.
If such a lengthy interruption is undesirable, it is recommended to
relocate the time-sensitive applications to other cores for the
duration of the test.

Patch Details:

Patch 1/4: Refactors MSR usage in IFS image loading code to share the
   code between SBAF and SAF tests.
Patch 2/4: Leverages SAF image loading logic and adds SBAF image loading 
support.
Patch 3/4: Adds support for user to trigger SBAF test.
Patch 4/4: Adds trace support for SBAF tests.

This series was originally authored by Jithu Joseph. I have made cleanups
related to code reuse between the SBAF and SAF tests and am resubmitting it
for review.

Changes since v3:
 * Rebased on top of v6.11-rc1
 * Added missing error return value in validate_ifs_metadata().

Changes since v2:
 * Added Reviewed-by tags from Ilpo and Steven.
 * Fixed minor issues raised by Ilpo.

Changes since v1:
 * Addressed trace struct hole issue (Steven)
 * Fixed initialization issue in ifs_sbaf_test_core() (Ilpo)

Jithu Joseph (3):
  platform/x86/intel/ifs: Add SBAF test image loading support
  platform/x86/intel/ifs: Add SBAF test support
  trace: platform/x86/intel/ifs: Add SBAF trace support

Kuppuswamy Sathyanarayanan (1):
  platform/x86/intel/ifs: Refactor MSR usage in IFS test code

 arch/x86/include/asm/msr-index.h |   2 +
 drivers/platform/x86/intel/ifs/ifs.h |  92 -
 include/trace/events/intel_ifs.h |  27 +++
 drivers/platform/x86/intel/ifs/core.c|  33 
 drivers/platform/x86/intel/ifs/load.c|  40 ++--
 drivers/platform/x86/intel/ifs/runtest.c | 233 +++
 6 files changed, 412 insertions(+), 15 deletions(-)

-- 
2.25.1




[PATCH v4 1/4] platform/x86/intel/ifs: Refactor MSR usage in IFS test code

2024-07-31 Thread Kuppuswamy Sathyanarayanan
IFS tests such as Scan at Field (SAF) or Structural Based Functional
Test at Field (SBAF) require the user to load a test image. The image
loading process is similar across these tests, with the only difference
being MSR addresses used. To reuse the code between these tests, remove
the hard coding of MSR addresses and allow the driver to pass the MSR
addresses per IFS test (via driver device data).

Add a new structure named "struct ifs_test_msrs" to specify the
test-specific MSR addresses. Each IFS test will provide this structure,
enabling them to reuse the common code.

This is a preliminary patch in preparation for the addition of SBAF
support.
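
For illustration, with the addresses factored out, a second test type
only needs to pair its capabilities with its own MSR set. A sketch using
the SBAF MSR names that patch 2/4 of this series introduces (not part of
this patch):

static const struct ifs_test_msrs sbaf_msrs = {
	.copy_hashes = MSR_COPY_SBAF_HASHES,
	.copy_hashes_status = MSR_SBAF_HASHES_STATUS,
	.copy_chunks = MSR_AUTHENTICATE_AND_COPY_SBAF_CHUNK,
	.copy_chunks_status = MSR_SBAF_CHUNKS_AUTHENTICATION_STATUS,
	.test_ctrl = MSR_SBAF_CTRL,
};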

Reviewed-by: Ashok Raj 
Reviewed-by: Tony Luck 
Reviewed-by: Ilpo Järvinen 
Signed-off-by: Kuppuswamy Sathyanarayanan 

---
 drivers/platform/x86/intel/ifs/ifs.h  | 25 +
 drivers/platform/x86/intel/ifs/core.c |  9 +
 drivers/platform/x86/intel/ifs/load.c | 24 ++--
 3 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/drivers/platform/x86/intel/ifs/ifs.h 
b/drivers/platform/x86/intel/ifs/ifs.h
index 56b9f3e3cf76..738cbc7a5d00 100644
--- a/drivers/platform/x86/intel/ifs/ifs.h
+++ b/drivers/platform/x86/intel/ifs/ifs.h
@@ -266,6 +266,22 @@ struct ifs_test_caps {
int test_num;
 };
 
+/**
+ * struct ifs_test_msrs - MSRs used in IFS tests
+ * @copy_hashes: Copy test hash data
+ * @copy_hashes_status: Status of copied test hash data
+ * @copy_chunks: Copy chunks of the test data
+ * @copy_chunks_status: Status of the copied test data chunks
+ * @test_ctrl: Control the test attributes
+ */
+struct ifs_test_msrs {
+   u32 copy_hashes;
+   u32 copy_hashes_status;
+   u32 copy_chunks;
+   u32 copy_chunks_status;
+   u32 test_ctrl;
+};
+
 /**
  * struct ifs_data - attributes related to intel IFS driver
  * @loaded_version: stores the currently loaded ifs image version.
@@ -299,6 +315,7 @@ struct ifs_work {
 
 struct ifs_device {
const struct ifs_test_caps *test_caps;
+   const struct ifs_test_msrs *test_msrs;
struct ifs_data rw_data;
struct miscdevice misc;
 };
@@ -319,6 +336,14 @@ static inline const struct ifs_test_caps 
*ifs_get_test_caps(struct device *dev)
return d->test_caps;
 }
 
+static inline const struct ifs_test_msrs *ifs_get_test_msrs(struct device *dev)
+{
+   struct miscdevice *m = dev_get_drvdata(dev);
+   struct ifs_device *d = container_of(m, struct ifs_device, misc);
+
+   return d->test_msrs;
+}
+
 extern bool *ifs_pkg_auth;
 int ifs_load_firmware(struct device *dev);
 int do_core_test(int cpu, struct device *dev);
diff --git a/drivers/platform/x86/intel/ifs/core.c 
b/drivers/platform/x86/intel/ifs/core.c
index 33412a584836..f204ebbbf769 100644
--- a/drivers/platform/x86/intel/ifs/core.c
+++ b/drivers/platform/x86/intel/ifs/core.c
@@ -39,9 +39,18 @@ static const struct ifs_test_caps array_test = {
.test_num = IFS_TYPE_ARRAY_BIST,
 };
 
+static const struct ifs_test_msrs scan_msrs = {
+   .copy_hashes = MSR_COPY_SCAN_HASHES,
+   .copy_hashes_status = MSR_SCAN_HASHES_STATUS,
+   .copy_chunks = MSR_AUTHENTICATE_AND_COPY_CHUNK,
+   .copy_chunks_status = MSR_CHUNKS_AUTHENTICATION_STATUS,
+   .test_ctrl = MSR_SAF_CTRL,
+};
+
 static struct ifs_device ifs_devices[] = {
[IFS_TYPE_SAF] = {
.test_caps = &scan_test,
+   .test_msrs = &scan_msrs,
.misc = {
.name = "intel_ifs_0",
.minor = MISC_DYNAMIC_MINOR,
diff --git a/drivers/platform/x86/intel/ifs/load.c 
b/drivers/platform/x86/intel/ifs/load.c
index 39f19cb51749..ad0c107f0922 100644
--- a/drivers/platform/x86/intel/ifs/load.c
+++ b/drivers/platform/x86/intel/ifs/load.c
@@ -118,15 +118,17 @@ static void copy_hashes_authenticate_chunks(struct 
work_struct *work)
union ifs_scan_hashes_status hashes_status;
union ifs_chunks_auth_status chunk_status;
struct device *dev = local_work->dev;
+   const struct ifs_test_msrs *msrs;
int i, num_chunks, chunk_size;
struct ifs_data *ifsd;
u64 linear_addr, base;
u32 err_code;
 
ifsd = ifs_get_data(dev);
+   msrs = ifs_get_test_msrs(dev);
/* run scan hash copy */
-   wrmsrl(MSR_COPY_SCAN_HASHES, ifs_hash_ptr);
-   rdmsrl(MSR_SCAN_HASHES_STATUS, hashes_status.data);
+   wrmsrl(msrs->copy_hashes, ifs_hash_ptr);
+   rdmsrl(msrs->copy_hashes_status, hashes_status.data);
 
/* enumerate the scan image information */
num_chunks = hashes_status.num_chunks;
@@ -147,8 +149,8 @@ static void copy_hashes_authenticate_chunks(struct 
work_struct *work)
linear_addr = base + i * chunk_size;
linear_addr |= i;
 
-   wrmsrl(MSR_AUTHENTICATE_AND_COPY_CHUNK, linear_addr);
-   rdmsrl(MSR_CHUNKS_AUTHENTICATION_STATUS, chunk_status.data);
+ 

[PATCH v4 2/4] platform/x86/intel/ifs: Add SBAF test image loading support

2024-07-31 Thread Kuppuswamy Sathyanarayanan
From: Jithu Joseph 

Structural Based Functional Test at Field (SBAF) is a new type of
testing that provides comprehensive core test coverage complementing
existing IFS tests like Scan at Field (SAF) or ArrayBist.

The SBAF device will appear as a new device instance (intel_ifs_2) under
/sys/devices/virtual/misc. The user interaction necessary to load the
test image and test a particular core is the same as for the existing scan
test (intel_ifs_0).

During the loading stage, the driver will look for a file named
ff-mm-ss-.sbft in the /lib/firmware/intel/ifs_2 directory.
The hardware interaction needed for loading the image is similar to
SAF, with the only difference being the MSR addresses used. Reuse the
SAF image loading code, passing the SBAF-specific MSR addresses via
struct ifs_test_msrs in the driver device data.

Unlike SAF, the SBAF test image chunks are further divided into smaller
logical entities called bundles. Since the SBAF test is initiated per
bundle, cache the maximum number of bundles in the current image, which
is used for iterating through bundles during SBAF test execution.
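
For illustration, the new instance is expected to be wired into
ifs_devices[] the same way as the IFS_TYPE_SAF entry visible in patch
1/4; the sbaf_test caps structure is assumed here, and only the wiring
pattern is shown:

	[IFS_TYPE_SBAF] = {
		.test_caps = &sbaf_test,
		.test_msrs = &sbaf_msrs,
		.misc = {
			.name = "intel_ifs_2",
			.minor = MISC_DYNAMIC_MINOR,
		},
	},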

Reviewed-by: Ashok Raj 
Reviewed-by: Tony Luck 
Reviewed-by: Ilpo Järvinen 
Signed-off-by: Jithu Joseph 
Co-developed-by: Kuppuswamy Sathyanarayanan 

Signed-off-by: Kuppuswamy Sathyanarayanan 

---

Changes since v3:
 * Added missing error return in validate_ifs_metadata().

 arch/x86/include/asm/msr-index.h  |  2 ++
 drivers/platform/x86/intel/ifs/ifs.h  | 37 ++-
 drivers/platform/x86/intel/ifs/core.c | 24 +
 drivers/platform/x86/intel/ifs/load.c | 16 +---
 4 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 82c6a4d350e0..a7c06a46fb76 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -247,6 +247,8 @@
 #define MSR_INTEGRITY_CAPS_ARRAY_BIST		BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT	4
 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST	BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
+#define MSR_INTEGRITY_CAPS_SBAF_BIT		8
+#define MSR_INTEGRITY_CAPS_SBAF			BIT(MSR_INTEGRITY_CAPS_SBAF_BIT)
 #define MSR_INTEGRITY_CAPS_SAF_GEN_MASK	GENMASK_ULL(10, 9)
 
 #define MSR_LBR_NHM_FROM   0x0680
diff --git a/drivers/platform/x86/intel/ifs/ifs.h 
b/drivers/platform/x86/intel/ifs/ifs.h
index 738cbc7a5d00..600bb8a1b285 100644
--- a/drivers/platform/x86/intel/ifs/ifs.h
+++ b/drivers/platform/x86/intel/ifs/ifs.h
@@ -126,11 +126,38 @@
  * The driver does not make use of this, it only tests one core at a time.
  *
  * .. [#f1] https://github.com/intel/TBD
+ *
+ *
+ * Structural Based Functional Test at Field (SBAF):
+ * 
+ *
+ * SBAF is a new type of testing that provides comprehensive core test
+ * coverage complementing Scan at Field (SAF) testing. SBAF mimics the
+ * manufacturing screening environment and leverages the same test suite.
+ * It makes use of Design For Test (DFT) observation sites and features
+ * to maximize coverage in minimum time.
+ *
+ * Similar to the SAF test, SBAF isolates the core under test from the
+ * rest of the system during execution. Upon completion, the core
+ * seamlessly resets to its pre-test state and resumes normal operation.
+ * Any machine checks or hangs encountered during the test are confined to
+ * the isolated core, preventing disruption to the overall system.
+ *
+ * Like the SAF test, the SBAF test is also divided into multiple batches,
+ * and each batch test can take hundreds of milliseconds (100-200 ms) to
+ * complete. If such a lengthy interruption is undesirable, it is
+ * recommended to relocate the time-sensitive applications to other cores.
  */
 #include 
 #include 
 
 #define MSR_ARRAY_BIST 0x0105
+
+#define MSR_COPY_SBAF_HASHES   0x02b8
+#define MSR_SBAF_HASHES_STATUS 0x02b9
+#define MSR_AUTHENTICATE_AND_COPY_SBAF_CHUNK   0x02ba
+#define MSR_SBAF_CHUNKS_AUTHENTICATION_STATUS  0x02bb
+
 #define MSR_COPY_SCAN_HASHES   0x02c2
 #define MSR_SCAN_HASHES_STATUS 0x02c3
 #define MSR_AUTHENTICATE_AND_COPY_CHUNK0x02c4
@@ -140,6 +167,7 @@
 #define MSR_ARRAY_TRIGGER  0x02d6
 #define MSR_ARRAY_STATUS   0x02d7
 #define MSR_SAF_CTRL   0x04f0
+#define MSR_SBAF_CTRL  0x04f8
 
 #define SCAN_NOT_TESTED		0
 #define SCAN_TEST_PASS			1
@@ -147,6 +175,7 @@
 
 #define IFS_TYPE_SAF   0
 #define IFS_TYPE_ARRAY_BIST	1
+#define IFS_TYPE_SBAF  2
 
 #define ARRAY_GEN0 0
 #define ARRAY_GEN1 1
@@ -196,7 +225,8 @@ union ifs_chunk

[PATCH v4 3/4] platform/x86/intel/ifs: Add SBAF test support

2024-07-31 Thread Kuppuswamy Sathyanarayanan
From: Jithu Joseph 

In a core, the SBAF test engine is shared between sibling CPUs.

An SBAF test image contains multiple bundles. Each bundle is further
composed of subunits called programs. When an SBAF test (for a particular
core) is triggered by the user, each SBAF bundle from the loaded test
image is executed sequentially on all the threads on the core using
the stop_core_cpuslocked mechanism. Each bundle execution is initiated by
writing to MSR_ACTIVATE_SBAF.

SBAF test bundle execution may be aborted when an interrupt occurs or
if the CPU does not have enough power budget for the test. In these
cases the kernel restarts the test from the aborted bundle. SBAF
execution is not retried if the test fails or if the test makes no
forward progress after 5 retries.
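
A condensed sketch of that retry policy (illustrative only: the real
driver runs this under stop_core_cpuslocked() on all siblings, tracks
forward progress via the bundle/program index reported in the status
MSR, and applies timeouts, all of which is omitted here):

static int sbaf_run_bundle_sketch(u16 bundle)
{
	union ifs_sbaf activate = { .bundle_idx = bundle };
	union ifs_sbaf_status status;
	int retries = 0;

	do {
		wrmsrl(MSR_ACTIVATE_SBAF, activate.data);
		rdmsrl(MSR_SBAF_STATUS, status.data);

		if (status.test_fail)
			return -EIO;	/* failures are never retried */
		if (status.error_code == IFS_SBAF_NO_ERROR)
			return 0;	/* bundle completed */
		/* Interrupt/power aborts: retry from the aborted bundle. */
	} while (++retries < 5);

	return -EAGAIN;	/* no forward progress after 5 retries */
}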

Reviewed-by: Ashok Raj 
Reviewed-by: Tony Luck 
Reviewed-by: Ilpo Järvinen 
Signed-off-by: Jithu Joseph 
Signed-off-by: Kuppuswamy Sathyanarayanan 

---

Changes since v3: 
 * None

Changes since v2:
 * Removed unnecessary parenthesis for 2 * HZ.
 * Changed "Non valid" to Non-valid.
 * Use simplified confition for sbaf_bundle_completed().

 drivers/platform/x86/intel/ifs/ifs.h |  30 +++
 drivers/platform/x86/intel/ifs/runtest.c | 232 +++
 2 files changed, 262 insertions(+)

diff --git a/drivers/platform/x86/intel/ifs/ifs.h 
b/drivers/platform/x86/intel/ifs/ifs.h
index 600bb8a1b285..b261be46bce8 100644
--- a/drivers/platform/x86/intel/ifs/ifs.h
+++ b/drivers/platform/x86/intel/ifs/ifs.h
@@ -157,6 +157,8 @@
 #define MSR_SBAF_HASHES_STATUS 0x02b9
 #define MSR_AUTHENTICATE_AND_COPY_SBAF_CHUNK   0x02ba
 #define MSR_SBAF_CHUNKS_AUTHENTICATION_STATUS  0x02bb
+#define MSR_ACTIVATE_SBAF  0x02bc
+#define MSR_SBAF_STATUS			0x02bd
 
 #define MSR_COPY_SCAN_HASHES   0x02c2
 #define MSR_SCAN_HASHES_STATUS 0x02c3
@@ -283,6 +285,34 @@ union ifs_array {
};
 };
 
+/* MSR_ACTIVATE_SBAF bit fields */
+union ifs_sbaf {
+   u64 data;
+   struct {
+   u32 bundle_idx  :9;
+   u32 rsvd1   :5;
+   u32 pgm_idx :2;
+   u32 rsvd2   :16;
+   u32 delay   :31;
+   u32 sigmce  :1;
+   };
+};
+
+/* MSR_SBAF_STATUS bit fields */
+union ifs_sbaf_status {
+   u64 data;
+   struct {
+   u32 bundle_idx  :9;
+   u32 rsvd1   :5;
+   u32 pgm_idx :2;
+   u32 rsvd2   :16;
+   u32 error_code  :8;
+   u32 rsvd3   :21;
+   u32 test_fail   :1;
+   u32 sbaf_status :2;
+   };
+};
+
 /*
  * Driver populated error-codes
  * 0xFD: Test timed out before completing all the chunks.
diff --git a/drivers/platform/x86/intel/ifs/runtest.c 
b/drivers/platform/x86/intel/ifs/runtest.c
index 282e4bfe30da..2a37f009d0b3 100644
--- a/drivers/platform/x86/intel/ifs/runtest.c
+++ b/drivers/platform/x86/intel/ifs/runtest.c
@@ -29,6 +29,13 @@ struct run_params {
union ifs_status status;
 };
 
+struct sbaf_run_params {
+   struct ifs_data *ifsd;
+   int *retry_cnt;
+   union ifs_sbaf *activate;
+   union ifs_sbaf_status status;
+};
+
 /*
  * Number of TSC cycles that a logical CPU will wait for the other
  * logical CPU on the core in the WRMSR(ACTIVATE_SCAN).
@@ -146,6 +153,7 @@ static bool can_restart(union ifs_status status)
 #define SPINUNIT 100 /* 100 nsec */
 static atomic_t array_cpus_in;
 static atomic_t scan_cpus_in;
+static atomic_t sbaf_cpus_in;
 
 /*
  * Simplified cpu sibling rendezvous loop based on microcode loader 
__wait_for_cpus()
@@ -387,6 +395,224 @@ static void ifs_array_test_gen1(int cpu, struct device 
*dev)
ifsd->status = SCAN_TEST_PASS;
 }
 
+#define SBAF_STATUS_PASS   0
+#define SBAF_STATUS_SIGN_FAIL  1
+#define SBAF_STATUS_INTR   2
+#define SBAF_STATUS_TEST_FAIL  3
+
+enum sbaf_status_err_code {
+   IFS_SBAF_NO_ERROR   = 0,
+   IFS_SBAF_OTHER_THREAD_COULD_NOT_JOIN= 1,
+   IFS_SBAF_INTERRUPTED_BEFORE_RENDEZVOUS  = 2,
+   IFS_SBAF_UNASSIGNED_ERROR_CODE3 = 3,
+   IFS_SBAF_INVALID_BUNDLE_INDEX   = 4,
+   IFS_SBAF_MISMATCH_ARGS_BETWEEN_THREADS  = 5,
+   IFS_SBAF_CORE_NOT_CAPABLE_CURRENTLY = 6,
+   IFS_SBAF_UNASSIGNED_ERROR_CODE7 = 7,
+   IFS_SBAF_EXCEED_NUMBER_OF_THREADS_CONCURRENT= 8,
+   IFS_SBAF_INTERRUPTED_DURING_EXECUTION   = 9,
+   IFS_SBAF_INVALID_PROGRAM_INDEX  = 0xA,
+   IFS_SBAF_CORRUPTED_CHUNK= 0xB,
+   IFS_SBAF_DID_NOT_START  = 0xC,
+};
+

[PATCH v4 4/4] trace: platform/x86/intel/ifs: Add SBAF trace support

2024-07-31 Thread Kuppuswamy Sathyanarayanan
From: Jithu Joseph 

Add tracing support for the SBAF IFS tests, which may be useful for
debugging systems that fail these tests. Log details like test content
batch number, SBAF bundle ID, program index and the exact errors or
warnings encountered by each HT thread during the test.

Reviewed-by: Ashok Raj 
Reviewed-by: Tony Luck 
Reviewed-by: Ilpo Järvinen 
Reviewed-by: Steven Rostedt (Google) 
Signed-off-by: Jithu Joseph 
Signed-off-by: Kuppuswamy Sathyanarayanan 

---
 include/trace/events/intel_ifs.h | 27 
 drivers/platform/x86/intel/ifs/runtest.c |  1 +
 2 files changed, 28 insertions(+)

diff --git a/include/trace/events/intel_ifs.h b/include/trace/events/intel_ifs.h
index 0d88ebf2c980..70323acde1de 100644
--- a/include/trace/events/intel_ifs.h
+++ b/include/trace/events/intel_ifs.h
@@ -35,6 +35,33 @@ TRACE_EVENT(ifs_status,
__entry->status)
 );
 
+TRACE_EVENT(ifs_sbaf,
+
+   TP_PROTO(int batch, union ifs_sbaf activate, union ifs_sbaf_status 
status),
+
+   TP_ARGS(batch, activate, status),
+
+   TP_STRUCT__entry(
+   __field(u64,status  )
+   __field(int,batch   )
+   __field(u16,bundle  )
+   __field(u16,pgm )
+   ),
+
+   TP_fast_assign(
+   __entry->status = status.data;
+   __entry->batch  = batch;
+   __entry->bundle = activate.bundle_idx;
+   __entry->pgm= activate.pgm_idx;
+   ),
+
+   TP_printk("batch: 0x%.2x, bundle_idx: 0x%.4x, pgm_idx: 0x%.4x, status: 
0x%.16llx",
+   __entry->batch,
+   __entry->bundle,
+   __entry->pgm,
+   __entry->status)
+);
+
 #endif /* _TRACE_IFS_H */
 
 /* This part must be outside protection */
diff --git a/drivers/platform/x86/intel/ifs/runtest.c 
b/drivers/platform/x86/intel/ifs/runtest.c
index 2a37f009d0b3..7670fc89153d 100644
--- a/drivers/platform/x86/intel/ifs/runtest.c
+++ b/drivers/platform/x86/intel/ifs/runtest.c
@@ -528,6 +528,7 @@ static int dosbaf(void *data)
 */
wrmsrl(MSR_ACTIVATE_SBAF, run_params->activate->data);
rdmsrl(MSR_SBAF_STATUS, status.data);
+   trace_ifs_sbaf(ifsd->cur_batch, *run_params->activate, status);
 
/* Pass back the result of the test */
if (cpu == first)
-- 
2.25.1




Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Michael S. Tsirkin
On Thu, Aug 01, 2024 at 10:16:00AM +0800, Jason Wang wrote:
> > > @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct virtnet_info 
> > > *vi, struct dim *dim)
> > >   net_dim_work_cancel(dim);
> > >  }
> > >
> > > +static void virtnet_update_settings(struct virtnet_info *vi)
> > > +{
> > > + u32 speed;
> > > + u8 duplex;
> > > +
> > > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > > + return;
> > > +
> > > + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> > > +
> > > + if (ethtool_validate_speed(speed))
> > > + vi->speed = speed;
> > > +
> > > + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, 
> > > &duplex);
> > > +
> > > + if (ethtool_validate_duplex(duplex))
> > > + vi->duplex = duplex;
> > > +}
> > > +
> >
> > I already commented on this approach.  This is now invoked on each open,
> > lots of extra VM exits. No bueno, people are working hard to keep setup
> > overhead under control. Handle this in the config change interrupt -
> > your new infrastructure is perfect for this.
> 
> No, in this version it doesn't. Config space read only happens if
> there's a pending config interrupt during ndo_open:
> 
> +   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> +   if (vi->status & VIRTIO_NET_S_LINK_UP)
> +   netif_carrier_on(vi->dev);
> +   virtio_config_driver_enable(vi->vdev);
> +   } else {
> +   vi->status = VIRTIO_NET_S_LINK_UP;
> +   netif_carrier_on(dev);
> +   virtnet_update_settings(vi);
> +   }

Sorry for being unclear, I was referring to !VIRTIO_NET_F_STATUS.
I do not see why do we need to bother re-reading settings in this case at all,
status is not there, nothing much changes.


> >
> >
> > >  static int virtnet_open(struct net_device *dev)
> > >  {
> > >   struct virtnet_info *vi = netdev_priv(dev);
> > > @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
> > >   goto err_enable_qp;
> > >   }
> > >
> > > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > > + if (vi->status & VIRTIO_NET_S_LINK_UP)
> > > + netif_carrier_on(vi->dev);
> > > + virtio_config_driver_enable(vi->vdev);
> > > + } else {
> > > + vi->status = VIRTIO_NET_S_LINK_UP;
> > > + netif_carrier_on(dev);
> > > + virtnet_update_settings(vi);
> > > + }
> > > +
> > >   return 0;
> > >
> > >  err_enable_qp:
> > > @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device *dev)
> > >   disable_delayed_refill(vi);
> > >   /* Make sure refill_work doesn't re-enable napi! */
> > >   cancel_delayed_work_sync(&vi->refill);
> > > + /* Make sure config notification doesn't schedule config work */
> >
> > it's clear what this does even without a comment.
> > what you should comment on, and do not, is *why*.
> 
> Well, it just follows the existing style, for example the above said
> 
> "/* Make sure refill_work doesn't re-enable napi! */"

only at the grammar level.
you don't see the difference?

/* Make sure refill_work doesn't re-enable napi! */
cancel_delayed_work_sync(&vi->refill);

it explains why we cancel: to avoid re-enabling napi.

why do you cancel config callback and work?
comment should say that.



> >
> > > + virtio_config_driver_disable(vi->vdev);
> > > + /* Make sure status updating is cancelled */
> >
> > same
> >
> > also what "status updating"? confuses more than this clarifies.
> 
> Does "Make sure the config changed work is cancelled" sounds better?

no, this just repeats what code does.
explain why you cancel it.



-- 
MST




Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Michael S. Tsirkin
On Wed, Jul 31, 2024 at 10:59:47AM +0800, Jason Wang wrote:
> This patch synchronize operstate with admin state per RFC2863.
> 
> This is done by trying to toggle the carrier upon open/close and
> synchronize with the config change work. This allows propagate status
> correctly to stacked devices like:
> 
> ip link add link enp0s3 macvlan0 type macvlan
> ip link set link enp0s3 down
> ip link show
> 
> Before this patch:
> 
> 3: enp0s3:  mtu 1500 qdisc pfifo_fast state DOWN mode 
> DEFAULT group default qlen 1000
> link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> ..
> 5: macvlan0@enp0s3:  mtu 1500 qdisc 
> noqueue state UP mode DEFAULT group default qlen 1000
> link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> 
> After this patch:
> 
> 3: enp0s3:  mtu 1500 qdisc pfifo_fast state DOWN mode 
> DEFAULT group default qlen 1000
> link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> ...
> 5: macvlan0@enp0s3:  mtu 1500 qdisc 
> noqueue state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
> link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> 
> Cc: Venkat Venkatsubra 
> Cc: Gia-Khanh Nguyen 
> Signed-off-by: Jason Wang 
> ---
>  drivers/net/virtio_net.c | 84 ++--
>  1 file changed, 54 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0383a3e136d6..0cb93261eba1 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2878,6 +2878,7 @@ static int virtnet_enable_queue_pair(struct 
> virtnet_info *vi, int qp_index)
>   return err;
>  }
>  
> +
>  static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim)
>  {
>   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
> @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct virtnet_info 
> *vi, struct dim *dim)
>   net_dim_work_cancel(dim);
>  }
>  
> +static void virtnet_update_settings(struct virtnet_info *vi)
> +{
> + u32 speed;
> + u8 duplex;
> +
> + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> + return;
> +
> + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> +
> + if (ethtool_validate_speed(speed))
> + vi->speed = speed;
> +
> + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
> +
> + if (ethtool_validate_duplex(duplex))
> + vi->duplex = duplex;
> +}
> +
>  static int virtnet_open(struct net_device *dev)
>  {
>   struct virtnet_info *vi = netdev_priv(dev);
> @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
>   goto err_enable_qp;
>   }
>  
> + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> + if (vi->status & VIRTIO_NET_S_LINK_UP)
> + netif_carrier_on(vi->dev);
> + virtio_config_driver_enable(vi->vdev);
> + } else {
> + vi->status = VIRTIO_NET_S_LINK_UP;
> + netif_carrier_on(dev);
> + virtnet_update_settings(vi);
> + }
> +
>   return 0;
>  
>  err_enable_qp:
> @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device *dev)
>   disable_delayed_refill(vi);
>   /* Make sure refill_work doesn't re-enable napi! */
>   cancel_delayed_work_sync(&vi->refill);
> + /* Make sure config notification doesn't schedule config work */
> + virtio_config_driver_disable(vi->vdev);
> + /* Make sure status updating is cancelled */
> + cancel_work_sync(&vi->config_work);
>  
>   for (i = 0; i < vi->max_queue_pairs; i++) {
>   virtnet_disable_queue_pair(vi, i);
>   virtnet_cancel_dim(vi, &vi->rq[i].dim);
>   }
>  
> + netif_carrier_off(dev);
> +
>   return 0;
>  }
>  
> @@ -5085,25 +5121,6 @@ static void virtnet_init_settings(struct net_device 
> *dev)
>   vi->duplex = DUPLEX_UNKNOWN;
>  }
>  
> -static void virtnet_update_settings(struct virtnet_info *vi)
> -{
> - u32 speed;
> - u8 duplex;
> -
> - if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> - return;
> -
> - virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> -
> - if (ethtool_validate_speed(speed))
> - vi->speed = speed;
> -
> - virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
> -
> - if (ethtool_validate_duplex(duplex))
> - vi->duplex = duplex;
> -}
> -
>  static u32 virtnet_get_rxfh_key_size(struct net_device *dev)
>  {
>   return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size;
> @@ -6514,6 +6531,11 @@ static int virtnet_probe(struct virtio_device *vdev)
>   goto free_failover;
>   }
>  
> + /* Forbid config change notification until ndo_open. */
> + virtio_config_driver_disable(vi->vdev);
> + /* Make sure status updating work is done */

Wait a second, how can anything run here, this is probe,
config change callbacks are never i

Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Jason Wang
On Thu, Aug 1, 2024 at 1:58 PM Michael S. Tsirkin  wrote:
>
> On Thu, Aug 01, 2024 at 10:16:00AM +0800, Jason Wang wrote:
> > > > @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct 
> > > > virtnet_info *vi, struct dim *dim)
> > > >   net_dim_work_cancel(dim);
> > > >  }
> > > >
> > > > +static void virtnet_update_settings(struct virtnet_info *vi)
> > > > +{
> > > > + u32 speed;
> > > > + u8 duplex;
> > > > +
> > > > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > > > + return;
> > > > +
> > > > + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, 
> > > > &speed);
> > > > +
> > > > + if (ethtool_validate_speed(speed))
> > > > + vi->speed = speed;
> > > > +
> > > > + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, 
> > > > &duplex);
> > > > +
> > > > + if (ethtool_validate_duplex(duplex))
> > > > + vi->duplex = duplex;
> > > > +}
> > > > +
> > >
> > > I already commented on this approach.  This is now invoked on each open,
> > > lots of extra VM exits. No bueno, people are working hard to keep setup
> > > overhead under control. Handle this in the config change interrupt -
> > > your new infrastructure is perfect for this.
> >
> > No, in this version it doesn't. Config space read only happens if
> > there's a pending config interrupt during ndo_open:
> >
> > +   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > +   if (vi->status & VIRTIO_NET_S_LINK_UP)
> > +   netif_carrier_on(vi->dev);
> > +   virtio_config_driver_enable(vi->vdev);
> > +   } else {
> > +   vi->status = VIRTIO_NET_S_LINK_UP;
> > +   netif_carrier_on(dev);
> > +   virtnet_update_settings(vi);
> > +   }
>
> Sorry for being unclear, I was referring to !VIRTIO_NET_F_STATUS.
> I do not see why do we need to bother re-reading settings in this case at all,
> status is not there, nothing much changes.

Ok, let me remove it from the next version.

>
>
> > >
> > >
> > > >  static int virtnet_open(struct net_device *dev)
> > > >  {
> > > >   struct virtnet_info *vi = netdev_priv(dev);
> > > > @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
> > > >   goto err_enable_qp;
> > > >   }
> > > >
> > > > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > > > + if (vi->status & VIRTIO_NET_S_LINK_UP)
> > > > + netif_carrier_on(vi->dev);
> > > > + virtio_config_driver_enable(vi->vdev);
> > > > + } else {
> > > > + vi->status = VIRTIO_NET_S_LINK_UP;
> > > > + netif_carrier_on(dev);
> > > > + virtnet_update_settings(vi);
> > > > + }
> > > > +
> > > >   return 0;
> > > >
> > > >  err_enable_qp:
> > > > @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device *dev)
> > > >   disable_delayed_refill(vi);
> > > >   /* Make sure refill_work doesn't re-enable napi! */
> > > >   cancel_delayed_work_sync(&vi->refill);
> > > > + /* Make sure config notification doesn't schedule config work */
> > >
> > > it's clear what this does even without a comment.
> > > what you should comment on, and do not, is *why*.
> >
> > Well, it just follows the existing style, for example the above said
> >
> > "/* Make sure refill_work doesn't re-enable napi! */"
>
> only at the grammar level.
> you don't see the difference?
>
> /* Make sure refill_work doesn't re-enable napi! */
> cancel_delayed_work_sync(&vi->refill);
>
> it explains why we cancel: to avoid re-enabling napi.
>
> why do you cancel config callback and work?
> comment should say that.

Something like "Prevent the config change callback from changing
carrier after close"?

>
>
>
> > >
> > > > + virtio_config_driver_disable(vi->vdev);
> > > > + /* Make sure status updating is cancelled */
> > >
> > > same
> > >
> > > also what "status updating"? confuses more than this clarifies.
> >
> > Does "Make sure the config changed work is cancelled" sounds better?
>
> no, this just repeats what code does.
> explain why you cancel it.

Does something like "Make sure carrier changes have been done by the
config change callback" work?

Thanks

>
>
>
> --
> MST
>




Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Jason Wang
On Thu, Aug 1, 2024 at 2:06 PM Michael S. Tsirkin  wrote:
>
> On Wed, Jul 31, 2024 at 10:59:47AM +0800, Jason Wang wrote:
> > This patch synchronize operstate with admin state per RFC2863.
> >
> > This is done by trying to toggle the carrier upon open/close and
> > synchronize with the config change work. This allows propagate status
> > correctly to stacked devices like:
> >
> > ip link add link enp0s3 macvlan0 type macvlan
> > ip link set link enp0s3 down
> > ip link show
> >
> > Before this patch:
> >
> > 3: enp0s3:  mtu 1500 qdisc pfifo_fast state DOWN mode 
> > DEFAULT group default qlen 1000
> > link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> > ..
> > 5: macvlan0@enp0s3:  mtu 1500 qdisc 
> > noqueue state UP mode DEFAULT group default qlen 1000
> > link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> >
> > After this patch:
> >
> > 3: enp0s3:  mtu 1500 qdisc pfifo_fast state DOWN mode 
> > DEFAULT group default qlen 1000
> > link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> > ...
> > 5: macvlan0@enp0s3:  mtu 1500 
> > qdisc noqueue state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
> > link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> >
> > Cc: Venkat Venkatsubra 
> > Cc: Gia-Khanh Nguyen 
> > Signed-off-by: Jason Wang 
> > ---
> >  drivers/net/virtio_net.c | 84 ++--
> >  1 file changed, 54 insertions(+), 30 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 0383a3e136d6..0cb93261eba1 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -2878,6 +2878,7 @@ static int virtnet_enable_queue_pair(struct 
> > virtnet_info *vi, int qp_index)
> >   return err;
> >  }
> >
> > +
> >  static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim)
> >  {
> >   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
> > @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct virtnet_info 
> > *vi, struct dim *dim)
> >   net_dim_work_cancel(dim);
> >  }
> >
> > +static void virtnet_update_settings(struct virtnet_info *vi)
> > +{
> > + u32 speed;
> > + u8 duplex;
> > +
> > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > + return;
> > +
> > + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> > +
> > + if (ethtool_validate_speed(speed))
> > + vi->speed = speed;
> > +
> > + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
> > +
> > + if (ethtool_validate_duplex(duplex))
> > + vi->duplex = duplex;
> > +}
> > +
> >  static int virtnet_open(struct net_device *dev)
> >  {
> >   struct virtnet_info *vi = netdev_priv(dev);
> > @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
> >   goto err_enable_qp;
> >   }
> >
> > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > + if (vi->status & VIRTIO_NET_S_LINK_UP)
> > + netif_carrier_on(vi->dev);
> > + virtio_config_driver_enable(vi->vdev);
> > + } else {
> > + vi->status = VIRTIO_NET_S_LINK_UP;
> > + netif_carrier_on(dev);
> > + virtnet_update_settings(vi);
> > + }
> > +
> >   return 0;
> >
> >  err_enable_qp:
> > @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device *dev)
> >   disable_delayed_refill(vi);
> >   /* Make sure refill_work doesn't re-enable napi! */
> >   cancel_delayed_work_sync(&vi->refill);
> > + /* Make sure config notification doesn't schedule config work */
> > + virtio_config_driver_disable(vi->vdev);
> > + /* Make sure status updating is cancelled */
> > + cancel_work_sync(&vi->config_work);
> >
> >   for (i = 0; i < vi->max_queue_pairs; i++) {
> >   virtnet_disable_queue_pair(vi, i);
> >   virtnet_cancel_dim(vi, &vi->rq[i].dim);
> >   }
> >
> > + netif_carrier_off(dev);
> > +
> >   return 0;
> >  }
> >
> > @@ -5085,25 +5121,6 @@ static void virtnet_init_settings(struct net_device 
> > *dev)
> >   vi->duplex = DUPLEX_UNKNOWN;
> >  }
> >
> > -static void virtnet_update_settings(struct virtnet_info *vi)
> > -{
> > - u32 speed;
> > - u8 duplex;
> > -
> > - if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > - return;
> > -
> > - virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> > -
> > - if (ethtool_validate_speed(speed))
> > - vi->speed = speed;
> > -
> > - virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
> > -
> > - if (ethtool_validate_duplex(duplex))
> > - vi->duplex = duplex;
> > -}
> > -
> >  static u32 virtnet_get_rxfh_key_size(struct net_device *dev)
> >  {
> >   return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size;
> > @@ -6514,6 +6531,11 @@ static int virtnet_probe(struct virtio_de

Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Michael S. Tsirkin
On Thu, Aug 01, 2024 at 02:13:18PM +0800, Jason Wang wrote:
> On Thu, Aug 1, 2024 at 1:58 PM Michael S. Tsirkin  wrote:
> >
> > On Thu, Aug 01, 2024 at 10:16:00AM +0800, Jason Wang wrote:
> > > > > @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct 
> > > > > virtnet_info *vi, struct dim *dim)
> > > > >   net_dim_work_cancel(dim);
> > > > >  }
> > > > >
> > > > > +static void virtnet_update_settings(struct virtnet_info *vi)
> > > > > +{
> > > > > + u32 speed;
> > > > > + u8 duplex;
> > > > > +
> > > > > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > > > > + return;
> > > > > +
> > > > > + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, 
> > > > > &speed);
> > > > > +
> > > > > + if (ethtool_validate_speed(speed))
> > > > > + vi->speed = speed;
> > > > > +
> > > > > + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, 
> > > > > &duplex);
> > > > > +
> > > > > + if (ethtool_validate_duplex(duplex))
> > > > > + vi->duplex = duplex;
> > > > > +}
> > > > > +
> > > >
> > > > I already commented on this approach.  This is now invoked on each open,
> > > > lots of extra VM exits. No bueno, people are working hard to keep setup
> > > > overhead under control. Handle this in the config change interrupt -
> > > > your new infrastructure is perfect for this.
> > >
> > > No, in this version it doesn't. A config space read only happens if
> > > there's a pending config interrupt during ndo_open:
> > >
> > > +   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > > +   if (vi->status & VIRTIO_NET_S_LINK_UP)
> > > +   netif_carrier_on(vi->dev);
> > > +   virtio_config_driver_enable(vi->vdev);
> > > +   } else {
> > > +   vi->status = VIRTIO_NET_S_LINK_UP;
> > > +   netif_carrier_on(dev);
> > > +   virtnet_update_settings(vi);
> > > +   }
> >
> > Sorry for being unclear, I was referring to !VIRTIO_NET_F_STATUS.
> > I do not see why we need to bother re-reading settings in this case at all;
> > the status is not there, so nothing much changes.
> 
> Ok, let me remove it from the next version.
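
For context, the virtio_config_driver_enable()/_disable() pair used here comes
from an earlier patch in this series. A rough sketch of the gating idea (the
field names below are assumptions, not the exact posted code):

void virtio_config_driver_disable(struct virtio_device *dev)
{
	/* From here on, the config-change interrupt handler is assumed to
	 * only record a pending flag instead of scheduling config_work. */
	spin_lock_irq(&dev->config_lock);
	dev->config_driver_disabled = true;	/* assumed field */
	spin_unlock_irq(&dev->config_lock);
}

void virtio_config_driver_enable(struct virtio_device *dev)
{
	bool pending;

	spin_lock_irq(&dev->config_lock);
	dev->config_driver_disabled = false;
	pending = dev->config_change_pending;	/* assumed field */
	dev->config_change_pending = false;
	spin_unlock_irq(&dev->config_lock);

	/* Replay a notification that arrived while notifications were
	 * disabled; only this path leads to a config-space read on open. */
	if (pending)
		virtio_config_changed(dev);
}

So ndo_open only pays for a config-space read when a config interrupt actually
fired while the device was closed.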
> 
> >
> >
> > > >
> > > >
> > > > >  static int virtnet_open(struct net_device *dev)
> > > > >  {
> > > > >   struct virtnet_info *vi = netdev_priv(dev);
> > > > > @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
> > > > >   goto err_enable_qp;
> > > > >   }
> > > > >
> > > > > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > > > > + if (vi->status & VIRTIO_NET_S_LINK_UP)
> > > > > + netif_carrier_on(vi->dev);
> > > > > + virtio_config_driver_enable(vi->vdev);
> > > > > + } else {
> > > > > + vi->status = VIRTIO_NET_S_LINK_UP;
> > > > > + netif_carrier_on(dev);
> > > > > + virtnet_update_settings(vi);
> > > > > + }
> > > > > +
> > > > >   return 0;
> > > > >
> > > > >  err_enable_qp:
> > > > > @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device 
> > > > > *dev)
> > > > >   disable_delayed_refill(vi);
> > > > >   /* Make sure refill_work doesn't re-enable napi! */
> > > > >   cancel_delayed_work_sync(&vi->refill);
> > > > > + /* Make sure config notification doesn't schedule config work */
> > > >
> > > > it's clear what this does even without a comment.
> > > > what you should comment on, and do not, is *why*.
> > >
> > > Well, it just follows the existing style; for example, the comment above says
> > >
> > > "/* Make sure refill_work doesn't re-enable napi! */"
> >
> > only at the grammar level.
> > you don't see the difference?
> >
> > /* Make sure refill_work doesn't re-enable napi! */
> > cancel_delayed_work_sync(&vi->refill);
> >
> > it explains why we cancel: to avoid re-enabling napi.
> >
> > why do you cancel config callback and work?
> > comment should say that.
> 
> Something like "Prevent the config change callback from changing
> carrier after close"?


sounds good.

> >
> >
> >
> > > >
> > > > > + virtio_config_driver_disable(vi->vdev);
> > > > > + /* Make sure status updating is cancelled */
> > > >
> > > > same
> > > >
> > > > also, what "status updating"? this confuses more than it clarifies.
> > >
> > > Does "Make sure the config changed work is cancelled" sounds better?
> >
> > no, this just repeats what code does.
> > explain why you cancel it.
> 
> Does something like "Make sure carrier changes have been done by the
> config change callback" work?
> 
> Thanks

I don't understand what this means.

> >
> >
> >
> > --
> > MST
> >
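
Putting the agreed wording together, the close path in a next spin might read
as follows; this is a sketch based on the quoted diff, with the second comment
being the one still debated above:

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	disable_delayed_refill(vi);
	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);
	/* Prevent the config change callback from changing the carrier
	 * after close */
	virtio_config_driver_disable(vi->vdev);
	/* Don't let an in-flight config_work overwrite the carrier state
	 * we set below */
	cancel_work_sync(&vi->config_work);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		virtnet_disable_queue_pair(vi, i);
		virtnet_cancel_dim(vi, &vi->rq[i].dim);
	}

	netif_carrier_off(dev);

	return 0;
}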




Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Michael S. Tsirkin
On Thu, Aug 01, 2024 at 02:13:49PM +0800, Jason Wang wrote:
> On Thu, Aug 1, 2024 at 2:06 PM Michael S. Tsirkin  wrote:
> >
> > On Wed, Jul 31, 2024 at 10:59:47AM +0800, Jason Wang wrote:
> > > This patch synchronizes operstate with admin state per RFC 2863.
> > >
> > > This is done by toggling the carrier upon open/close and synchronizing
> > > with the config change work. This allows the status to be propagated
> > > correctly to stacked devices, e.g.:
> > >
> > > ip link add link enp0s3 macvlan0 type macvlan
> > > ip link set link enp0s3 down
> > > ip link show
> > >
> > > Before this patch:
> > >
> > > 3: enp0s3: <BROADCAST,MULTICAST> mtu 1500 qdisc pfifo_fast state DOWN 
> > > mode DEFAULT group default qlen 1000
> > > link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> > > ...
> > > 5: macvlan0@enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 
> > > qdisc noqueue state UP mode DEFAULT group default qlen 1000
> > > link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> > >
> > > After this patch:
> > >
> > > 3: enp0s3: <BROADCAST,MULTICAST> mtu 1500 qdisc pfifo_fast state DOWN 
> > > mode DEFAULT group default qlen 1000
> > > link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> > > ...
> > > 5: macvlan0@enp0s3: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 
> > > qdisc noqueue state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
> > > link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> > >
> > > Cc: Venkat Venkatsubra 
> > > Cc: Gia-Khanh Nguyen 
> > > Signed-off-by: Jason Wang 
> > > ---
> > >  drivers/net/virtio_net.c | 84 ++--
> > >  1 file changed, 54 insertions(+), 30 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 0383a3e136d6..0cb93261eba1 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -2878,6 +2878,7 @@ static int virtnet_enable_queue_pair(struct 
> > > virtnet_info *vi, int qp_index)
> > >   return err;
> > >  }
> > >
> > > +
> > >  static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim)
> > >  {
> > >   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
> > > @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct virtnet_info 
> > > *vi, struct dim *dim)
> > >   net_dim_work_cancel(dim);
> > >  }
> > >
> > > +static void virtnet_update_settings(struct virtnet_info *vi)
> > > +{
> > > + u32 speed;
> > > + u8 duplex;
> > > +
> > > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > > + return;
> > > +
> > > + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> > > +
> > > + if (ethtool_validate_speed(speed))
> > > + vi->speed = speed;
> > > +
> > > + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, 
> > > &duplex);
> > > +
> > > + if (ethtool_validate_duplex(duplex))
> > > + vi->duplex = duplex;
> > > +}
> > > +
> > >  static int virtnet_open(struct net_device *dev)
> > >  {
> > >   struct virtnet_info *vi = netdev_priv(dev);
> > > @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
> > >   goto err_enable_qp;
> > >   }
> > >
> > > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > > + if (vi->status & VIRTIO_NET_S_LINK_UP)
> > > + netif_carrier_on(vi->dev);
> > > + virtio_config_driver_enable(vi->vdev);
> > > + } else {
> > > + vi->status = VIRTIO_NET_S_LINK_UP;
> > > + netif_carrier_on(dev);
> > > + virtnet_update_settings(vi);
> > > + }
> > > +
> > >   return 0;
> > >
> > >  err_enable_qp:
> > > @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device *dev)
> > >   disable_delayed_refill(vi);
> > >   /* Make sure refill_work doesn't re-enable napi! */
> > >   cancel_delayed_work_sync(&vi->refill);
> > > + /* Make sure config notification doesn't schedule config work */
> > > + virtio_config_driver_disable(vi->vdev);
> > > + /* Make sure status updating is cancelled */
> > > + cancel_work_sync(&vi->config_work);
> > >
> > >   for (i = 0; i < vi->max_queue_pairs; i++) {
> > >   virtnet_disable_queue_pair(vi, i);
> > >   virtnet_cancel_dim(vi, &vi->rq[i].dim);
> > >   }
> > >
> > > + netif_carrier_off(dev);
> > > +
> > >   return 0;
> > >  }
> > >
> > > @@ -5085,25 +5121,6 @@ static void virtnet_init_settings(struct 
> > > net_device *dev)
> > >   vi->duplex = DUPLEX_UNKNOWN;
> > >  }
> > >
> > > -static void virtnet_update_settings(struct virtnet_info *vi)
> > > -{
> > > - u32 speed;
> > > - u8 duplex;
> > > -
> > > - if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > > - return;
> > > -
> > > - virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
> > > -
> > > - if (ethtool_validate_speed(speed))
> > > - vi->speed = speed;
> > > -
> > > - virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
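
The macvlan example in the commit message quoted above works because the net
core recomputes a stacked device's operstate from the lower device's carrier,
per RFC 2863. A much simplified sketch of that behaviour (not verbatim kernel
code):

/* Sketch: why the upper device (macvlan0) reports LOWERLAYERDOWN once the
 * lower virtio-net device keeps its carrier in sync with its admin state. */
static unsigned char effective_operstate(const struct net_device *lower)
{
	if (!netif_carrier_ok(lower))
		return IF_OPER_LOWERLAYERDOWN;	/* printed by `ip link` */
	return IF_OPER_UP;
}

Before this patch the lower device left its carrier on even when set
administratively down, so the macvlan stacked on top incorrectly stayed UP.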

Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Jason Wang
On Thu, Aug 1, 2024 at 2:42 PM Michael S. Tsirkin  wrote:
>
> On Thu, Aug 01, 2024 at 02:13:18PM +0800, Jason Wang wrote:
> > On Thu, Aug 1, 2024 at 1:58 PM Michael S. Tsirkin  wrote:
> > >
> > > On Thu, Aug 01, 2024 at 10:16:00AM +0800, Jason Wang wrote:
> > > > > > @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct 
> > > > > > virtnet_info *vi, struct dim *dim)
> > > > > >   net_dim_work_cancel(dim);
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_update_settings(struct virtnet_info *vi)
> > > > > > +{
> > > > > > + u32 speed;
> > > > > > + u8 duplex;
> > > > > > +
> > > > > > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > > > > > + return;
> > > > > > +
> > > > > > + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, 
> > > > > > &speed);
> > > > > > +
> > > > > > + if (ethtool_validate_speed(speed))
> > > > > > + vi->speed = speed;
> > > > > > +
> > > > > > + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, 
> > > > > > &duplex);
> > > > > > +
> > > > > > + if (ethtool_validate_duplex(duplex))
> > > > > > + vi->duplex = duplex;
> > > > > > +}
> > > > > > +
> > > > >
> > > > > I already commented on this approach.  This is now invoked on each 
> > > > > open,
> > > > > lots of extra VM exits. No bueno, people are working hard to keep 
> > > > > setup
> > > > > overhead under control. Handle this in the config change interrupt -
> > > > > your new infrastructure is perfect for this.
> > > >
> > > > No, in this version it doesn't. A config space read only happens if
> > > > there's a pending config interrupt during ndo_open:
> > > >
> > > > +   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > > > +   if (vi->status & VIRTIO_NET_S_LINK_UP)
> > > > +   netif_carrier_on(vi->dev);
> > > > +   virtio_config_driver_enable(vi->vdev);
> > > > +   } else {
> > > > +   vi->status = VIRTIO_NET_S_LINK_UP;
> > > > +   netif_carrier_on(dev);
> > > > +   virtnet_update_settings(vi);
> > > > +   }
> > >
> > > Sorry for being unclear, I was referring to !VIRTIO_NET_F_STATUS.
> > > I do not see why we need to bother re-reading settings in this case at all;
> > > the status is not there, so nothing much changes.
> >
> > Ok, let me remove it from the next version.
> >
> > >
> > >
> > > > >
> > > > >
> > > > > >  static int virtnet_open(struct net_device *dev)
> > > > > >  {
> > > > > >   struct virtnet_info *vi = netdev_priv(dev);
> > > > > > @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device 
> > > > > > *dev)
> > > > > >   goto err_enable_qp;
> > > > > >   }
> > > > > >
> > > > > > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > > > > > + if (vi->status & VIRTIO_NET_S_LINK_UP)
> > > > > > + netif_carrier_on(vi->dev);
> > > > > > + virtio_config_driver_enable(vi->vdev);
> > > > > > + } else {
> > > > > > + vi->status = VIRTIO_NET_S_LINK_UP;
> > > > > > + netif_carrier_on(dev);
> > > > > > + virtnet_update_settings(vi);
> > > > > > + }
> > > > > > +
> > > > > >   return 0;
> > > > > >
> > > > > >  err_enable_qp:
> > > > > > @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device 
> > > > > > *dev)
> > > > > >   disable_delayed_refill(vi);
> > > > > >   /* Make sure refill_work doesn't re-enable napi! */
> > > > > >   cancel_delayed_work_sync(&vi->refill);
> > > > > > + /* Make sure config notification doesn't schedule config work 
> > > > > > */
> > > > >
> > > > > it's clear what this does even without a comment.
> > > > > what you should comment on, and do not, is *why*.
> > > >
> > > > Well, it just follows the existing style; for example, the comment above says
> > > >
> > > > "/* Make sure refill_work doesn't re-enable napi! */"
> > >
> > > only at the grammar level.
> > > you don't see the difference?
> > >
> > > /* Make sure refill_work doesn't re-enable napi! */
> > > cancel_delayed_work_sync(&vi->refill);
> > >
> > > it explains why we cancel: to avoid re-enabling napi.
> > >
> > > why do you cancel config callback and work?
> > > comment should say that.
> >
> > Something like "Prevent the config change callback from changing
> > carrier after close"?
>
>
> sounds good.
>
> > >
> > >
> > >
> > > > >
> > > > > > + virtio_config_driver_disable(vi->vdev);
> > > > > > + /* Make sure status updating is cancelled */
> > > > >
> > > > > same
> > > > >
> > > > > also, what "status updating"? this confuses more than it clarifies.
> > > >
> > > > Does "Make sure the config changed work is cancelled" sounds better?
> > >
> > > no, this just repeats what code does.
> > > explain why you cancel it.
> >
> > Does something like "Make sure carrier changes have been done by the
> > config change callback" work?

Re: [PATCH V4 net-next 3/3] virtio-net: synchronize operstate with admin state on up/down

2024-07-31 Thread Jason Wang
On Thu, Aug 1, 2024 at 2:43 PM Michael S. Tsirkin  wrote:
>
> On Thu, Aug 01, 2024 at 02:13:49PM +0800, Jason Wang wrote:
> > On Thu, Aug 1, 2024 at 2:06 PM Michael S. Tsirkin  wrote:
> > >
> > > On Wed, Jul 31, 2024 at 10:59:47AM +0800, Jason Wang wrote:
> > > > This patch synchronizes operstate with admin state per RFC 2863.
> > > >
> > > > This is done by toggling the carrier upon open/close and synchronizing
> > > > with the config change work. This allows the status to be propagated
> > > > correctly to stacked devices, e.g.:
> > > >
> > > > ip link add link enp0s3 macvlan0 type macvlan
> > > > ip link set link enp0s3 down
> > > > ip link show
> > > >
> > > > Before this patch:
> > > >
> > > > 3: enp0s3: <BROADCAST,MULTICAST> mtu 1500 qdisc pfifo_fast state DOWN 
> > > > mode DEFAULT group default qlen 1000
> > > > link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> > > > ...
> > > > 5: macvlan0@enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 
> > > > qdisc noqueue state UP mode DEFAULT group default qlen 1000
> > > > link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> > > >
> > > > After this patch:
> > > >
> > > > 3: enp0s3: <BROADCAST,MULTICAST> mtu 1500 qdisc pfifo_fast state DOWN 
> > > > mode DEFAULT group default qlen 1000
> > > > link/ether 00:00:05:00:00:09 brd ff:ff:ff:ff:ff:ff
> > > > ...
> > > > 5: macvlan0@enp0s3: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 
> > > > qdisc noqueue state LOWERLAYERDOWN mode DEFAULT group default qlen 1000
> > > > link/ether b2:a9:c5:04:da:53 brd ff:ff:ff:ff:ff:ff
> > > >
> > > > Cc: Venkat Venkatsubra 
> > > > Cc: Gia-Khanh Nguyen 
> > > > Signed-off-by: Jason Wang 
> > > > ---
> > > >  drivers/net/virtio_net.c | 84 ++--
> > > >  1 file changed, 54 insertions(+), 30 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 0383a3e136d6..0cb93261eba1 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -2878,6 +2878,7 @@ static int virtnet_enable_queue_pair(struct 
> > > > virtnet_info *vi, int qp_index)
> > > >   return err;
> > > >  }
> > > >
> > > > +
> > > >  static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim 
> > > > *dim)
> > > >  {
> > > >   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
> > > > @@ -2885,6 +2886,25 @@ static void virtnet_cancel_dim(struct 
> > > > virtnet_info *vi, struct dim *dim)
> > > >   net_dim_work_cancel(dim);
> > > >  }
> > > >
> > > > +static void virtnet_update_settings(struct virtnet_info *vi)
> > > > +{
> > > > + u32 speed;
> > > > + u8 duplex;
> > > > +
> > > > + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
> > > > + return;
> > > > +
> > > > + virtio_cread_le(vi->vdev, struct virtio_net_config, speed, 
> > > > &speed);
> > > > +
> > > > + if (ethtool_validate_speed(speed))
> > > > + vi->speed = speed;
> > > > +
> > > > + virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, 
> > > > &duplex);
> > > > +
> > > > + if (ethtool_validate_duplex(duplex))
> > > > + vi->duplex = duplex;
> > > > +}
> > > > +
> > > >  static int virtnet_open(struct net_device *dev)
> > > >  {
> > > >   struct virtnet_info *vi = netdev_priv(dev);
> > > > @@ -2903,6 +2923,16 @@ static int virtnet_open(struct net_device *dev)
> > > >   goto err_enable_qp;
> > > >   }
> > > >
> > > > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > > > + if (vi->status & VIRTIO_NET_S_LINK_UP)
> > > > + netif_carrier_on(vi->dev);
> > > > + virtio_config_driver_enable(vi->vdev);
> > > > + } else {
> > > > + vi->status = VIRTIO_NET_S_LINK_UP;
> > > > + netif_carrier_on(dev);
> > > > + virtnet_update_settings(vi);
> > > > + }
> > > > +
> > > >   return 0;
> > > >
> > > >  err_enable_qp:
> > > > @@ -3381,12 +3411,18 @@ static int virtnet_close(struct net_device *dev)
> > > >   disable_delayed_refill(vi);
> > > >   /* Make sure refill_work doesn't re-enable napi! */
> > > >   cancel_delayed_work_sync(&vi->refill);
> > > > + /* Make sure config notification doesn't schedule config work */
> > > > + virtio_config_driver_disable(vi->vdev);
> > > > + /* Make sure status updating is cancelled */
> > > > + cancel_work_sync(&vi->config_work);
> > > >
> > > >   for (i = 0; i < vi->max_queue_pairs; i++) {
> > > >   virtnet_disable_queue_pair(vi, i);
> > > >   virtnet_cancel_dim(vi, &vi->rq[i].dim);
> > > >   }
> > > >
> > > > + netif_carrier_off(dev);
> > > > +
> > > >   return 0;
> > > >  }
> > > >
> > > > @@ -5085,25 +5121,6 @@ static void virtnet_init_settings(struct 
> > > > net_device *dev)
> > > >   vi->duplex = DUPLEX_UNKNOWN;
> > > >  }
> > > >
> > > > -static void virtnet_update_settings(struct virtnet_info *vi)
> > > > -{
> > > > - u32 speed;
> > > > - u8 duplex;
> > > > -
> > > > - if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))