On 12/7/20 2:06 PM, Boris Pismenny wrote:
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56644e7..fb35dcac03d2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -84,6 +84,7 @@ enum {
> 	NETIF_F_GRO_FRAGLIST_BIT,	/* Fraglist GRO */
>
> 	NETIF_F_HW_MACSEC_BIT,		/* Offload MACsec operations */
> +	NETIF_F_HW_TCP_DDP_BIT,		/* TCP direct data placement offload */
>
> 	/*
> 	 * Add your fresh new feature above and remember to update
> @@ -157,6 +158,7 @@ enum {
> #define NETIF_F_GRO_FRAGLIST	__NETIF_F(GRO_FRAGLIST)
> #define NETIF_F_GSO_FRAGLIST	__NETIF_F(GSO_FRAGLIST)
> #define NETIF_F_HW_MACSEC	__NETIF_F(HW_MACSEC)
> +#define NETIF_F_HW_TCP_DDP	__NETIF_F(HW_TCP_DDP)
All of the DDP naming seems wrong to me. I realize the specific use case is
targeted placement of a ULP's payloads, but it is still S/W handing H/W
specific buffers for the payload of a flow.

>
> /* Finds the next feature with the highest number of the range of start till 0.
>  */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index a07c8e431f45..755766976408 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -934,6 +934,7 @@ struct dev_ifalias {
>
> struct devlink;
> struct tlsdev_ops;
> +struct tcp_ddp_dev_ops;
>
> struct netdev_name_node {
> 	struct hlist_node hlist;
> @@ -1930,6 +1931,10 @@ struct net_device {
> 	const struct tlsdev_ops *tlsdev_ops;
> #endif
>
> +#ifdef CONFIG_TCP_DDP
> +	const struct tcp_ddp_dev_ops *tcp_ddp_ops;
> +#endif
> +
> 	const struct header_ops *header_ops;
>
> 	unsigned int flags;
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 7338b3865a2a..a08b85b53aa8 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -66,6 +66,8 @@ struct inet_connection_sock_af_ops {
>  * @icsk_ulp_ops	   Pluggable ULP control hook
>  * @icsk_ulp_data	   ULP private data
>  * @icsk_clean_acked	   Clean acked data hook
> + * @icsk_ulp_ddp_ops	   Pluggable ULP direct data placement control hook
> + * @icsk_ulp_ddp_data	   ULP direct data placement private data

Neither of these socket layer intrusions is needed. All references but one --
the skbuff check -- are in the mlx5 driver. Any skb check that is needed can
be handled with a different setting.

>  * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
>  * @icsk_ca_state:	   Congestion control state
>  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
> @@ -94,6 +96,8 @@ struct inet_connection_sock {
> 	const struct tcp_ulp_ops  *icsk_ulp_ops;
> 	void __rcu		  *icsk_ulp_data;
> 	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
> +	const struct tcp_ddp_ulp_ops  *icsk_ulp_ddp_ops;
> +	void __rcu		  *icsk_ulp_ddp_data;
> 	struct hlist_node	  icsk_listen_portaddr_node;
> 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
> 	__u8			  icsk_ca_state:5,
> diff --git a/include/net/tcp_ddp.h b/include/net/tcp_ddp.h
> new file mode 100644
> index 000000000000..df3264be4600
> --- /dev/null
> +++ b/include/net/tcp_ddp.h
> @@ -0,0 +1,129 @@
> +/* SPDX-License-Identifier: GPL-2.0
> + *
> + * tcp_ddp.h
> + *	Author:	Boris Pismenny <bor...@mellanox.com>
> + *	Copyright (C) 2020 Mellanox Technologies.
> + */
> +#ifndef _TCP_DDP_H
> +#define _TCP_DDP_H
> +
> +#include <linux/netdevice.h>
> +#include <net/inet_connection_sock.h>
> +#include <net/sock.h>
> +
> +/* limits returned by the offload driver, zero means don't care */
> +struct tcp_ddp_limits {
> +	int	max_ddp_sgl_len;
> +};
> +
> +enum tcp_ddp_type {
> +	TCP_DDP_NVME = 1,
> +};
> +
> +/**
> + * struct tcp_ddp_config - Generic tcp ddp configuration: tcp ddp IO queue
> + *	config implementations must use this as the first member.
> + *	Add new instances of tcp_ddp_config below (nvme-tcp, etc.).
> + */
> +struct tcp_ddp_config {
> +	enum tcp_ddp_type	type;
> +	unsigned char		buf[];

You have this variable length buf, but it is not used (as far as I can tell).
But then ...

> +};
> +
> +/**
> + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
> + *
> + * @pfv:	pdu version (e.g., NVME_TCP_PFV_1_0)
> + * @cpda:	controller pdu data alignmend (dwords, 0's based)
> + * @dgst:	digest types enabled.
> + *		The netdev will offload crc if ddp_crc is supported.
> + * @queue_size:	number of nvme-tcp IO queue elements
> + * @queue_id:	queue identifier
> + * @cpu_io:	cpu core running the IO thread for this queue
> + */
> +struct nvme_tcp_ddp_config {
> +	struct tcp_ddp_config	cfg;

... how would you use it within another struct like this? (see the sketch at
the end of this mail)

> +
> +	u16			pfv;
> +	u8			cpda;
> +	u8			dgst;
> +	int			queue_size;
> +	int			queue_id;
> +	int			io_cpu;
> +};
> +
> +/**
> + * struct tcp_ddp_io - tcp ddp configuration for an IO request.
> + *
> + * @command_id:	identifier on the wire associated with these buffers
> + * @nents:	number of entries in the sg_table
> + * @sg_table:	describing the buffers for this IO request
> + * @first_sgl:	first SGL in sg_table
> + */
> +struct tcp_ddp_io {
> +	u32			command_id;
> +	int			nents;
> +	struct sg_table		sg_table;
> +	struct scatterlist	first_sgl[SG_CHUNK_SIZE];
> +};
> +
> +/* struct tcp_ddp_dev_ops - operations used by an upper layer protocol to configure ddp offload
> + *
> + * @tcp_ddp_limits:	limit the number of scatter gather entries per IO.
> + *			the device driver can use this to limit the resources allocated per queue.
> + * @tcp_ddp_sk_add:	add offload for the queue represennted by the socket+config pair.
> + *			this function is used to configure either copy, crc or both offloads.
> + * @tcp_ddp_sk_del:	remove offload from the socket, and release any device related resources.
> + * @tcp_ddp_setup:	request copy offload for buffers associated with a command_id in tcp_ddp_io.
> + * @tcp_ddp_teardown:	release offload resources association between buffers and command_id in
> + *			tcp_ddp_io.
> + * @tcp_ddp_resync:	respond to the driver's resync_request. Called only if resync is successful.
> + */
> +struct tcp_ddp_dev_ops {
> +	int (*tcp_ddp_limits)(struct net_device *netdev,
> +			      struct tcp_ddp_limits *limits);
> +	int (*tcp_ddp_sk_add)(struct net_device *netdev,
> +			      struct sock *sk,
> +			      struct tcp_ddp_config *config);
> +	void (*tcp_ddp_sk_del)(struct net_device *netdev,
> +			       struct sock *sk);
> +	int (*tcp_ddp_setup)(struct net_device *netdev,
> +			     struct sock *sk,
> +			     struct tcp_ddp_io *io);
> +	int (*tcp_ddp_teardown)(struct net_device *netdev,
> +				struct sock *sk,
> +				struct tcp_ddp_io *io,
> +				void *ddp_ctx);
> +	void (*tcp_ddp_resync)(struct net_device *netdev,
> +			       struct sock *sk, u32 seq);
> +};
> +
> +#define TCP_DDP_RESYNC_REQ (1 << 0)
> +
> +/**
> + * struct tcp_ddp_ulp_ops - Interface to register uppper layer Direct Data Placement (DDP) TCP offload
> + */
> +struct tcp_ddp_ulp_ops {
> +	/* NIC requests ulp to indicate if @seq is the start of a message */
> +	bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
> +	/* NIC driver informs the ulp that ddp teardown is done - used for async completions*/
> +	void (*ddp_teardown_done)(void *ddp_ctx);
> +};
> +
> +/**
> + * struct tcp_ddp_ctx - Generic tcp ddp context: device driver per queue contexts must
> + *	use this as the first member.
> + */
> +struct tcp_ddp_ctx {
> +	enum tcp_ddp_type	type;
> +	unsigned char		buf[];

Similar to my comment above, I did not see any uses of the buf element.
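
For what it's worth, the usual kernel idiom for this kind of generic/specific
split is to drop the flexible array entirely: embed the generic struct as the
first member of the ULP-specific (or driver-specific) struct and recover the
wrapper with container_of(). Rough, untested sketch only -- the
to_nvme_tcp_ddp_config() helper below is made up to illustrate the idiom and
is not part of the patch:

#include <linux/kernel.h>	/* container_of() */

struct tcp_ddp_config {
	enum tcp_ddp_type	type;
	/* no buf[]; ULP-specific fields live in the containing struct */
};

static inline struct nvme_tcp_ddp_config *
to_nvme_tcp_ddp_config(struct tcp_ddp_config *cfg)
{
	/* cfg must be the 'cfg' member embedded in nvme_tcp_ddp_config */
	return container_of(cfg, struct nvme_tcp_ddp_config, cfg);
}

The driver side could do the same with tcp_ddp_ctx for its per-queue context,
which also avoids embedding a struct that ends in a flexible array member
inside another struct.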