From: Björn Töpel <bjorn.to...@intel.com>

Allow creation of AF_PACKET V4 rings. Tx and Rx are still disabled.
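For reference, a rough sketch of how user space is expected to set up a
V4 Rx ring once this patch is applied. Only the mr_fd and desc_nr
members of struct tpacket_req4 are used by this patch; the full layout
comes from earlier in the series. umem_fd is assumed to be an AF_PACKET
socket that has already registered a packet buffer (umem), desc_nr is
an arbitrary example value, and error handling is mostly left out:

	/* Assumes the usual AF_PACKET headers: <sys/socket.h>,
	 * <arpa/inet.h>, <linux/if_ether.h> and <linux/if_packet.h>.
	 */
	static int setup_v4_rx_ring(int umem_fd)
	{
		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
		int ver = TPACKET_V4;
		struct tpacket_req4 req = {
			.mr_fd   = umem_fd,	/* socket that registered the umem */
			.desc_nr = 1024,	/* number of ring descriptors */
		};

		/* The version has to be switched to TPACKET_V4 before the
		 * ring request, since packet_setsockopt() dispatches on
		 * tp_version.
		 */
		if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)))
			return -1;
		return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	}

Since Tx and Rx are still disabled, the ring setsockopt() above only
allocates the descriptor ring and ties it to the umem socket.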
Signed-off-by: Björn Töpel <bjorn.to...@intel.com>
---
 include/linux/tpacket4.h | 391 +++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/af_packet.c   | 262 +++++++++++++++++++++++++++++--
 net/packet/internal.h    |   4 +
 3 files changed, 641 insertions(+), 16 deletions(-)

diff --git a/include/linux/tpacket4.h b/include/linux/tpacket4.h
index fcf4c333c78d..44ba38034133 100644
--- a/include/linux/tpacket4.h
+++ b/include/linux/tpacket4.h
@@ -18,6 +18,12 @@
 #define TP4_UMEM_MIN_FRAME_SIZE	2048
 #define TP4_KERNEL_HEADROOM	256 /* Headrom for XDP */
 
+enum tp4_validation {
+	TP4_VALIDATION_NONE,	/* No validation is performed */
+	TP4_VALIDATION_IDX,	/* Only address to packet buffer is validated */
+	TP4_VALIDATION_DESC	/* Full descriptor is validated */
+};
+
 struct tp4_umem {
 	struct pid *pid;
 	struct page **pgs;
@@ -31,9 +37,95 @@ struct tp4_umem {
 	unsigned int data_headroom;
 };
 
+struct tp4_dma_info {
+	dma_addr_t dma;
+	struct page *page;
+};
+
+struct tp4_queue {
+	struct tpacket4_desc *ring;
+
+	unsigned int used_idx;
+	unsigned int last_avail_idx;
+	unsigned int ring_mask;
+	unsigned int num_free;
+
+	struct tp4_umem *umem;
+	struct tp4_dma_info *dma_info;
+	enum dma_data_direction direction;
+};
+
+/**
+ * struct tp4_packet_array - An array of packets/frames
+ *
+ * @tp4q: the tp4q associated with this packet array. Flushes and
+ *	  populates will operate on this.
+ * @dev: pointer to the netdevice the queue should be associated with
+ * @direction: the direction of the DMA channel that is set up.
+ * @validation: type of validation performed on populate
+ * @start: the first packet that has not been processed
+ * @curr: the packet that is currently being processed
+ * @end: the last packet in the array
+ * @mask: convenience variable for internal operations on the array
+ * @items: the actual descriptors to frames/packets that are in the array
+ **/
+struct tp4_packet_array {
+	struct tp4_queue *tp4q;
+	struct device *dev;
+	enum dma_data_direction direction;
+	enum tp4_validation validation;
+	u32 start;
+	u32 curr;
+	u32 end;
+	u32 mask;
+	struct tpacket4_desc items[0];
+};
+
+/**
+ * struct tp4_frame_set - A view of a packet array consisting of
+ * one or more frames
+ *
+ * @pkt_arr: the packet array this frame set is located in
+ * @start: the first frame that has not been processed
+ * @curr: the frame that is currently being processed
+ * @end: the last frame in the frame set
+ *
+ * This frame set can either be one or more frames or a single packet
+ * consisting of one or more frames. tp4f_ functions with packet in the
+ * name return a frame set representing a packet, while the other
+ * tp4f_ functions return one or more frames not taking into account if
+ * they constitute a packet or not.
+ **/
+struct tp4_frame_set {
+	struct tp4_packet_array *pkt_arr;
+	u32 start;
+	u32 curr;
+	u32 end;
+};
+
 /*************** V4 QUEUE OPERATIONS *******************************/
 
 /**
+ * tp4q_init - Initializes a tp4 queue
+ *
+ * @q: Pointer to the tp4 queue structure to be initialized
+ * @nentries: Number of descriptor entries in the queue
+ * @umem: Pointer to the umem / packet buffer associated with this queue
+ * @buffer: Pointer to the memory region where the descriptors will reside
+ **/
+static inline void tp4q_init(struct tp4_queue *q, unsigned int nentries,
+			     struct tp4_umem *umem,
+			     struct tpacket4_desc *buffer)
+{
+	q->ring = buffer;
+	q->used_idx = 0;
+	q->last_avail_idx = 0;
+	q->ring_mask = nentries - 1;
+	q->num_free = 0;
+	q->umem = umem;
+}
+
+/**
  * tp4q_umem_new - Creates a new umem (packet buffer)
  *
  * @addr: The address to the umem
@@ -98,4 +190,303 @@ static inline struct tp4_umem *tp4q_umem_new(unsigned long addr, size_t size,
 	return umem;
 }
 
+/**
+ * tp4q_enqueue_from_array - Enqueue entries from packet array to tp4 queue
+ *
+ * @a: Pointer to the packet array to enqueue from
+ * @dcnt: Max number of entries to enqueue
+ *
+ * Returns 0 for success or an errno at failure
+ **/
+static inline int tp4q_enqueue_from_array(struct tp4_packet_array *a,
+					  u32 dcnt)
+{
+	struct tp4_queue *q = a->tp4q;
+	unsigned int used_idx = q->used_idx;
+	struct tpacket4_desc *d = a->items;
+	int i;
+
+	if (q->num_free < dcnt)
+		return -ENOSPC;
+
+	q->num_free -= dcnt;
+
+	for (i = 0; i < dcnt; i++) {
+		unsigned int idx = (used_idx++) & q->ring_mask;
+		unsigned int didx = (a->start + i) & a->mask;
+
+		q->ring[idx].idx = d[didx].idx;
+		q->ring[idx].len = d[didx].len;
+		q->ring[idx].offset = d[didx].offset;
+		q->ring[idx].error = d[didx].error;
+	}
+
+	/* Order flags and data */
+	smp_wmb();
+
+	for (i = dcnt - 1; i >= 0; i--) {
+		unsigned int idx = (q->used_idx + i) & q->ring_mask;
+		unsigned int didx = (a->start + i) & a->mask;
+
+		q->ring[idx].flags = d[didx].flags & ~TP4_DESC_KERNEL;
+	}
+	q->used_idx += dcnt;
+
+	return 0;
+}
+
+/**
+ * tp4q_disable - Disable a tp4 queue
+ *
+ * @dev: Pointer to the netdevice the queue is connected to
+ * @q: Pointer to the tp4 queue to disable
+ **/
+static inline void tp4q_disable(struct device *dev,
+				struct tp4_queue *q)
+{
+	int i;
+
+	if (q->dma_info) {
+		/* Unmap DMA */
+		for (i = 0; i < q->umem->npgs; i++)
+			dma_unmap_page(dev, q->dma_info[i].dma, PAGE_SIZE,
+				       q->direction);
+
+		kfree(q->dma_info);
+		q->dma_info = NULL;
+	}
+}
+
+/**
+ * tp4q_enable - Enable a tp4 queue
+ *
+ * @dev: Pointer to the netdevice the queue should be associated with
+ * @q: Pointer to the tp4 queue to enable
+ * @direction: The direction of the DMA channel that is set up.
+ *
+ * Returns 0 for success or a negative errno for failure
+ **/
+static inline int tp4q_enable(struct device *dev,
+			      struct tp4_queue *q,
+			      enum dma_data_direction direction)
+{
+	int i, j;
+
+	/* DMA map all the buffers in bufs up front, and sync prior to
+	 * kicking userspace. Is this sane? Strictly user land owns
+	 * the buffer until they show up on the avail queue. However,
+	 * mapping should be ok.
+	 */
+	if (direction != DMA_NONE) {
+		q->dma_info = kcalloc(q->umem->npgs, sizeof(*q->dma_info),
+				      GFP_KERNEL);
+		if (!q->dma_info)
+			return -ENOMEM;
+
+		for (i = 0; i < q->umem->npgs; i++) {
+			dma_addr_t dma;
+
+			dma = dma_map_page(dev, q->umem->pgs[i], 0,
+					   PAGE_SIZE, direction);
+			if (dma_mapping_error(dev, dma)) {
+				for (j = 0; j < i; j++)
+					dma_unmap_page(dev,
+						       q->dma_info[j].dma,
+						       PAGE_SIZE, direction);
+				kfree(q->dma_info);
+				q->dma_info = NULL;
+				return -EBUSY;
+			}
+
+			q->dma_info[i].page = q->umem->pgs[i];
+			q->dma_info[i].dma = dma;
+		}
+	} else {
+		q->dma_info = NULL;
+	}
+
+	q->direction = direction;
+	return 0;
+}
+
+/*************** FRAME OPERATIONS *******************************/
+/* A frame is always just one frame of size frame_size.
+ * A frame set is one or more frames.
+ **/
+
+/**
+ * tp4f_next_frame - Go to next frame in frame set
+ * @p: pointer to frame set
+ *
+ * Returns true if there is another frame in the frame set.
+ * Advances curr pointer.
+ **/
+static inline bool tp4f_next_frame(struct tp4_frame_set *p)
+{
+	if (p->curr + 1 == p->end)
+		return false;
+
+	p->curr++;
+	return true;
+}
+
+/**
+ * tp4f_set_frame - Sets the properties of a frame
+ * @p: pointer to frame
+ * @len: the length in bytes of the data in the frame
+ * @offset: offset to start of data in frame
+ * @is_eop: Set if this is the last frame of the packet
+ **/
+static inline void tp4f_set_frame(struct tp4_frame_set *p, u32 len, u16 offset,
+				  bool is_eop)
+{
+	struct tpacket4_desc *d =
+		&p->pkt_arr->items[p->curr & p->pkt_arr->mask];
+
+	d->len = len;
+	d->offset = offset;
+	if (!is_eop)
+		d->flags |= TP4_PKT_CONT;
+}
+
+/**************** PACKET_ARRAY FUNCTIONS ********************************/
+
+static inline struct tp4_packet_array *__tp4a_new(
+	struct tp4_queue *tp4q,
+	struct device *dev,
+	enum dma_data_direction direction,
+	enum tp4_validation validation,
+	size_t elems)
+{
+	struct tp4_packet_array *arr;
+	int err;
+
+	if (!is_power_of_2(elems))
+		return NULL;
+
+	arr = kzalloc(sizeof(*arr) + elems * sizeof(struct tpacket4_desc),
+		      GFP_KERNEL);
+	if (!arr)
+		return NULL;
+
+	err = tp4q_enable(dev, tp4q, direction);
+	if (err) {
+		kfree(arr);
+		return NULL;
+	}
+
+	arr->tp4q = tp4q;
+	arr->dev = dev;
+	arr->direction = direction;
+	arr->validation = validation;
+	arr->mask = elems - 1;
+	return arr;
+}
+
+/**
+ * tp4a_rx_new - Create new packet array for ingress
+ * @rx_opaque: opaque from tp4_netdev_params
+ * @elems: number of elements in the packet array
+ * @dev: device or NULL
+ *
+ * Returns a reference to the new packet array or NULL for failure
+ **/
+static inline struct tp4_packet_array *tp4a_rx_new(void *rx_opaque,
+						   size_t elems,
+						   struct device *dev)
+{
+	enum dma_data_direction direction = dev ? DMA_FROM_DEVICE : DMA_NONE;
+
+	return __tp4a_new(rx_opaque, dev, direction, TP4_VALIDATION_IDX,
+			  elems);
+}
+
+/**
+ * tp4a_tx_new - Create new packet array for egress
+ * @tx_opaque: opaque from tp4_netdev_params
+ * @elems: number of elements in the packet array
+ * @dev: device or NULL
+ *
+ * Returns a reference to the new packet array or NULL for failure
+ **/
+static inline struct tp4_packet_array *tp4a_tx_new(void *tx_opaque,
+						   size_t elems,
+						   struct device *dev)
+{
+	enum dma_data_direction direction = dev ? DMA_TO_DEVICE : DMA_NONE;
+
+	return __tp4a_new(tx_opaque, dev, direction, TP4_VALIDATION_DESC,
+			  elems);
+}
+
+/**
+ * tp4a_get_flushable_frame_set - Create a frame set of the flushable region
+ * @a: pointer to packet array
+ * @p: frame set
+ *
+ * Returns true for success and false for failure
+ **/
+static inline bool tp4a_get_flushable_frame_set(struct tp4_packet_array *a,
+						struct tp4_frame_set *p)
+{
+	u32 avail = a->curr - a->start;
+
+	if (avail == 0)
+		return false; /* empty */
+
+	p->pkt_arr = a;
+	p->start = a->start;
+	p->curr = a->start;
+	p->end = a->curr;
+
+	return true;
+}
+
+/**
+ * tp4a_flush - Flush processed packets to associated tp4q
+ * @a: pointer to packet array
+ *
+ * Returns 0 for success and -1 for failure
+ **/
+static inline int tp4a_flush(struct tp4_packet_array *a)
+{
+	u32 avail = a->curr - a->start;
+	int ret;
+
+	if (avail == 0)
+		return 0; /* nothing to flush */
+
+	ret = tp4q_enqueue_from_array(a, avail);
+	if (ret < 0)
+		return -1;
+
+	a->start = a->curr;
+
+	return 0;
+}
+
+/**
+ * tp4a_free - Destroy packet array
+ * @a: pointer to packet array
+ **/
+static inline void tp4a_free(struct tp4_packet_array *a)
+{
+	struct tp4_frame_set f;
+
+	if (a) {
+		/* Flush all outstanding requests. */
+		if (tp4a_get_flushable_frame_set(a, &f)) {
+			do {
+				tp4f_set_frame(&f, 0, 0, true);
+			} while (tp4f_next_frame(&f));
+		}
+
+		WARN_ON_ONCE(tp4a_flush(a));
+
+		tp4q_disable(a->dev, a->tp4q);
+	}
+
+	kfree(a);
+}
+
 #endif /* _LINUX_TPACKET4_H */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b39be424ec0e..190598eb3461 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -189,6 +189,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 #define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
 #define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
 
+#define RX_RING 0
+#define TX_RING 1
+
 struct packet_sock;
 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		       struct packet_type *pt, struct net_device *orig_dev);
@@ -244,6 +247,9 @@ struct packet_skb_cb {
 
 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
 static void __fanout_link(struct sock *sk, struct packet_sock *po);
+static void packet_v4_ring_free(struct sock *sk, int tx_ring);
+static int packet_v4_ring_new(struct sock *sk, struct tpacket_req4 *req,
+			      int tx_ring);
 
 static int packet_direct_xmit(struct sk_buff *skb)
 {
@@ -2206,6 +2212,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	sk = pt->af_packet_priv;
 	po = pkt_sk(sk);
 
+	if (po->tp_version == TPACKET_V4)
+		goto drop;
+
 	if (!net_eq(dev_net(dev), sock_net(sk)))
 		goto drop;
 
@@ -2973,10 +2982,14 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
 
-	if (po->tx_ring.pg_vec)
+	if (po->tx_ring.pg_vec) {
+		if (po->tp_version == TPACKET_V4)
+			return -EINVAL;
+
 		return tpacket_snd(po, msg);
-	else
-		return packet_snd(sock, msg, len);
+	}
+
+	return packet_snd(sock, msg, len);
 }
 
 static void
@@ -3105,6 +3118,25 @@ packet_umem_new(unsigned long addr, size_t size, unsigned int frame_size,
 	return ret < 0 ? ERR_PTR(ret) : umem;
 }
 
+static void packet_clear_ring(struct sock *sk, int tx_ring)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_ring_buffer *rb;
+	union tpacket_req_u req_u;
+
+	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
+	if (!rb->pg_vec)
+		return;
+
+	if (po->tp_version == TPACKET_V4) {
+		packet_v4_ring_free(sk, tx_ring);
+		return;
+	}
+
+	memset(&req_u, 0, sizeof(req_u));
+	packet_set_ring(sk, &req_u, 1, tx_ring);
+}
+
 /*
  *	Close a PACKET socket. This is fairly simple. We immediately go
  *	to 'closed' state and remove our protocol entry in the device list.
@@ -3116,7 +3148,6 @@ static int packet_release(struct socket *sock)
 	struct packet_sock *po;
 	struct packet_fanout *f;
 	struct net *net;
-	union tpacket_req_u req_u;
 
 	if (!sk)
 		return 0;
@@ -3144,15 +3175,8 @@ static int packet_release(struct socket *sock)
 
 	packet_flush_mclist(sk);
 
-	if (po->rx_ring.pg_vec) {
-		memset(&req_u, 0, sizeof(req_u));
-		packet_set_ring(sk, &req_u, 1, 0);
-	}
-
-	if (po->tx_ring.pg_vec) {
-		memset(&req_u, 0, sizeof(req_u));
-		packet_set_ring(sk, &req_u, 1, 1);
-	}
+	packet_clear_ring(sk, TX_RING);
+	packet_clear_ring(sk, RX_RING);
 
 	if (po->umem) {
 		packet_umem_free(po->umem);
@@ -3786,16 +3810,24 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 			len = sizeof(req_u.req);
 			break;
 		case TPACKET_V3:
-		default:
 			len = sizeof(req_u.req3);
 			break;
+		case TPACKET_V4:
+		default:
+			len = sizeof(req_u.req4);
+			break;
 		}
 		if (optlen < len)
 			return -EINVAL;
 		if (copy_from_user(&req_u.req, optval, len))
 			return -EFAULT;
-		return packet_set_ring(sk, &req_u, 0,
-			optname == PACKET_TX_RING);
+
+		if (po->tp_version == TPACKET_V4)
+			return packet_v4_ring_new(sk, &req_u.req4,
+						  optname == PACKET_TX_RING);
+		else
+			return packet_set_ring(sk, &req_u, 0,
+					       optname == PACKET_TX_RING);
 	}
 	case PACKET_COPY_THRESH:
 	{
@@ -3821,6 +3853,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		case TPACKET_V1:
 		case TPACKET_V2:
 		case TPACKET_V3:
+		case TPACKET_V4:
 			break;
 		default:
 			return -EINVAL;
@@ -4061,6 +4094,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		case TPACKET_V3:
 			val = sizeof(struct tpacket3_hdr);
 			break;
+		case TPACKET_V4:
+			val = 0;
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -4247,6 +4283,9 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 	struct packet_sock *po = pkt_sk(sk);
 	unsigned int mask = datagram_poll(file, sock, wait);
 
+	if (po->tp_version == TPACKET_V4)
+		return mask;
+
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
 		if (!packet_previous_rx_frame(po, &po->rx_ring,
@@ -4363,6 +4402,197 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 	goto out;
 }
 
+static struct socket *
+packet_v4_umem_sock_get(int fd)
+{
+	struct {
+		struct sockaddr_ll sa;
+		char buf[MAX_ADDR_LEN];
+	} uaddr;
+	int uaddr_len = sizeof(uaddr), r;
+	struct socket *sock = sockfd_lookup(fd, &r);
+
+	if (!sock)
+		return ERR_PTR(-ENOTSOCK);
+
+	/* Parameter checking */
+	if (sock->sk->sk_type != SOCK_RAW) {
+		r = -ESOCKTNOSUPPORT;
+		goto err;
+	}
+
+	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
+			       &uaddr_len, 0);
+	if (r)
+		goto err;
+
+	if (uaddr.sa.sll_family != AF_PACKET) {
+		r = -EPFNOSUPPORT;
+		goto err;
+	}
+
+	if (!pkt_sk(sock->sk)->umem) {
+		r = -ESOCKTNOSUPPORT;
+		goto err;
+	}
+
+	return sock;
+err:
+	sockfd_put(sock);
+	return ERR_PTR(r);
+}
+
+#define TP4_ARRAY_SIZE 32
+
+static int
+packet_v4_ring_new(struct sock *sk, struct tpacket_req4 *req, int tx_ring)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_ring_buffer *rb;
+	struct sk_buff_head *rb_queue;
+	int was_running, order = 0;
+	struct socket *mrsock;
+	struct tpacket_req r;
+	struct pgv *pg_vec;
+	size_t rb_size;
+	__be16 num;
+	int err;
+
+	if (req->desc_nr == 0)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
+	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+	err = -EBUSY;
+	if (atomic_read(&po->mapped))
+		goto out;
+	if (packet_read_pending(rb))
+		goto out;
+	if (unlikely(rb->pg_vec))
+		goto out;
+
+	err = -EINVAL;
+	if (po->tp_version != TPACKET_V4)
+		goto out;
+
+	po->tp_hdrlen = 0;
+
+	rb_size = req->desc_nr * sizeof(struct tpacket4_desc);
+	if (unlikely(!rb_size))
+		goto out;
+
+	err = -ENOMEM;
+	order = get_order(rb_size);
+
+	r.tp_block_nr = 1;
+	pg_vec = alloc_pg_vec(&r, order);
+	if (unlikely(!pg_vec))
+		goto out;
+
+	mrsock = packet_v4_umem_sock_get(req->mr_fd);
+	if (IS_ERR(mrsock)) {
+		err = PTR_ERR(mrsock);
+		free_pg_vec(pg_vec, order, 1);
+		goto out;
+	}
+
+	/* Check if umem is from this socket, if so don't make
+	 * circular references.
+	 */
+	if (sk->sk_socket == mrsock)
+		sockfd_put(mrsock);
+
+	spin_lock(&po->bind_lock);
+	was_running = po->running;
+	num = po->num;
+	if (was_running) {
+		po->num = 0;
+		__unregister_prot_hook(sk, false);
+	}
+	spin_unlock(&po->bind_lock);
+
+	synchronize_net();
+
+	mutex_lock(&po->pg_vec_lock);
+	spin_lock_bh(&rb_queue->lock);
+
+	rb->pg_vec = pg_vec;
+	rb->head = 0;
+	rb->frame_max = req->desc_nr - 1;
+	rb->mrsock = mrsock;
+	tp4q_init(&rb->tp4q, req->desc_nr, pkt_sk(mrsock->sk)->umem,
+		  (struct tpacket4_desc *)rb->pg_vec->buffer);
+	spin_unlock_bh(&rb_queue->lock);
+
+	rb->tp4a = tx_ring ? tp4a_tx_new(&rb->tp4q, TP4_ARRAY_SIZE, NULL)
+			   : tp4a_rx_new(&rb->tp4q, TP4_ARRAY_SIZE, NULL);
+
+	if (!rb->tp4a) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	rb->pg_vec_order = order;
+	rb->pg_vec_len = 1;
+	rb->pg_vec_pages = PAGE_ALIGN(rb_size) / PAGE_SIZE;
+
+	po->prot_hook.func = po->rx_ring.pg_vec ? tpacket_rcv : packet_rcv;
+	skb_queue_purge(rb_queue);
+
+	mutex_unlock(&po->pg_vec_lock);
+
+	spin_lock(&po->bind_lock);
+	if (was_running && po->prot_hook.dev) {
+		/* V4 requires a bound socket, so only rebind if
+		 * ifindex > 0 / !dev
+		 */
+		po->num = num;
+		register_prot_hook(sk);
+	}
+	spin_unlock(&po->bind_lock);
+
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+
+static void
+packet_v4_ring_free(struct sock *sk, int tx_ring)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_ring_buffer *rb;
+	struct sk_buff_head *rb_queue;
+
+	lock_sock(sk);
+
+	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
+	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+	spin_lock(&po->bind_lock);
+	unregister_prot_hook(sk, true);
+	spin_unlock(&po->bind_lock);
+
+	mutex_lock(&po->pg_vec_lock);
+	spin_lock_bh(&rb_queue->lock);
+
+	if (rb->pg_vec) {
+		free_pg_vec(rb->pg_vec, rb->pg_vec_order, rb->pg_vec_len);
+		rb->pg_vec = NULL;
+	}
+	if (rb->mrsock && sk->sk_socket != rb->mrsock)
+		sockfd_put(rb->mrsock);
+	tp4a_free(rb->tp4a);
+
+	spin_unlock_bh(&rb_queue->lock);
+	skb_queue_purge(rb_queue);
+	mutex_unlock(&po->pg_vec_lock);
+	release_sock(sk);
+}
+
 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring)
 {
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 9c07cfe1b8a3..3eedab29e4d7 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -71,6 +71,10 @@ struct packet_ring_buffer {
 	unsigned int __percpu	*pending_refcnt;
 
 	struct tpacket_kbdq_core	prb_bdqc;
+
+	struct tp4_packet_array	*tp4a;
+	struct tp4_queue	tp4q;
+	struct socket		*mrsock;
 };
 
 extern struct mutex fanout_mutex;
-- 
2.11.0