> <snip>
> > > > > > +
> > > > > > +static __rte_always_inline void
> > > > > > +enqueue_elems_128(struct rte_ring *r, uint32_t prod_head,
> > > > > > +		const void *obj_table, uint32_t n)
> > > > > > +{
> > > > > > +	unsigned int i;
> > > > > > +	const uint32_t size = r->size;
> > > > > > +	uint32_t idx = prod_head & r->mask;
> > > > > > +	__uint128_t *ring = (__uint128_t *)&r[1];
> > > > > > +	const __uint128_t *obj = (const __uint128_t *)obj_table;
> > > > > > +	if (likely(idx + n < size)) {
> > > > > > +		for (i = 0; i < (n & ~0x1); i += 2, idx += 2) {
> > > > > > +			ring[idx] = obj[i];
> > > > > > +			ring[idx + 1] = obj[i + 1];
> > > > >
> > > > > AFAIK, that implies 16B aligned obj_table...
> > > > > Would it always be the case?
> > > > I am not sure from the compiler perspective.
> > > > At least on the Arm architecture, an unaligned access (where the
> > > > address being accessed is not aligned to the size of the data element
> > > > being accessed) will result in faults or require additional cycles.
> > > > So, aligning on 16B should be fine.
> > > Further, I would be changing this to use 'rte_int128_t', as
> > > '__uint128_t' is not defined on 32b systems.
> >
> > What I am trying to say: with this code we imply a new requirement for
> > elems in the ring: when sizeof(elem) == 16, its alignment also has to be
> > at least 16.
> The only existing use case in DPDK for 16B is the event ring. The event
> ring already does a similar kind of copy (using 'struct rte_event').
> So, there is no change in expectations for the event ring.
> For future code, I think this expectation should be fine since it allows
> for optimal code.
>
> > Which from my perspective is not ideal.
> Any reasoning?
New implicit requirement and inconsistency. Code like this:

	struct ring_elem {uint64_t a, b;};
	....
	struct ring_elem elem;
	rte_ring_dequeue_elem(ring, &elem, sizeof(elem));

might cause a crash, while exactly the same code with:

	struct ring_elem {uint64_t a, b, c;};
OR
	struct ring_elem {uint64_t a, b, c, d;};

will work ok.

> > Note that for elem sizes > 16 (24, 32), there is no such constraint.
> The rest of them need to be aligned on a 4B boundary. However, this should
> not affect the existing code.
> The code for 8B and 16B is kept as is to ensure the performance is not
> affected for the existing code.
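To make the implied constraint concrete: below is a minimal sketch of how a
caller could satisfy the 16B alignment explicitly, assuming DPDK's
__rte_aligned macro from rte_common.h and the rte_ring_elem.h header this
series adds ('ring_elem' and 'dequeue_one' are hypothetical names used only
for illustration):

	#include <stdint.h>
	#include <rte_common.h>
	#include <rte_ring_elem.h>

	/* Force the 16B element type to 16B alignment so that the
	 * 128-bit copy in enqueue_elems_128() cannot fault on
	 * architectures that require naturally aligned accesses.
	 */
	struct ring_elem {
		uint64_t a, b;
	} __rte_aligned(16);

	int
	dequeue_one(struct rte_ring *ring)
	{
		struct ring_elem elem;	/* 16B aligned as well */

		return rte_ring_dequeue_elem(ring, &elem, sizeof(elem));
	}

With the attribute on the type, stack and static objects of 'struct
ring_elem' get the required alignment, so the crash scenario above cannot
occur.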