<snip>

> > > > > +
> > > > > +static __rte_always_inline void
> > > > > +enqueue_elems_128(struct rte_ring *r, uint32_t prod_head,
> > > > > +		const void *obj_table, uint32_t n)
> > > > > +{
> > > > > +	unsigned int i;
> > > > > +	const uint32_t size = r->size;
> > > > > +	uint32_t idx = prod_head & r->mask;
> > > > > +	__uint128_t *ring = (__uint128_t *)&r[1];
> > > > > +	const __uint128_t *obj = (const __uint128_t *)obj_table;
> > > > > +	if (likely(idx + n < size)) {
> > > > > +		for (i = 0; i < (n & ~0x1); i += 2, idx += 2) {
> > > > > +			ring[idx] = obj[i];
> > > > > +			ring[idx + 1] = obj[i + 1];
> > > >
> > > > AFAIK, that implies 16B aligned obj_table...
> > > > Would it always be the case?
> > > I am not sure from the compiler perspective. At least on the Arm
> > > architecture, an unaligned access (the address being accessed is not
> > > aligned to the size of the data element being accessed) will result in
> > > faults or require additional cycles. So, aligning on 16B should be fine.
> > Further, I would be changing this to use 'rte_int128_t', as '__uint128_t'
> > is not defined on 32b systems.
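Roughly, the inner loop of enqueue_elems_128() would then look something like
the sketch below (untested, for illustration only; it assumes the 16B
'rte_int128_t' type is usable in this header):

	rte_int128_t *ring = (rte_int128_t *)&r[1];
	const rte_int128_t *obj = (const rte_int128_t *)obj_table;

	for (i = 0; i < (n & ~0x1); i += 2, idx += 2) {
		/* plain struct assignment keeps the 16B copy and, unlike
		 * '__uint128_t', also builds on 32b targets
		 */
		ring[idx] = obj[i];
		ring[idx + 1] = obj[i + 1];
	}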
> What I am trying to say: with this code we imply a new requirement for elems
> in the ring: when sizeof(elem)==16 its alignment also has to be at least 16.
The only existing use case in DPDK for 16B is the event ring. The event ring
already does a similar kind of copy (using 'struct rte_event'), so there is no
change in expectations for the event ring. For future code, I think this
expectation should be fine since it allows for optimal code.

> Which from my perspective is not ideal.
Any reasoning?

> Note that for elem sizes > 16 (24, 32), there is no such constraint.
The rest of the element sizes need to be aligned on a 4B boundary. However,
this should not affect the existing code. The code for 8B and 16B is kept as
is, to ensure the performance of the existing code is not affected.

> > > > > +		}
> > > > > +		switch (n & 0x1) {
> > > > > +		case 1:
> > > > > +			ring[idx++] = obj[i++];
> > > > > +		}
> > > > > +	} else {
> > > > > +		for (i = 0; idx < size; i++, idx++)
> > > > > +			ring[idx] = obj[i];
> > > > > +		/* Start at the beginning */
> > > > > +		for (idx = 0; i < n; i++, idx++)
> > > > > +			ring[idx] = obj[i];
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +/* the actual enqueue of elements on the ring.
> > > > > + * Placed here since identical code needed in both
> > > > > + * single and multi producer enqueue functions.
> > > > > + */
> > > > > +static __rte_always_inline void
> > > > > +enqueue_elems(struct rte_ring *r, uint32_t prod_head,
> > > > > +		const void *obj_table, uint32_t esize, uint32_t num)
> > > > > +{
> > > > > +	uint32_t idx, nr_idx, nr_num;
> > > > > +
> > > > > +	/* 8B and 16B copies implemented individually to retain
> > > > > +	 * the current performance.
> > > > > +	 */
> > > > > +	if (esize == 8)
> > > > > +		enqueue_elems_64(r, prod_head, obj_table, num);
> > > > > +	else if (esize == 16)
> > > > > +		enqueue_elems_128(r, prod_head, obj_table, num);
> > > > > +	else {
> > > > > +		/* Normalize to uint32_t */
> > > > > +		uint32_t scale = esize / sizeof(uint32_t);
> > > > > +		nr_num = num * scale;
> > > > > +		idx = prod_head & r->mask;
> > > > > +		nr_idx = idx * scale;
> > > > > +		enqueue_elems_32(r, nr_idx, obj_table, nr_num);
> > > > > +	}
> > > > > +}
> > > > > +
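As a concrete illustration of the "Normalize to uint32_t" path above (the
numbers and the standalone program are mine, just to show the arithmetic), a
hypothetical 24B element works out as follows:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical inputs, only to show the scaling */
	uint32_t esize = 24;	/* element size in bytes, multiple of 4B */
	uint32_t num = 3;	/* number of elements to enqueue */
	uint32_t idx = 10;	/* ring index after masking with r->mask */

	uint32_t scale = esize / sizeof(uint32_t);	/* 24 / 4 = 6 */
	uint32_t nr_num = num * scale;	/* 3 elements -> 18 uint32_t copies */
	uint32_t nr_idx = idx * scale;	/* copy starts at uint32_t slot 60 */

	printf("scale=%" PRIu32 " nr_num=%" PRIu32 " nr_idx=%" PRIu32 "\n",
		scale, nr_num, nr_idx);
	return 0;
}

That is why the generic path only needs elements aligned on a 4B boundary
(with a size that is a multiple of 4B), while the dedicated 8B and 16B paths
keep today's behaviour.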