+static __rte_always_inline void
+enqueue_elems_128(struct rte_ring *r, uint32_t prod_head,
+               const void *obj_table, uint32_t n)
+{
+       unsigned int i;
+       const uint32_t size = r->size;
+       uint32_t idx = prod_head & r->mask;
+       rte_int128_t *ring = (rte_int128_t *)&r[1];
+       const rte_int128_t *obj = (const rte_int128_t *)obj_table;
+       if (likely(idx + n < size)) {
+               for (i = 0; i < (n & ~0x1); i += 2, idx += 2)
+                       memcpy((void *)(ring + idx),
+                               (const void *)(obj + i), 32);
+               switch (n & 0x1) {
+               case 1:
+                       memcpy((void *)(ring + idx),
+                               (const void *)(obj + i), 16);
+               }
+       } else {
+               for (i = 0; idx < size; i++, idx++)
+                       memcpy((void *)(ring + idx),
+                               (const void *)(obj + i), 16);
+               /* Start at the beginning */
+               for (idx = 0; i < n; i++, idx++)
+                       memcpy((void *)(ring + idx),
+                               (const void *)(obj + i), 16);
+       }
+}
+
+/* the actual enqueue of elements on the ring.
+ * Placed here since identical code needed in both
+ * single and multi producer enqueue functions.
+ */
+static __rte_always_inline void
+enqueue_elems(struct rte_ring *r, uint32_t prod_head, const void *obj_table,
+               uint32_t esize, uint32_t num)
+{
+       /* 8B and 16B copies implemented individually to retain
+        * the current performance.
+        */
+       if (esize == 8)
+               enqueue_elems_64(r, prod_head, obj_table, num);
+       else if (esize == 16)
+               enqueue_elems_128(r, prod_head, obj_table, num);
+       else {
+               uint32_t idx, scale, nr_idx, nr_num, nr_size;
+
+               /* Normalize to uint32_t */
+               scale = esize / sizeof(uint32_t);
+               nr_num = num * scale;
+               idx = prod_head & r->mask;
+               nr_idx = idx * scale;
+               nr_size = r->size * scale;
+               enqueue_elems_32(r, nr_size, nr_idx, obj_table, nr_num);
+       }
+}
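
For context, enqueue_elems_32() is not part of the hunk quoted above. A
minimal sketch of what that normalized 32-bit copy path could look like,
assuming the same wrap-around handling as enqueue_elems_128() and the
parameter order from the call site above (the actual body in the patch may
differ, e.g. with loop unrolling):

static __rte_always_inline void
enqueue_elems_32(struct rte_ring *r, const uint32_t size, uint32_t idx,
		const void *obj_table, uint32_t n)
{
	unsigned int i;
	uint32_t *ring = (uint32_t *)&r[1];
	const uint32_t *obj = (const uint32_t *)obj_table;

	if (likely(idx + n < size)) {
		/* No wrap-around: copy the n 32-bit words in one pass */
		for (i = 0; i < n; i++, idx++)
			ring[idx] = obj[i];
	} else {
		/* Copy up to the end of the ring, then wrap to the start */
		for (i = 0; idx < size; i++, idx++)
			ring[idx] = obj[i];
		for (idx = 0; i < n; i++, idx++)
			ring[idx] = obj[i];
	}
}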

Following Konstantin's comment on v7, enqueue_elems_128() was modified to
ensure it won't crash if the object is unaligned. Are we sure the same
problem cannot also occur with 64b copies on all supported architectures?
(I mean a 64b access that is only aligned on 32b.)
Konstantin mentioned that the 64b load/store instructions on x86 can handle
unaligned access. On aarch64, the load/store instructions (non-atomic, which
is what will be used in this case) can handle unaligned access as well.
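
To illustrate the concern with a standalone sketch (not taken from the
patch): a memcpy-based copy tells the compiler the pointer may be unaligned,
while a direct 64-bit dereference assumes natural alignment and is only safe
because the architectures discussed here tolerate the misaligned access.

#include <stdint.h>
#include <string.h>

/* Read a 64-bit value from a pointer that may only be 32-bit aligned. */
static uint64_t
load_64_memcpy(const void *p)
{
	uint64_t v;

	/* The compiler emits an unaligned-safe load sequence for the
	 * target, so this cannot fault regardless of alignment.
	 */
	memcpy(&v, p, sizeof(v));
	return v;
}

static uint64_t
load_64_direct(const void *p)
{
	/* A plain dereference assumes 8-byte alignment. It happens to work
	 * for 32-bit-aligned pointers on x86 and on aarch64 (for the
	 * non-atomic accesses used here), but it is undefined behaviour in
	 * C and could fault on a strict-alignment target.
	 */
	return *(const uint64_t *)p;
}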

+ David Christensen to comment for PPC

The vectorized version of memcpy for Power can handle unaligned access as well.

Dave
