27/09/2023 17:08, Paul Szczepanek: > Add a new utility header for compressing pointers. Pointers are > compressed by taking advantage of their locality. Instead of > storing the full address only an offset from a known base is stored.
You probably need to insert some explanations from the cover letter. > The provided functions can store pointers in 32bit offsets. > > Suggested-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com> > Signed-off-by: Paul Szczepanek <paul.szczepa...@arm.com> > Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.alig...@arm.com> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com> [...] > --- a/lib/eal/include/meson.build > +++ b/lib/eal/include/meson.build > @@ -35,6 +35,7 @@ headers += files( > 'rte_pci_dev_feature_defs.h', > 'rte_pci_dev_features.h', > 'rte_per_lcore.h', > + 'rte_ptr_compress.h', > 'rte_pflock.h', > 'rte_random.h', > 'rte_reciprocal.h', Did you try to sort alphabetically? failed :) > +#ifndef _RTE_PTR_COMPRESS_H_ > +#define _RTE_PTR_COMPRESS_H_ No need extra underscores. > + > +/** > + * @file > + * RTE pointer compression and decompression. RTE has no mean here I think. > + */ > + > +#include <stdint.h> > +#include <inttypes.h> > + > +#include <rte_branch_prediction.h> > +#include <rte_common.h> > +#include <rte_debug.h> > +#include <rte_vect.h> > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +/** > + * Compress pointers into 32 bit offsets from base pointer. I think it should be "32-bit". > + * > + * @note Offsets from the base pointer must fit within 32bits. Alignment > allows > + * us to drop bits from the offsets - this means that for pointers aligned by > + * 8 bytes they must be within 32GB of the base pointer. Unaligned pointers > + * must be within 4GB. Not clear what is "alignment". > + * > + * @param ptr_base > + * A pointer used to calculate offsets of pointers in src_table. > + * @param src_table > + * A pointer to an array of pointers. > + * @param dest_table > + * A pointer to an array of compressed pointers returned by this function. > + * @param n > + * The number of objects to compress, must be strictly positive. > + * @param bit_shift > + * Byte alignment of memory pointed to by the pointers allows for > + * bits to be dropped from the offset and hence widen the memory region > that > + * can be covered. This controls how many bits are right shifted. > + **/ > +static __rte_always_inline void > +rte_ptr_compress_32(void *ptr_base, void **src_table, > + uint32_t *dest_table, unsigned int n, unsigned int bit_shift) > +{ > + unsigned int i = 0; > +#if defined RTE_HAS_SVE_ACLE > + svuint64_t v_src_table; > + svuint64_t v_dest_table; > + svbool_t pg = svwhilelt_b64(i, n); > + do { > + v_src_table = svld1_u64(pg, (uint64_t *)src_table + i); > + v_dest_table = svsub_x(pg, v_src_table, (uint64_t)ptr_base); > + v_dest_table = svlsr_x(pg, v_dest_table, bit_shift); > + svst1w(pg, &dest_table[i], v_dest_table); > + i += svcntd(); > + pg = svwhilelt_b64(i, n); > + } while (svptest_any(svptrue_b64(), pg)); > +#elif defined __ARM_NEON > + uint64_t ptr_diff; > + uint64x2_t v_src_table; > + uint64x2_t v_dest_table; > + /* right shift is done by left shifting by negative int */ > + int64x2_t v_shift = vdupq_n_s64(-bit_shift); > + uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base); > + for (; i < (n & ~0x1); i += 2) { > + v_src_table = vld1q_u64((const uint64_t *)src_table + i); > + v_dest_table = vsubq_u64(v_src_table, v_ptr_base); > + v_dest_table = vshlq_u64(v_dest_table, v_shift); > + vst1_u32(dest_table + i, vqmovn_u64(v_dest_table)); > + } > + /* process leftover single item in case of odd number of n */ > + if (unlikely(n & 0x1)) { > + ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base); > + dest_table[i] = (uint32_t) (ptr_diff >> bit_shift); > + } > +#else > + uint64_t ptr_diff; > + for (; i < n; i++) { > + ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base); > + /* save extra bits that are redundant due to alignment */ > + ptr_diff = ptr_diff >> bit_shift; > + /* make sure no truncation will happen when casting */ > + RTE_ASSERT(ptr_diff <= UINT32_MAX); > + dest_table[i] = (uint32_t) ptr_diff; > + } > +#endif > +} I see it is providing some per-CPU optimizations, so it is in favor of having it in DPDK. Other than that, it looks very generic, so it is questionable to have in DPDK.