27/09/2023 17:08, Paul Szczepanek:
> Add a new utility header for compressing pointers. Pointers are
> compressed by taking advantage of their locality. Instead of
> storing the full address, only an offset from a known base is stored.

You probably need to insert some explanations from the cover letter.
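For example, a short usage sketch would make the intent clear. A minimal
sketch, assuming rte_ptr_decompress_32() from the rest of this series
(the buffer names here are made up):

        #include <rte_ptr_compress.h>

        #define BURST_SIZE 64

        void *base;                /* common base, e.g. start of a mempool */
        void *ptrs[BURST_SIZE];    /* full 64-bit pointers */
        uint32_t offs[BURST_SIZE]; /* compressed 32-bit offsets */

        /* objects 8-byte aligned => the 3 low bits can be shifted out */
        rte_ptr_compress_32(base, ptrs, offs, BURST_SIZE, 3);
        /* ... store or pass offs[] around at half the size ... */
        rte_ptr_decompress_32(base, offs, ptrs, BURST_SIZE, 3);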

> The provided functions can store pointers in 32bit offsets.
> 
> Suggested-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
> Signed-off-by: Paul Szczepanek <paul.szczepa...@arm.com>
> Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.alig...@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
[...]
> --- a/lib/eal/include/meson.build
> +++ b/lib/eal/include/meson.build
> @@ -35,6 +35,7 @@ headers += files(
>          'rte_pci_dev_feature_defs.h',
>          'rte_pci_dev_features.h',
>          'rte_per_lcore.h',
> +     'rte_ptr_compress.h',
>          'rte_pflock.h',
>          'rte_random.h',
>          'rte_reciprocal.h',

Did you try to sort alphabetically? It failed :)
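'rte_ptr_compress.h' must go after 'rte_pflock.h':

        'rte_per_lcore.h',
        'rte_pflock.h',
        'rte_ptr_compress.h',
        'rte_random.h',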

> +#ifndef _RTE_PTR_COMPRESS_H_
> +#define _RTE_PTR_COMPRESS_H_

No need for the extra underscores.
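I.e. simply:

        #ifndef RTE_PTR_COMPRESS_H
        #define RTE_PTR_COMPRESS_H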

> +
> +/**
> + * @file
> + * RTE pointer compression and decompression.

RTE has no meaning here, I think.

> + */
> +
> +#include <stdint.h>
> +#include <inttypes.h>
> +
> +#include <rte_branch_prediction.h>
> +#include <rte_common.h>
> +#include <rte_debug.h>
> +#include <rte_vect.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Compress pointers into 32 bit offsets from base pointer.

I think it should be "32-bit".

> + *
> + * @note Offsets from the base pointer must fit within 32bits. Alignment allows
> + * us to drop bits from the offsets - this means that for pointers aligned by
> + * 8 bytes they must be within 32GB of the base pointer. Unaligned pointers
> + * must be within 4GB.

It is not clear what "alignment" refers to here.
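If I understand correctly: when all objects are aligned to 2^bit_shift
bytes, the low bit_shift bits of every offset are zero, so they can be
shifted out before truncating to 32 bits. Spelling that out with an
example would help, e.g. for 8-byte alignment (bit_shift = 3):

        offset = ptr - base;                  /* up to 2^35 = 32GB */
        stored = (uint32_t)(offset >> 3);     /* now fits in 32 bits */
        ptr = base + ((uint64_t)stored << 3); /* exact round trip */
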
> + *
> + * @param ptr_base
> + *   A pointer used to calculate offsets of pointers in src_table.
> + * @param src_table
> + *   A pointer to an array of pointers.
> + * @param dest_table
> + *   A pointer to an array of compressed pointers returned by this function.
> + * @param n
> + *   The number of objects to compress, must be strictly positive.
> + * @param bit_shift
> + *   Byte alignment of memory pointed to by the pointers allows for
> + *   bits to be dropped from the offset and hence widen the memory region that
> + *   can be covered. This controls how many bits are right shifted.
> + **/
> +static __rte_always_inline void
> +rte_ptr_compress_32(void *ptr_base, void **src_table,
> +             uint32_t *dest_table, unsigned int n, unsigned int bit_shift)
> +{
> +     unsigned int i = 0;
> +#if defined RTE_HAS_SVE_ACLE
> +     svuint64_t v_src_table;
> +     svuint64_t v_dest_table;
> +     svbool_t pg = svwhilelt_b64(i, n);
> +     do {
> +             v_src_table = svld1_u64(pg, (uint64_t *)src_table + i);
> +             v_dest_table = svsub_x(pg, v_src_table, (uint64_t)ptr_base);
> +             v_dest_table = svlsr_x(pg, v_dest_table, bit_shift);
> +             svst1w(pg, &dest_table[i], v_dest_table);
> +             i += svcntd();
> +             pg = svwhilelt_b64(i, n);
> +     } while (svptest_any(svptrue_b64(), pg));
> +#elif defined __ARM_NEON
> +     uint64_t ptr_diff;
> +     uint64x2_t v_src_table;
> +     uint64x2_t v_dest_table;
> +     /* right shift is done by left shifting by negative int */
> +     int64x2_t v_shift = vdupq_n_s64(-bit_shift);
> +     uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
> +     for (; i < (n & ~0x1); i += 2) {
> +             v_src_table = vld1q_u64((const uint64_t *)src_table + i);
> +             v_dest_table = vsubq_u64(v_src_table, v_ptr_base);
> +             v_dest_table = vshlq_u64(v_dest_table, v_shift);
> +             vst1_u32(dest_table + i, vqmovn_u64(v_dest_table));
> +     }
> +     /* process leftover single item in case of odd number of n */
> +     if (unlikely(n & 0x1)) {
> +             ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
> +             dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
> +     }
> +#else
> +     uint64_t ptr_diff;
> +     for (; i < n; i++) {
> +             ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
> +             /* save extra bits that are redundant due to alignment */
> +             ptr_diff = ptr_diff >> bit_shift;
> +             /* make sure no truncation will happen when casting */
> +             RTE_ASSERT(ptr_diff <= UINT32_MAX);
> +             dest_table[i] = (uint32_t) ptr_diff;
> +     }
> +#endif
> +}
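
To check my understanding of the semantics, a quick hypothetical test
(8-byte aligned objects, bit_shift = 3):

        uint64_t items[3]; /* naturally 8-byte aligned */
        void *src[3] = { &items[0], &items[1], &items[2] };
        uint32_t dst[3];

        rte_ptr_compress_32(items, src, dst, 3, 3);
        /* dst[1] == 1: &items[1] is 8 bytes past the base, 8 >> 3 == 1 */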

I see it is providing some per-CPU optimizations,
which is an argument in favor of having it in DPDK.
Other than that, it looks very generic, so it is questionable
whether it belongs in DPDK.


