Add a new utility header for compressing pointers. The provided
functions can store pointers in 32-bit offsets.

The compression takes advantage of the fact that pointers are
usually located in a limited memory region (like a mempool).
We can compress them by converting them to offsets from a base
memory address. Offsets can be stored in fewer bytes (dictated
by the memory region size and alignment of the pointer).
For example: an 8 byte aligned pointer which is part of a 32GB
memory pool can be stored in 4 bytes.

Suggested-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
Signed-off-by: Paul Szczepanek <paul.szczepa...@arm.com>
Signed-off-by: Kamalakshitha Aligeri <kamalakshitha.alig...@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>


From one side the code itself is very small and straightforward,
from other side - it is not clear to me what is intended usage for it
within DPDK and it's applianances?
Konstantin


---
  .mailmap                           |   1 +
  lib/eal/include/meson.build        |   1 +
  lib/eal/include/rte_ptr_compress.h | 266 +++++++++++++++++++++++++++++
  3 files changed, 268 insertions(+)
  create mode 100644 lib/eal/include/rte_ptr_compress.h

diff --git a/.mailmap b/.mailmap
index 3f5bab26a8..004751d27a 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1069,6 +1069,7 @@ Paul Greenwalt <paul.greenw...@intel.com>
  Paulis Gributs <paulis.grib...@intel.com>
  Paul Luse <paul.e.l...@intel.com>
  Paul M Stillwell Jr <paul.m.stillwell...@intel.com>
+Paul Szczepanek <paul.szczepa...@arm.com>
  Pavan Kumar Linga <pavan.kumar.li...@intel.com>
  Pavan Nikhilesh <pbhagavat...@marvell.com> <pbhagavat...@caviumnetworks.com>
  Pavel Belous <pavel.bel...@aquantia.com>
diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
index e94b056d46..ce2c733633 100644
--- a/lib/eal/include/meson.build
+++ b/lib/eal/include/meson.build
@@ -36,6 +36,7 @@ headers += files(
          'rte_pci_dev_features.h',
          'rte_per_lcore.h',
          'rte_pflock.h',
+       'rte_ptr_compress.h',
          'rte_random.h',
          'rte_reciprocal.h',
          'rte_seqcount.h',
diff --git a/lib/eal/include/rte_ptr_compress.h 
b/lib/eal/include/rte_ptr_compress.h
new file mode 100644
index 0000000000..47a72e4213
--- /dev/null
+++ b/lib/eal/include/rte_ptr_compress.h
@@ -0,0 +1,266 @@
+/* SPDX-License-Identifier: BSD-shift-Clause
+ * Copyright(c) 2023 Arm Limited
+ */
+
+#ifndef RTE_PTR_COMPRESS_H
+#define RTE_PTR_COMPRESS_H
+
+/**
+ * @file
+ * Pointer compression and decompression functions.
+ *
+ * When passing arrays full of pointers between threads, memory containing
+ * the pointers is copied multiple times which is especially costly between
+ * cores. These functions allow us to compress the pointers.
+ *
+ * Compression takes advantage of the fact that pointers are usually located in
+ * a limited memory region (like a mempool). We compress them by converting 
them
+ * to offsets from a base memory address. Offsets can be stored in fewer bytes.
+ *
+ * The compression functions come in two varieties: 32-bit and 16-bit.
+ *
+ * To determine how many bits are needed to compress the pointer calculate
+ * the biggest offset possible (highest value pointer - base pointer)
+ * and shift the value right according to alignment (shift by exponent of the
+ * power of 2 of alignment: aligned by 4 - shift by 2, aligned by 8 - shift by
+ * 3, etc.). The resulting value must fit in either 32 or 16 bits.
+ *
+ * For usage example and further explanation please see "Pointer Compression" 
in
+ * doc/guides/prog_guide/env_abstraction_layer.rst
+ */
+
+#include <stdint.h>
+#include <inttypes.h>
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Compress pointers into 32-bit offsets from base pointer.
+ *
+ * @note It is programmer's responsibility to ensure the resulting offsets fit
+ * into 32 bits. Alignment of the structures pointed to by the pointers allows
+ * us to drop bits from the offsets. This is controlled by the bit_shift
+ * parameter. This means that if structures are aligned by 8 bytes they must be
+ * within 32GB of the base pointer. If there is no such alignment guarantee 
they
+ * must be within 4GB.
+ *
+ * @param ptr_base
+ *   A pointer used to calculate offsets of pointers in src_table.
+ * @param src_table
+ *   A pointer to an array of pointers.
+ * @param dest_table
+ *   A pointer to an array of compressed pointers returned by this function.
+ * @param n
+ *   The number of objects to compress, must be strictly positive.
+ * @param bit_shift
+ *   Byte alignment of memory pointed to by the pointers allows for
+ *   bits to be dropped from the offset and hence widen the memory region that
+ *   can be covered. This controls how many bits are right shifted.
+ **/
+static __rte_always_inline void
+rte_ptr_compress_32(void *ptr_base, void **src_table,
+               uint32_t *dest_table, unsigned int n, unsigned int bit_shift)
+{
+       unsigned int i = 0;
+#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
+       svuint64_t v_ptr_table;
+       svbool_t pg = svwhilelt_b64(i, n);
+       do {
+               v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
+               v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
+               v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
+               svst1w(pg, &dest_table[i], v_ptr_table);
+               i += svcntd();
+               pg = svwhilelt_b64(i, n);
+       } while (svptest_any(svptrue_b64(), pg));
+#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
+       uint64_t ptr_diff;
+       uint64x2_t v_ptr_table;
+       /* right shift is done by left shifting by negative int */
+       int64x2_t v_shift = vdupq_n_s64(-bit_shift);
+       uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
+       for (; i < (n & ~0x1); i += 2) {
+               v_ptr_table = vld1q_u64((const uint64_t *)src_table + i);
+               v_ptr_table = vsubq_u64(v_ptr_table, v_ptr_base);
+               v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
+               vst1_u32(dest_table + i, vqmovn_u64(v_ptr_table));
+       }
+       /* process leftover single item in case of odd number of n */
+       if (unlikely(n & 0x1)) {
+               ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
+               dest_table[i] = (uint32_t) (ptr_diff >> bit_shift);
+       }
+#else
+       uintptr_t ptr_diff;
+       for (; i < n; i++) {
+               ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
+               ptr_diff = ptr_diff >> bit_shift;
+               RTE_ASSERT(ptr_diff <= UINT32_MAX);
+               dest_table[i] = (uint32_t) ptr_diff;
+       }
+#endif
+}
+
+/**
+ * Decompress pointers from 32-bit offsets from base pointer.
+ *
+ * @param ptr_base
+ *   A pointer which was used to calculate offsets in src_table.
+ * @param src_table
+ *   A pointer to an array to compressed pointers.
+ * @param dest_table
+ *   A pointer to an array of decompressed pointers returned by this function.
+ * @param n
+ *   The number of objects to decompress, must be strictly positive.
+ * @param bit_shift
+ *   Byte alignment of memory pointed to by the pointers allows for
+ *   bits to be dropped from the offset and hence widen the memory region that
+ *   can be covered. This controls how many bits are left shifted when pointers
+ *   are recovered from the offsets.
+ **/
+static __rte_always_inline void
+rte_ptr_decompress_32(void *ptr_base, uint32_t *src_table,
+               void **dest_table, unsigned int n, unsigned int bit_shift)
+{
+       unsigned int i = 0;
+#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
+       svuint64_t v_ptr_table;
+       svbool_t pg = svwhilelt_b64(i, n);
+       do {
+               v_ptr_table = svld1uw_u64(pg, &src_table[i]);
+               v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
+               v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
+               svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
+               i += svcntd();
+               pg = svwhilelt_b64(i, n);
+       } while (svptest_any(svptrue_b64(), pg));
+#elif defined __ARM_NEON && !defined RTE_ARCH_ARMv8_AARCH32
+       uint64_t ptr_diff;
+       uint64x2_t v_ptr_table;
+       int64x2_t v_shift = vdupq_n_s64(bit_shift);
+       uint64x2_t v_ptr_base = vdupq_n_u64((uint64_t)ptr_base);
+       for (; i < (n & ~0x1); i += 2) {
+               v_ptr_table = vmovl_u32(vld1_u32(src_table + i));
+               v_ptr_table = vshlq_u64(v_ptr_table, v_shift);
+               v_ptr_table = vaddq_u64(v_ptr_table, v_ptr_base);
+               vst1q_u64((uint64_t *)dest_table + i, v_ptr_table);
+       }
+       /* process leftover single item in case of odd number of n */
+       if (unlikely(n & 0x1)) {
+               ptr_diff = ((uint64_t) src_table[i]) << bit_shift;
+               dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
+       }
+#else
+       uintptr_t ptr_diff;
+       for (; i < n; i++) {
+               ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
+               dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
+       }
+#endif
+}
+
+/**
+ * Compress pointers into 16-bit offsets from base pointer.
+ *
+ * @note It is programmer's responsibility to ensure the resulting offsets fit
+ * into 16 bits. Alignment of the structures pointed to by the pointers allows
+ * us to drop bits from the offsets. This is controlled by the bit_shift
+ * parameter. This means that if structures are aligned by 8 bytes they must be
+ * within 256KB of the base pointer. If there is no such alignment guarantee
+ * they must be within 64KB.
+ *
+ * @param ptr_base
+ *   A pointer used to calculate offsets of pointers in src_table.
+ * @param src_table
+ *   A pointer to an array of pointers.
+ * @param dest_table
+ *   A pointer to an array of compressed pointers returned by this function.
+ * @param n
+ *   The number of objects to compress, must be strictly positive.
+ * @param bit_shift
+ *   Byte alignment of memory pointed to by the pointers allows for
+ *   bits to be dropped from the offset and hence widen the memory region that
+ *   can be covered. This controls how many bits are right shifted.
+ **/
+static __rte_always_inline void
+rte_ptr_compress_16(void *ptr_base, void **src_table,
+               uint16_t *dest_table, unsigned int n, unsigned int bit_shift)
+{
+
+       unsigned int i = 0;
+#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
+       svuint64_t v_ptr_table;
+       svbool_t pg = svwhilelt_b64(i, n);
+       do {
+               v_ptr_table = svld1_u64(pg, (uint64_t *)src_table + i);
+               v_ptr_table = svsub_x(pg, v_ptr_table, (uint64_t)ptr_base);
+               v_ptr_table = svlsr_x(pg, v_ptr_table, bit_shift);
+               svst1h(pg, &dest_table[i], v_ptr_table);
+               i += svcntd();
+               pg = svwhilelt_b64(i, n);
+       } while (svptest_any(svptrue_b64(), pg));
+#else
+       uintptr_t ptr_diff;
+       for (; i < n; i++) {
+               ptr_diff = RTE_PTR_DIFF(src_table[i], ptr_base);
+               ptr_diff = ptr_diff >> bit_shift;
+               RTE_ASSERT(ptr_diff <= UINT16_MAX);
+               dest_table[i] = (uint16_t) ptr_diff;
+       }
+#endif
+}
+
+/**
+ * Decompress pointers from 16-bit offsets from base pointer.
+ *
+ * @param ptr_base
+ *   A pointer which was used to calculate offsets in src_table.
+ * @param src_table
+ *   A pointer to an array to compressed pointers.
+ * @param dest_table
+ *   A pointer to an array of decompressed pointers returned by this function.
+ * @param n
+ *   The number of objects to decompress, must be strictly positive.
+ * @param bit_shift
+ *   Byte alignment of memory pointed to by the pointers allows for
+ *   bits to be dropped from the offset and hence widen the memory region that
+ *   can be covered. This controls how many bits are left shifted when pointers
+ *   are recovered from the offsets.
+ **/
+static __rte_always_inline void
+rte_ptr_decompress_16(void *ptr_base, uint16_t *src_table,
+               void **dest_table, unsigned int n, unsigned int bit_shift)
+{
+       unsigned int i = 0;
+#if defined RTE_HAS_SVE_ACLE && !defined RTE_ARCH_ARMv8_AARCH32
+       svuint64_t v_ptr_table;
+       svbool_t pg = svwhilelt_b64(i, n);
+       do {
+               v_ptr_table = svld1uh_u64(pg, &src_table[i]);
+               v_ptr_table = svlsl_x(pg, v_ptr_table, bit_shift);
+               v_ptr_table = svadd_x(pg, v_ptr_table, (uint64_t)ptr_base);
+               svst1(pg, (uint64_t *)dest_table + i, v_ptr_table);
+               i += svcntd();
+               pg = svwhilelt_b64(i, n);
+       } while (svptest_any(svptrue_b64(), pg));
+#else
+       uintptr_t ptr_diff;
+       for (; i < n; i++) {
+               ptr_diff = ((uintptr_t) src_table[i]) << bit_shift;
+               dest_table[i] = RTE_PTR_ADD(ptr_base, ptr_diff);
+       }
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_PTR_COMPRESS_H */
--
2.25.1


Reply via email to