> Add new lookup implementation for DIR24_8 algorithm using
> AVX512 instruction set
>
> Signed-off-by: Vladimir Medvedkin <vladimir.medved...@intel.com>
> ---
>  lib/librte_fib/Makefile         |  14 ++++
>  lib/librte_fib/dir24_8.c        |  24 ++++++
>  lib/librte_fib/dir24_8_avx512.c | 165 ++++++++++++++++++++++++++++++++++++++++
>  lib/librte_fib/dir24_8_avx512.h |  24 ++++++
>  lib/librte_fib/meson.build      |  11 +++
>  lib/librte_fib/rte_fib.h        |   3 +-
>  6 files changed, 240 insertions(+), 1 deletion(-)
>  create mode 100644 lib/librte_fib/dir24_8_avx512.c
>  create mode 100644 lib/librte_fib/dir24_8_avx512.h
>
> diff --git a/lib/librte_fib/Makefile b/lib/librte_fib/Makefile
> index 1dd2a49..3958da1 100644
> --- a/lib/librte_fib/Makefile
> +++ b/lib/librte_fib/Makefile
> @@ -19,4 +19,18 @@ SRCS-$(CONFIG_RTE_LIBRTE_FIB) := rte_fib.c rte_fib6.c dir24_8.c trie.c
>  # install this header file
>  SYMLINK-$(CONFIG_RTE_LIBRTE_FIB)-include := rte_fib.h rte_fib6.h
>
> +CC_AVX512F_SUPPORT=$(shell $(CC) -mavx512f -dM -E - </dev/null 2>&1 | \
> +grep -q __AVX512F__ && echo 1)
> +
> +CC_AVX512DQ_SUPPORT=$(shell $(CC) -mavx512dq -dM -E - </dev/null 2>&1 | \
> +grep -q __AVX512DQ__ && echo 1)
> +
> +ifeq ($(CC_AVX512F_SUPPORT), 1)
> +        ifeq ($(CC_AVX512DQ_SUPPORT), 1)
> +                SRCS-$(CONFIG_RTE_LIBRTE_FIB) += dir24_8_avx512.c
> +                CFLAGS_dir24_8_avx512.o += -mavx512f
> +                CFLAGS_dir24_8_avx512.o += -mavx512dq
> +                CFLAGS_dir24_8.o += -DCC_DIR24_8_AVX512_SUPPORT
> +        endif
> +endif
>  include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/lib/librte_fib/dir24_8.c b/lib/librte_fib/dir24_8.c
> index 9d74653..0a1c53f 100644
> --- a/lib/librte_fib/dir24_8.c
> +++ b/lib/librte_fib/dir24_8.c
> @@ -18,6 +18,12 @@
>  #include <rte_fib.h>
>  #include "dir24_8.h"
>
> +#ifdef CC_DIR24_8_AVX512_SUPPORT
> +
> +#include "dir24_8_avx512.h"
> +
> +#endif /* CC_DIR24_8_AVX512_SUPPORT */
> +
>  #define DIR24_8_NAMESIZE 64
>
>  #define ROUNDUP(x, y) RTE_ALIGN_CEIL(x, (1 << (32 - y)))
> @@ -62,6 +68,24 @@ dir24_8_get_lookup_fn(void *p, enum rte_fib_dir24_8_lookup_type type)
>          }
>          case RTE_FIB_DIR24_8_SCALAR_UNI:
>                  return dir24_8_lookup_bulk_uni;
> +#ifdef CC_DIR24_8_AVX512_SUPPORT
> +        case RTE_FIB_DIR24_8_VECTOR:
> +                if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) <= 0)
> +                        return NULL;
> +
> +                switch (nh_sz) {
> +                case RTE_FIB_DIR24_8_1B:
> +                        return rte_dir24_8_vec_lookup_bulk_1b;
> +                case RTE_FIB_DIR24_8_2B:
> +                        return rte_dir24_8_vec_lookup_bulk_2b;
> +                case RTE_FIB_DIR24_8_4B:
> +                        return rte_dir24_8_vec_lookup_bulk_4b;
> +                case RTE_FIB_DIR24_8_8B:
> +                        return rte_dir24_8_vec_lookup_bulk_8b;
> +                default:
> +                        return NULL;
> +                }
> +#endif
>          default:
>                  return NULL;
>          }
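One note for readers diving into the new file below: each lane of the
AVX512 code computes the equivalent of the following scalar lookup
(illustrative sketch only, 4B next-hop case; the helper name is made up,
and the entry layout - LSB as "extended" flag, next hop stored shifted
left by one - is inferred from the shifts and tests in the vector code):

static inline uint64_t
dir24_8_lookup_one(struct dir24_8_tbl *dp, uint32_t ip)
{
        /* tbl24 is indexed by the 24 most significant bits of the IP */
        uint32_t res = ((const uint32_t *)dp->tbl24)[ip >> 8];

        if (res & 1) {
                /* LSB set: extended entry, resolve via tbl8 using the
                 * remaining 8 bits of the IP
                 */
                uint32_t idx = ((res >> 1) << 8) + (ip & 0xff);

                res = ((const uint32_t *)dp->tbl8)[idx];
        }
        return res >> 1;        /* strip the flag bit to get the next hop */
}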
> diff --git a/lib/librte_fib/dir24_8_avx512.c b/lib/librte_fib/dir24_8_avx512.c
> new file mode 100644
> index 0000000..43dba28
> --- /dev/null
> +++ b/lib/librte_fib/dir24_8_avx512.c
> @@ -0,0 +1,165 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#include <rte_vect.h>
> +#include <rte_fib.h>
> +
> +#include "dir24_8.h"
> +#include "dir24_8_avx512.h"
> +
> +static __rte_always_inline void
> +dir24_8_vec_lookup_x16(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, int size)
> +{
> +        struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
> +        __mmask16 msk_ext;
> +        __mmask16 exp_msk = 0x5555;
> +        __m512i ip_vec, idxes, res, bytes;
> +        const __m512i zero = _mm512_set1_epi32(0);
> +        const __m512i lsb = _mm512_set1_epi32(1);
> +        const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
> +        __m512i tmp1, tmp2, res_msk;
> +        __m256i tmp256;
> +        /* used to mask gather values if size is 1/2 (8/16 bit next hops) */
> +        if (size == sizeof(uint8_t))
> +                res_msk = _mm512_set1_epi32(UINT8_MAX);
> +        else if (size == sizeof(uint16_t))
> +                res_msk = _mm512_set1_epi32(UINT16_MAX);
> +
> +        ip_vec = _mm512_loadu_si512(ips);
> +        /* mask 24 most significant bits */
> +        idxes = _mm512_srli_epi32(ip_vec, 8);
> +
> +        /**
> +         * lookup in tbl24
> +         * Put it inside branch to make compiler happy with -O0
> +         */
> +        if (size == sizeof(uint8_t)) {
> +                res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1);
> +                res = _mm512_and_epi32(res, res_msk);
> +        } else if (size == sizeof(uint16_t)) {
> +                res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
> +                res = _mm512_and_epi32(res, res_msk);
> +        } else
> +                res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);
> +
> +        /* get extended entries indexes */
> +        msk_ext = _mm512_test_epi32_mask(res, lsb);
> +
> +        if (msk_ext != 0) {
> +                idxes = _mm512_srli_epi32(res, 1);
> +                idxes = _mm512_slli_epi32(idxes, 8);
> +                bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
> +                idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
> +                if (size == sizeof(uint8_t)) {
> +                        idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
> +                                idxes, (const int *)dp->tbl8, 1);
> +                        idxes = _mm512_and_epi32(idxes, res_msk);
> +                } else if (size == sizeof(uint16_t)) {
> +                        idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
> +                                idxes, (const int *)dp->tbl8, 2);
> +                        idxes = _mm512_and_epi32(idxes, res_msk);
> +                } else
> +                        idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
> +                                idxes, (const int *)dp->tbl8, 4);
> +
> +                res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
> +        }
> +
> +        res = _mm512_srli_epi32(res, 1);
> +        tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
> +        tmp256 = _mm512_extracti32x8_epi32(res, 1);
> +        tmp2 = _mm512_maskz_expand_epi32(exp_msk,
> +                _mm512_castsi256_si512(tmp256));
> +        _mm512_storeu_si512(next_hops, tmp1);
> +        _mm512_storeu_si512(next_hops + 8, tmp2);
> +}
> +
> +static __rte_always_inline void
> +dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops)
> +{
> +        struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
> +        const __m512i zero = _mm512_set1_epi32(0);
> +        const __m512i lsbyte_msk = _mm512_set1_epi64(0xff);
> +        const __m512i lsb = _mm512_set1_epi64(1);
> +        __m512i res, idxes, bytes;
> +        __m256i idxes_256, ip_vec;
> +        __mmask8 msk_ext;
> +
> +        ip_vec = _mm256_loadu_si256((const void *)ips);
> +        /* mask 24 most significant bits */
> +        idxes_256 = _mm256_srli_epi32(ip_vec, 8);
> +
> +        /* lookup in tbl24 */
> +        res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8);
> +
> +        /* get extended entries indexes */
> +        msk_ext = _mm512_test_epi64_mask(res, lsb);
> +
> +        if (msk_ext != 0) {
> +                bytes = _mm512_cvtepi32_epi64(ip_vec);
> +                idxes = _mm512_srli_epi64(res, 1);
> +                idxes = _mm512_slli_epi64(idxes, 8);
> +                bytes = _mm512_and_epi64(bytes, lsbyte_msk);
> +                idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
> +                idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes,
> +                        (const void *)dp->tbl8, 8);
> +
> +                res = _mm512_mask_blend_epi64(msk_ext, res, idxes);
> +        }
> +
> +        res = _mm512_srli_epi64(res, 1);
> +        _mm512_storeu_si512(next_hops, res);
> +}
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, const unsigned int n)
> +{
> +        uint32_t i;
> +        for (i = 0; i < (n / 16); i++)
> +                dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
> +                        sizeof(uint8_t));
> +
> +        dir24_8_lookup_bulk_1b(p, ips + i * 16, next_hops + i * 16,
> +                n - i * 16);
> +}

Just curious: what if, for the remainder, instead of calling the scalar
lookup you introduced a masked version of the AVX512 lookup - would it
be slower?
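Something along these lines, say for the 4B next-hop case (rough,
untested sketch; the function name and the tail-handling contract are
made up):

static __rte_always_inline void
dir24_8_vec_lookup_x16_masked(void *p, const uint32_t *ips,
        uint64_t *next_hops, unsigned int rem)
{
        struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
        const __mmask16 load_msk = (1 << rem) - 1;
        const __mmask16 exp_msk = 0x5555;
        const __m512i zero = _mm512_set1_epi32(0);
        const __m512i lsb = _mm512_set1_epi32(1);
        const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
        __m512i ip_vec, idxes, res, bytes, tmp1, tmp2;
        __m256i tmp256;
        __mmask16 msk_ext;

        /* masked load of the 1..15 remaining IPs, other lanes read as 0 */
        ip_vec = _mm512_maskz_loadu_epi32(load_msk, ips);
        idxes = _mm512_srli_epi32(ip_vec, 8);
        res = _mm512_mask_i32gather_epi32(zero, load_msk, idxes,
                (const int *)dp->tbl24, 4);

        /* tbl8 resolution, same as in dir24_8_vec_lookup_x16();
         * masked-out lanes hold 0, so their LSB never fires here
         */
        msk_ext = _mm512_test_epi32_mask(res, lsb);
        if (msk_ext != 0) {
                idxes = _mm512_srli_epi32(res, 1);
                idxes = _mm512_slli_epi32(idxes, 8);
                bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
                idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
                idxes = _mm512_mask_i32gather_epi32(zero, msk_ext, idxes,
                        (const int *)dp->tbl8, 4);
                res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
        }

        res = _mm512_srli_epi32(res, 1);
        /* widen to 64 bits, then store only the rem valid results */
        tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
        tmp256 = _mm512_extracti32x8_epi32(res, 1);
        tmp2 = _mm512_maskz_expand_epi32(exp_msk,
                _mm512_castsi256_si512(tmp256));
        _mm512_mask_storeu_epi64(next_hops, (__mmask8)(load_msk & 0xff), tmp1);
        _mm512_mask_storeu_epi64(next_hops + 8, (__mmask8)(load_msk >> 8), tmp2);
}

The bulk functions would then call it once for n % 16 instead of the
scalar fallback. Whether two masked gathers beat a short scalar loop for
just a few addresses is probably worth a quick benchmark.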
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, const unsigned int n)
> +{
> +        uint32_t i;
> +        for (i = 0; i < (n / 16); i++)
> +                dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
> +                        sizeof(uint16_t));
> +
> +        dir24_8_lookup_bulk_2b(p, ips + i * 16, next_hops + i * 16,
> +                n - i * 16);
> +}
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, const unsigned int n)
> +{
> +        uint32_t i;
> +        for (i = 0; i < (n / 16); i++)
> +                dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
> +                        sizeof(uint32_t));
> +
> +        dir24_8_lookup_bulk_4b(p, ips + i * 16, next_hops + i * 16,
> +                n - i * 16);
> +}
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, const unsigned int n)
> +{
> +        uint32_t i;
> +        for (i = 0; i < (n / 8); i++)
> +                dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8);
> +
> +        dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8);
> +}
> diff --git a/lib/librte_fib/dir24_8_avx512.h b/lib/librte_fib/dir24_8_avx512.h
> new file mode 100644
> index 0000000..1d3c2b9
> --- /dev/null
> +++ b/lib/librte_fib/dir24_8_avx512.h
> @@ -0,0 +1,24 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _DIR248_AVX512_H_
> +#define _DIR248_AVX512_H_
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, const unsigned int n);
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, const unsigned int n);
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, const unsigned int n);
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
> +        uint64_t *next_hops, const unsigned int n);
> +
> +#endif /* _DIR248_AVX512_H_ */
> diff --git a/lib/librte_fib/meson.build b/lib/librte_fib/meson.build
> index 771828f..0963f3c 100644
> --- a/lib/librte_fib/meson.build
> +++ b/lib/librte_fib/meson.build
> @@ -5,3 +5,14 @@
>  sources = files('rte_fib.c', 'rte_fib6.c', 'dir24_8.c', 'trie.c')
>  headers = files('rte_fib.h', 'rte_fib6.h')
>  deps += ['rib']
> +
> +if dpdk_conf.has('RTE_ARCH_X86') and cc.has_argument('-mavx512f')
> +        if cc.has_argument('-mavx512dq')
> +                dir24_8_avx512_tmp = static_library('dir24_8_avx512_tmp',
> +                        'dir24_8_avx512.c',
> +                        dependencies: static_rte_eal,
> +                        c_args: cflags + ['-mavx512f'] + ['-mavx512dq'])
> +                objs += dir24_8_avx512_tmp.extract_objects('dir24_8_avx512.c')
> +                cflags += '-DCC_DIR24_8_AVX512_SUPPORT'
> +        endif
> +endif
> diff --git a/lib/librte_fib/rte_fib.h b/lib/librte_fib/rte_fib.h
> index db35685..2919d13 100644
> --- a/lib/librte_fib/rte_fib.h
> +++ b/lib/librte_fib/rte_fib.h
> @@ -54,7 +54,8 @@ enum rte_fib_dir24_8_nh_sz {
>  enum rte_fib_dir24_8_lookup_type {
>          RTE_FIB_DIR24_8_SCALAR_MACRO,
>          RTE_FIB_DIR24_8_SCALAR_INLINE,
> -        RTE_FIB_DIR24_8_SCALAR_UNI
> +        RTE_FIB_DIR24_8_SCALAR_UNI,
> +        RTE_FIB_DIR24_8_VECTOR
>  };
>
>  /** FIB configuration structure */
> --
> 2.7.4

Acked-by: Konstantin Ananyev <konstantin.anan...@intel.com>
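P.S. For anyone wanting to try the new path from application code,
selection would look roughly like this (assuming the
rte_fib_set_lookup_fn() setter from earlier in this series; exact name
and return convention may differ):

        /* fib was created earlier with rte_fib_create() as a DIR24_8 table */
        if (rte_fib_set_lookup_fn(fib, RTE_FIB_DIR24_8_VECTOR) < 0)
                printf("AVX512 lookup unavailable, staying on scalar\n");

        rte_fib_lookup_bulk(fib, ips, next_hops, n);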