Re: [dpdk-dev] [PATCH v2] net/mlx: support firmware version query
Thursday, February 7, 2019 12:25 AM, Thomas Monjalon: > Cc: dev@dpdk.org > Subject: [PATCH v2] net/mlx: support firmware version query > > The API function rte_eth_dev_fw_version_get() is querying drivers via the > operation callback fw_version_get(). > The implementation of this operation is added for mlx4 and mlx5. > Both functions are copying the same ibverbs field fw_ver which is retrieved > when calling ibv_query_device[_ex]() during the port probing. > > It is tested with command "drvinfo" of examples/ethtool/. > > Signed-off-by: Thomas Monjalon Acked-by: Shahaf Shuler Applied to next-net-mlx, thanks.
Re: [dpdk-dev] [PATCH v3] net/mlx5: fix Tx metadata for multi-segment packet
Wednesday, January 30, 2019 9:24 AM, Yongseok Koh: > Subject: Re: [PATCH v3] net/mlx5: fix Tx metadata for multi-segment packet > > > On Jan 30, 2019, at 3:43 PM, Dekel Peled wrote: > > > > Original patch implemented the use of match_metadata offload in the > > different burst functions. > > The concurrent use of match_metadata and multi_segs offloads was not > > handled. > > > > This patch updates function txq_scatter_v(), to pass metadata value > > from mbuf to wqe, when indicated by offload flags. > > > > Fixes: 6bd7fbd03c62 ("net/mlx5: support metadata as flow rule > > criteria") > > Cc: sta...@dpdk.org > > > > Signed-off-by: Dekel Peled Applied to next-net-mlx, thanks . > > > > --- > > Acked-by: Yongseok Koh > > Thanks > > > v3: Update title, modify indentation. > > v2: Apply code review comments. > > --- > > --- > > drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 12 +--- > > drivers/net/mlx5/mlx5_rxtx_vec_sse.h | 11 --- > > 2 files changed, 17 insertions(+), 6 deletions(-) > > > > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h > > b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h > > index 883fe1b..38e915c 100644 > > --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h > > +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h > > @@ -104,6 +104,8 @@ > > sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE; > > unsigned int n; > > volatile struct mlx5_wqe *wqe = NULL; > > + bool metadata_ol = > > + txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? > true : false; > > > > assert(elts_n > pkts_n); > > mlx5_tx_complete(txq); > > @@ -127,6 +129,9 @@ > > uint8x16_t *t_wqe; > > uint8_t *dseg; > > uint8x16_t ctrl; > > + rte_be32_t metadata = > > + metadata_ol && (buf->ol_flags & > PKT_TX_METADATA) ? > > + buf->tx_metadata : 0; > > > > assert(segs_n); > > max_elts = elts_n - (elts_head - txq->elts_tail); @@ -164,9 > +169,10 > > @@ > > ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m); > > vst1q_u8((void *)t_wqe, ctrl); > > /* Fill ESEG in the header. 
*/ > > - vst1q_u16((void *)(t_wqe + 1), > > - ((uint16x8_t) { 0, 0, cs_flags, > rte_cpu_to_be_16(len), > > - 0, 0, 0, 0 })); > > + vst1q_u32((void *)(t_wqe + 1), > > + ((uint32x4_t){ 0, > > +cs_flags << 16 | > rte_cpu_to_be_16(len), > > +metadata, 0 })); > > txq->wqe_ci = wqe_ci; > > } > > if (!n) > > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h > > b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h > > index 14117c4..fb384ef 100644 > > --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h > > +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h > > @@ -104,6 +104,8 @@ > > sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE; > > unsigned int n; > > volatile struct mlx5_wqe *wqe = NULL; > > + bool metadata_ol = > > + txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? > true : false; > > > > assert(elts_n > pkts_n); > > mlx5_tx_complete(txq); > > @@ -125,6 +127,9 @@ > > uint16_t max_wqe; > > __m128i *t_wqe, *dseg; > > __m128i ctrl; > > + rte_be32_t metadata = > > + metadata_ol && (buf->ol_flags & > PKT_TX_METADATA) ? > > + buf->tx_metadata : 0; > > > > assert(segs_n); > > max_elts = elts_n - (elts_head - txq->elts_tail); @@ -165,9 > +170,9 > > @@ > > _mm_store_si128(t_wqe, ctrl); > > /* Fill ESEG in the header. */ > > _mm_store_si128(t_wqe + 1, > > - _mm_set_epi16(0, 0, 0, 0, > > - rte_cpu_to_be_16(len), cs_flags, > > - 0, 0)); > > + _mm_set_epi32(0, metadata, > > + (rte_cpu_to_be_16(len) << 16) | > > + cs_flags, 0)); > > txq->wqe_ci = wqe_ci; > > } > > if (!n) > > -- > > 1.8.3.1 > >
[dpdk-dev] [PATCH v1] hash: optimize signature compare by using neon intrinsic
Implemented signature compare function based on neon intrinsic. Hash bulk lookup had 3% - 6% performance gain after optimization. Signed-off-by: Ruifeng Wang --- lib/librte_hash/rte_cuckoo_hash.c | 32 ++- lib/librte_hash/rte_cuckoo_hash.h | 1 + 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/lib/librte_hash/rte_cuckoo_hash.c b/lib/librte_hash/rte_cuckoo_hash.c index c01489ba5..5745a254f 100644 --- a/lib/librte_hash/rte_cuckoo_hash.c +++ b/lib/librte_hash/rte_cuckoo_hash.c @@ -26,6 +26,9 @@ #include #include #include +#if defined(RTE_ARCH_ARM64) +#include +#endif #include "rte_hash.h" #include "rte_cuckoo_hash.h" @@ -407,6 +410,10 @@ rte_hash_create(const struct rte_hash_parameters *params) if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2)) h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; else +#elif defined(RTE_ARCH_ARM64) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) + h->sig_cmp_fn = RTE_HASH_COMPARE_NEON; + else #endif h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; @@ -1578,10 +1585,15 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, enum rte_hash_sig_compare_function sig_cmp_fn) { unsigned int i; +#ifdef RTE_MACHINE_CPUFLAG_NEON + uint16x8_t vmat, vsig, x; + uint64x2_t x64; + int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1}; +#endif /* For match mask the first bit of every two bits indicates the match */ switch (sig_cmp_fn) { -#ifdef RTE_MACHINE_CPUFLAG_SSE2 +#if defined(RTE_MACHINE_CPUFLAG_SSE2) case RTE_HASH_COMPARE_SSE: /* Compare all signatures in the bucket */ *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( @@ -1594,6 +1606,24 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, (__m128i const *)sec_bkt->sig_current), _mm_set1_epi16(sig))); break; +#elif defined(RTE_MACHINE_CPUFLAG_NEON) + case RTE_HASH_COMPARE_NEON: + vsig = vld1q_dup_u16((uint16_t const *)&sig); + /* Compare all signatures in the primary bucket */ + vmat = vceqq_u16(vsig, + vld1q_u16((uint16_t const 
*)prim_bkt->sig_current)); + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift); + x64 = vpaddlq_u32(vpaddlq_u16(x)); + *prim_hash_matches = (uint32_t)(vgetq_lane_u64(x64, 0) + + vgetq_lane_u64(x64, 1)); + /* Compare all signatures in the secondary bucket */ + vmat = vceqq_u16(vsig, + vld1q_u16((uint16_t const *)sec_bkt->sig_current)); + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift); + x64 = vpaddlq_u32(vpaddlq_u16(x)); + *sec_hash_matches = (uint32_t)(vgetq_lane_u64(x64, 0) + + vgetq_lane_u64(x64, 1)); + break; #endif default: for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { diff --git a/lib/librte_hash/rte_cuckoo_hash.h b/lib/librte_hash/rte_cuckoo_hash.h index eacdaa8d4..0548c97f0 100644 --- a/lib/librte_hash/rte_cuckoo_hash.h +++ b/lib/librte_hash/rte_cuckoo_hash.h @@ -141,6 +141,7 @@ struct rte_hash_key { enum rte_hash_sig_compare_function { RTE_HASH_COMPARE_SCALAR = 0, RTE_HASH_COMPARE_SSE, + RTE_HASH_COMPARE_NEON, RTE_HASH_COMPARE_NUM }; -- 2.17.1
Re: [dpdk-dev] [EXT] [PATCH v1] hash: optimize signature compare by using neon intrinsic
On Mon, 2019-02-11 at 15:30 +0800, Ruifeng Wang wrote: > > --- > --- > Implemented signature compare function based on neon intrinsic. > Hash bulk lookup had 3% - 6% performance gain after optimization. > > Signed-off-by: Ruifeng Wang > --- > lib/librte_hash/rte_cuckoo_hash.c | 32 > ++- > lib/librte_hash/rte_cuckoo_hash.h | 1 + > 2 files changed, 32 insertions(+), 1 deletion(-) > > diff --git a/lib/librte_hash/rte_cuckoo_hash.c > b/lib/librte_hash/rte_cuckoo_hash.c > index c01489ba5..5745a254f 100644 > --- a/lib/librte_hash/rte_cuckoo_hash.c > +++ b/lib/librte_hash/rte_cuckoo_hash.c > @@ -26,6 +26,9 @@ > #include > #include > #include > +#if defined(RTE_ARCH_ARM64) > +#include > +#endif The use of rte_vector.h will remove the need for #if defined... > > #include "rte_hash.h" > #include "rte_cuckoo_hash.h" > @@ -407,6 +410,10 @@ rte_hash_create(const struct rte_hash_parameters > *params) > if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2)) > h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; > else > +#elif defined(RTE_ARCH_ARM64) > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) > + h->sig_cmp_fn = RTE_HASH_COMPARE_NEON; > + else > #endif > h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; > > @@ -1578,10 +1585,15 @@ compare_signatures(uint32_t > *prim_hash_matches, uint32_t *sec_hash_matches, > enum rte_hash_sig_compare_function sig_cmp_fn) > { > unsigned int i; > +#ifdef RTE_MACHINE_CPUFLAG_NEON > + uint16x8_t vmat, vsig, x; > + uint64x2_t x64; > + int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1}; > +#endif Is it possible to move the variable declarations down, to avoid the need for #ifdef here?
> > /* For match mask the first bit of every two bits indicates the > match */ > switch (sig_cmp_fn) { > -#ifdef RTE_MACHINE_CPUFLAG_SSE2 > +#if defined(RTE_MACHINE_CPUFLAG_SSE2) > case RTE_HASH_COMPARE_SSE: > /* Compare all signatures in the bucket */ > *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( > @@ -1594,6 +1606,24 @@ compare_signatures(uint32_t > *prim_hash_matches, uint32_t *sec_hash_matches, > (__m128i const *)sec_bkt- > >sig_current), > _mm_set1_epi16(sig))); > break; > +#elif defined(RTE_MACHINE_CPUFLAG_NEON) > + case RTE_HASH_COMPARE_NEON: > + vsig = vld1q_dup_u16((uint16_t const *)&sig); > + /* Compare all signatures in the primary bucket */ > + vmat = vceqq_u16(vsig, > + vld1q_u16((uint16_t const *)prim_bkt- > >sig_current)); > + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), > shift); > + x64 = vpaddlq_u32(vpaddlq_u16(x)); > + *prim_hash_matches = (uint32_t)(vgetq_lane_u64(x64, 0) > + > + vgetq_lane_u64(x64, 1)); > + /* Compare all signatures in the secondary bucket */ > + vmat = vceqq_u16(vsig, > + vld1q_u16((uint16_t const *)sec_bkt- > >sig_current)); > + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), > shift); > + x64 = vpaddlq_u32(vpaddlq_u16(x)); > + *sec_hash_matches = (uint32_t)(vgetq_lane_u64(x64, 0) + > + vgetq_lane_u64(x64, 1)); > + break; > #endif > default: > for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { > diff --git a/lib/librte_hash/rte_cuckoo_hash.h > b/lib/librte_hash/rte_cuckoo_hash.h > index eacdaa8d4..0548c97f0 100644 > --- a/lib/librte_hash/rte_cuckoo_hash.h > +++ b/lib/librte_hash/rte_cuckoo_hash.h > @@ -141,6 +141,7 @@ struct rte_hash_key { > enum rte_hash_sig_compare_function { > RTE_HASH_COMPARE_SCALAR = 0, > RTE_HASH_COMPARE_SSE, > + RTE_HASH_COMPARE_NEON, > RTE_HASH_COMPARE_NUM > }; >