Re: [dpdk-dev] [PATCH v3 3/8] crypto/armv8: add PMD optimized for ARMv8 processors

2017-01-05 Thread Jianbo Liu
On 5 January 2017 at 01:33,   wrote:
> From: Zbigniew Bodek 
>
> This patch introduces crypto poll mode driver
> using ARMv8 cryptographic extensions.
> CPU compatibility with this driver is detected in
> run-time and virtual crypto device will not be
> created if CPU doesn't provide:
> AES, SHA1, SHA2 and NEON.
>
> This PMD is optimized to provide performance boost
> for chained crypto operations processing,
> such as encryption + HMAC generation,
> decryption + HMAC validation. In particular,
> cipher only or hash only operations are
> not provided.
>
> The driver currently supports AES-128-CBC
> in combination with: SHA256 HMAC and SHA1 HMAC
> and relies on the external armv8_crypto library:
> https://github.com/caviumnetworks/armv8_crypto
>

It's a standalone lib. I think you should change the following line in
its Makefile, so that it does not depend on DPDK.
"include $(RTE_SDK)/mk/rte.lib.mk"

> This patch adds driver's code only and does
> not include it in the build system.
>
> Signed-off-by: Zbigniew Bodek 
> ---
>  drivers/crypto/armv8/Makefile  |  73 ++
>  drivers/crypto/armv8/rte_armv8_pmd.c   | 926 
> +
>  drivers/crypto/armv8/rte_armv8_pmd_ops.c   | 369 ++
>  drivers/crypto/armv8/rte_armv8_pmd_private.h   | 211 ++
>  drivers/crypto/armv8/rte_armv8_pmd_version.map |   3 +
>  5 files changed, 1582 insertions(+)
>  create mode 100644 drivers/crypto/armv8/Makefile
>  create mode 100644 drivers/crypto/armv8/rte_armv8_pmd.c
>  create mode 100644 drivers/crypto/armv8/rte_armv8_pmd_ops.c
>  create mode 100644 drivers/crypto/armv8/rte_armv8_pmd_private.h
>  create mode 100644 drivers/crypto/armv8/rte_armv8_pmd_version.map
>
> diff --git a/drivers/crypto/armv8/Makefile b/drivers/crypto/armv8/Makefile
> new file mode 100644
> index 000..dc5ea02
> --- /dev/null
> +++ b/drivers/crypto/armv8/Makefile
> @@ -0,0 +1,73 @@
> +#
> +#   BSD LICENSE
> +#
> +#   Copyright (C) Cavium networks Ltd. 2017.
> +#
> +#   Redistribution and use in source and binary forms, with or without
> +#   modification, are permitted provided that the following conditions
> +#   are met:
> +#
> +# * Redistributions of source code must retain the above copyright
> +#   notice, this list of conditions and the following disclaimer.
> +# * Redistributions in binary form must reproduce the above copyright
> +#   notice, this list of conditions and the following disclaimer in
> +#   the documentation and/or other materials provided with the
> +#   distribution.
> +# * Neither the name of Cavium networks nor the names of its
> +#   contributors may be used to endorse or promote products derived
> +#   from this software without specific prior written permission.
> +#
> +#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> +#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> +#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> +#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> +#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> +#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> +#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> +#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +#
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +ifneq ($(MAKECMDGOALS),clean)
> +ifneq ($(MAKECMDGOALS),config)
> +ifeq ($(ARMV8_CRYPTO_LIB_PATH),)
> +$(error "Please define ARMV8_CRYPTO_LIB_PATH environment variable")
> +endif
> +endif
> +endif
> +
> +# library name
> +LIB = librte_pmd_armv8.a
> +
> +# build flags
> +CFLAGS += -O3
> +CFLAGS += $(WERROR_FLAGS)
> +CFLAGS += -L$(RTE_SDK)/../openssl -I$(RTE_SDK)/../openssl/include

Is it really needed?

> +
> +# library version
> +LIBABIVER := 1
> +
> +# versioning export map
> +EXPORT_MAP := rte_armv8_pmd_version.map
> +
> +# external library dependencies
> +CFLAGS += -I$(ARMV8_CRYPTO_LIB_PATH)
> +CFLAGS += -I$(ARMV8_CRYPTO_LIB_PATH)/asm/include
> +LDLIBS += -L$(ARMV8_CRYPTO_LIB_PATH) -larmv8_crypto
> +
> +# library source files
> +SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += rte_armv8_pmd.c
> +SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += rte_armv8_pmd_ops.c
> +
> +# library dependencies
> +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_eal
> +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_mbuf
> +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_mempool
> +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_ring
> +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_cryptodev
> +
> +include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/drivers/crypto/armv8/rte_armv8_pmd.c 

Re: [dpdk-dev] [PATCH v3 3/8] crypto/armv8: add PMD optimized for ARMv8 processors

2017-01-12 Thread Jianbo Liu
On 12 January 2017 at 21:12, Zbigniew Bodek
 wrote:
> Hello  Jianbo Liu,
>
> Thanks for the review. Please check my answers in-line.
>
> Kind regards
> Zbigniew
>
>
> On 06.01.2017 03:45, Jianbo Liu wrote:
>>
>> On 5 January 2017 at 01:33,   wrote:
>>>
>>> From: Zbigniew Bodek 
>>>
>>> This patch introduces crypto poll mode driver
>>> using ARMv8 cryptographic extensions.
>>> CPU compatibility with this driver is detected in
>>> run-time and virtual crypto device will not be
>>> created if CPU doesn't provide:
>>> AES, SHA1, SHA2 and NEON.
>>>
>>> This PMD is optimized to provide performance boost
>>> for chained crypto operations processing,
>>> such as encryption + HMAC generation,
>>> decryption + HMAC validation. In particular,
>>> cipher only or hash only operations are
>>> not provided.
>>>
>>> The driver currently supports AES-128-CBC
>>> in combination with: SHA256 HMAC and SHA1 HMAC
>>> and relies on the external armv8_crypto library:
>>> https://github.com/caviumnetworks/armv8_crypto
>>>
>>
>> It's standalone lib. I think you should change the following line in
>> its Makefile, so not depend on DPDK.
>> "include $(RTE_SDK)/mk/rte.lib.mk"
>>
>>> This patch adds driver's code only and does
>>> not include it in the build system.
>>>
>>> Signed-off-by: Zbigniew Bodek 
>>> ---
>>>  drivers/crypto/armv8/Makefile  |  73 ++
>>>  drivers/crypto/armv8/rte_armv8_pmd.c   | 926
>>> +
>>>  drivers/crypto/armv8/rte_armv8_pmd_ops.c   | 369 ++
>>>  drivers/crypto/armv8/rte_armv8_pmd_private.h   | 211 ++
>>>  drivers/crypto/armv8/rte_armv8_pmd_version.map |   3 +
>>>  5 files changed, 1582 insertions(+)
>>>  create mode 100644 drivers/crypto/armv8/Makefile
>>>  create mode 100644 drivers/crypto/armv8/rte_armv8_pmd.c
>>>  create mode 100644 drivers/crypto/armv8/rte_armv8_pmd_ops.c
>>>  create mode 100644 drivers/crypto/armv8/rte_armv8_pmd_private.h
>>>  create mode 100644 drivers/crypto/armv8/rte_armv8_pmd_version.map
>>>
.

>>> +   /* Select auth algo */
>>> +   switch (auth_xform->auth.algo) {
>>> +   /* Cover supported hash algorithms */
>>> +   case RTE_CRYPTO_AUTH_SHA256:
>>> +   aalg = auth_xform->auth.algo;
>>> +   sess->auth.mode = ARMV8_CRYPTO_AUTH_AS_AUTH;
>>> +   break;
>>> +   case RTE_CRYPTO_AUTH_SHA1_HMAC:
>>> +   case RTE_CRYPTO_AUTH_SHA256_HMAC: /* Fall through */
>>> +   aalg = auth_xform->auth.algo;
>>> +   sess->auth.mode = ARMV8_CRYPTO_AUTH_AS_HMAC;
>>> +   break;
>>> +   default:
>>> +   return -EINVAL;
>>> +   }
>>> +
>>> +   /* Verify supported key lengths and extract proper algorithm */
>>> +   switch (cipher_xform->cipher.key.length << 3) {
>>> +   case 128:
>>> +   sess->crypto_func =
>>> +   CRYPTO_GET_ALGO(order, cop, calg, aalg,
>>> 128);
>>> +   sess->cipher.key_sched =
>>> +   CRYPTO_GET_KEY_SCHED(cop, calg, 128);
>>> +   break;
>>> +   case 192:
>>> +   sess->crypto_func =
>>> +   CRYPTO_GET_ALGO(order, cop, calg, aalg,
>>> 192);
>>> +   sess->cipher.key_sched =
>>> +   CRYPTO_GET_KEY_SCHED(cop, calg, 192);
>>> +   break;
>>> +   case 256:
>>> +   sess->crypto_func =
>>> +   CRYPTO_GET_ALGO(order, cop, calg, aalg,
>>> 256);
>>> +   sess->cipher.key_sched =
>>> +   CRYPTO_GET_KEY_SCHED(cop, calg, 256);
>>> +   break;
>>> +   default:
>>> +   sess->crypto_func = NULL;
>>> +   sess->cipher.key_sched = NULL;
>>> +   return -EINVAL;
>>> +   }
>>> +
>>> +   if (unlikely(sess->crypto_func == NULL)) {
>>> +   /*
>>> +* If we got here that means that there must be a bug
>>
>>
>> Since

Re: [dpdk-dev] [PATCH v3 2/8] lib: add cryptodev type for the upcoming ARMv8 PMD

2017-01-15 Thread Jianbo Liu
On 13 January 2017 at 16:16, Hemant Agrawal  wrote:
> On 1/4/2017 11:03 PM, zbigniew.bo...@caviumnetworks.com wrote:
>>
>> From: Zbigniew Bodek 
>>
>> Add type and name for ARMv8 crypto PMD
>>
>> Signed-off-by: Zbigniew Bodek 
>> ---
>>  lib/librte_cryptodev/rte_cryptodev.h | 3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/lib/librte_cryptodev/rte_cryptodev.h
>> b/lib/librte_cryptodev/rte_cryptodev.h
>> index 8f63e8f..6f34f22 100644
>> --- a/lib/librte_cryptodev/rte_cryptodev.h
>> +++ b/lib/librte_cryptodev/rte_cryptodev.h
>> @@ -66,6 +66,8 @@
>>  /**< KASUMI PMD device name */
>>  #define CRYPTODEV_NAME_ZUC_PMD crypto_zuc
>>  /**< KASUMI PMD device name */
>> +#define CRYPTODEV_NAME_ARMV8_PMD   crypto_armv8
>> +/**< ARMv8 Crypto PMD device name */
>>
> I will suggest the name as armv8ce or armv8_ce for this driver.
> Do you agree?
>

I don't agree, because it's a lib optimized only for chained crypto and hash.

>
>>  /** Crypto device type */
>>  enum rte_cryptodev_type {
>> @@ -77,6 +79,7 @@ enum rte_cryptodev_type {
>> RTE_CRYPTODEV_KASUMI_PMD,   /**< KASUMI PMD */
>> RTE_CRYPTODEV_ZUC_PMD,  /**< ZUC PMD */
>> RTE_CRYPTODEV_OPENSSL_PMD,/**<  OpenSSL PMD */
>> +   RTE_CRYPTODEV_ARMV8_PMD,/**< ARMv8 crypto PMD */
>>  };
>>
>>  extern const char **rte_cyptodev_names;
>>
>
>


Re: [dpdk-dev] [PATCH 1/2] net/ixgbe: calculate the correct number of received packets in bulk alloc function

2017-02-02 Thread Jianbo Liu
On 2 February 2017 at 00:19, Ananyev, Konstantin
 wrote:
> Hi,
>
>> -----Original Message-----
>> From: Jianbo Liu [mailto:jianbo@linaro.org]
>> Sent: Monday, December 19, 2016 6:09 AM
>> To: dev@dpdk.org; Zhang, Helin ; Ananyev, Konstantin 
>> ;
>> jerin.ja...@caviumnetworks.com
>> Cc: Jianbo Liu 
>> Subject: [PATCH 1/2] net/ixgbe: calculate the correct number of received 
>> packets in bulk alloc function
>>
>> To get better performance, Rx bulk alloc recv function will scan 8 
>> descriptors
>> in one time, but the statuses are not consistent on ARM platform because
>> the memory allocated for Rx descriptors is cacheable hugepages.
>> This patch is to calculate the number of received packets by scanning DD bit
>> sequentially, and stops when meeting the first packet with DD bit unset.
>>
>> Signed-off-by: Jianbo Liu 
>> ---
>>  drivers/net/ixgbe/ixgbe_rxtx.c | 12 
>>  1 file changed, 8 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
>> index b2d9f45..2866bdb 100644
>> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
>> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
>> @@ -1402,17 +1402,21 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
>>   for (i = 0; i < RTE_PMD_IXGBE_RX_MAX_BURST;
>>i += LOOK_AHEAD, rxdp += LOOK_AHEAD, rxep += LOOK_AHEAD) {
>>   /* Read desc statuses backwards to avoid race condition */
>> - for (j = LOOK_AHEAD-1; j >= 0; --j)
>> + for (j = LOOK_AHEAD - 1; j >= 0; --j) {
>>   s[j] = rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
>> -
>> - for (j = LOOK_AHEAD - 1; j >= 0; --j)
>>   pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
>>  lo_dword.data);
>> + }
>> +
>> + rte_smp_rmb();
>
> If reads can be reordered, shouldn't we fill pkt_info[] after smp_rmb() here?

The barrier is to forbid reordering of the following reads,
which will count the number of packets actually received.
And as wb.upper and wb.lower of one descriptor are in the same
cacheline, could it be better to read them at the same time?

> As another nit - with rmb() in and because you are looking the first gap in 
> s[] now,
> no need to read TXDs in backward order.

Reading backward is just to keep it as it is for the x86 platform.

> How it looks to me (as a suggestion):
>
> for (j = 0; j != LOOK_AHEAD; j++)
> s[j] = rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
>
> rte_smp_rmb();
>
> for (j = 0; j < LOOK_AHEAD && (s[j] & IXGBE_RXDADV_STAT_DD) != 0; j++)
> ;
>
> for (j = 0; j < nb_dd; ++j) {
> pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.lo_dword.data);
>
>
> Konstantin
>
>
>>
>>   /* Compute how many status bits were set */
>>   nb_dd = 0;
>>   for (j = 0; j < LOOK_AHEAD; ++j)
>> - nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
>> + if (s[j] & IXGBE_RXDADV_STAT_DD)
>> + ++nb_dd;
>> + else
>> + break;
>>
>>   nb_rx += nb_dd;
>>
>> --
>> 2.4.11
>


Re: [dpdk-dev] [PATCH 1/2] net/ixgbe: calculate the correct number of received packets in bulk alloc function

2017-02-03 Thread Jianbo Liu
On 3 February 2017 at 19:38, Ananyev, Konstantin
 wrote:
>
>
>> -----Original Message-----
>> From: Jianbo Liu [mailto:jianbo@linaro.org]
>> Sent: Friday, February 3, 2017 6:22 AM
>> To: Ananyev, Konstantin 
>> Cc: dev@dpdk.org; Zhang, Helin ; 
>> jerin.ja...@caviumnetworks.com
>> Subject: Re: [PATCH 1/2] net/ixgbe: calculate the correct number of received 
>> packets in bulk alloc function
>>
>> On 2 February 2017 at 00:19, Ananyev, Konstantin
>>  wrote:
>> > Hi,
>> >
>> >> -----Original Message-----
>> >> From: Jianbo Liu [mailto:jianbo@linaro.org]
>> >> Sent: Monday, December 19, 2016 6:09 AM
>> >> To: dev@dpdk.org; Zhang, Helin ; Ananyev, 
>> >> Konstantin ;
>> >> jerin.ja...@caviumnetworks.com
>> >> Cc: Jianbo Liu 
>> >> Subject: [PATCH 1/2] net/ixgbe: calculate the correct number of received 
>> >> packets in bulk alloc function
>> >>
>> >> To get better performance, Rx bulk alloc recv function will scan 8 
>> >> descriptors
>> >> in one time, but the statuses are not consistent on ARM platform because
>> >> the memory allocated for Rx descriptors is cacheable hugepages.
>> >> This patch is to calculate the number of received packets by scanning DD 
>> >> bit
>> >> sequentially, and stops when meeting the first packet with DD bit unset.
>> >>
>> >> Signed-off-by: Jianbo Liu 
>> >> ---
>> >>  drivers/net/ixgbe/ixgbe_rxtx.c | 12 
>> >>  1 file changed, 8 insertions(+), 4 deletions(-)
>> >>
>> >> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c 
>> >> b/drivers/net/ixgbe/ixgbe_rxtx.c
>> >> index b2d9f45..2866bdb 100644
>> >> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
>> >> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
>> >> @@ -1402,17 +1402,21 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
>> >>   for (i = 0; i < RTE_PMD_IXGBE_RX_MAX_BURST;
>> >>i += LOOK_AHEAD, rxdp += LOOK_AHEAD, rxep += LOOK_AHEAD) {
>> >>   /* Read desc statuses backwards to avoid race condition */
>> >> - for (j = LOOK_AHEAD-1; j >= 0; --j)
>> >> + for (j = LOOK_AHEAD - 1; j >= 0; --j) {
>> >>   s[j] = 
>> >> rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
>> >> -
>> >> - for (j = LOOK_AHEAD - 1; j >= 0; --j)
>> >>   pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
>> >>  lo_dword.data);
>> >> + }
>> >> +
>> >> + rte_smp_rmb();
>> >
>> > If reads can be reordered, shouldn't we fill pkt_info[] after smp_rmb() 
>> > here?
>>
>> The barrier is to forbid the reordering from the following readings,
>> which will count the number of actual received packets.
>
> What I meant is that if you'll keep reading from both rxdp[].wb.lower and 
> rxdp[].wb.upper
> before rmb, then nothing would prevent cpu from reorder these reads in any 
> way it likes
> (if we are talking about cpus with read reordering allowed), right?
> So it can end up with the following order:
>
> rxdp[N].wb.lower
> rxdp[N].wb.upper
>
> or even:
>
> rxdp[N-1].wb.lower
> rxdp[N].wb.lower
> rxdp[N-1].wb.upper
> rxdp[N].wb.upper
>
> In such cases pkt_info[] may contain invalid data.

Yes, it's possible. I'll send v2.

Thanks!

>
>> And as wb.uper and wb.lower of one descriptor are in the same
>> cacheline, could it be better to read them at the same time?.
>
> It could be, but I think for the sake of data integrity we have to make sure 
> that
> cpu would never read any other RXD field before wb.upper. status_error, see 
> above.
>
> BTW, the following code might re-read both wb.upper and wb.lower anyway.
> So I don't think you'll save many cycles here anyway.
>
>>
>> > As another nit - with rmb() in and because you are looking the first gap 
>> > in s[] now,
>> > no need to read TXDs in backward order.
>>
>> Reading backward is just to keep as it is for x86 platform.
>
> With the change you introducing, I don't think it is necessary any more.
>
> Konstantin
>
>>
>> > How it looks to me (as a suggestion):
>> >
>> > for (j = 0; j != LOOK_AHEAD; j++)
>> > s[j] = rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
>> >
>> > rte_smp_rmb();
>> >
>> > for (j = 0; j < LOOK_AHEAD && (s[j] & IXGBE_RXDADV_STAT_DD) != 0; j++)
>> > ;
>> >
>> > for (j = 0; j < nb_dd; ++j) {
>> > pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.lo_dword.data);
>> >
>> >
>> > Konstantin
>> >
>> >
>> >>
>> >>   /* Compute how many status bits were set */
>> >>   nb_dd = 0;
>> >>   for (j = 0; j < LOOK_AHEAD; ++j)
>> >> - nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
>> >> + if (s[j] & IXGBE_RXDADV_STAT_DD)
>> >> + ++nb_dd;
>> >> + else
>> >> + break;
>> >>
>> >>   nb_rx += nb_dd;
>> >>
>> >> --
>> >> 2.4.11
>> >


[dpdk-dev] [PATCH v2 2/2] net/ixgbe: calculate correct number of received packets for ARM NEON-version vPMD

2017-02-04 Thread Jianbo Liu
vPMD will check 4 descs at one time, but the statuses are not consistent
because the memory allocated for RX descriptors is cacheable hugepage.
This patch calculates the number of received packets by scanning the DD bit
sequentially, and stops when meeting the first packet with DD bit unset.

Signed-off-by: Jianbo Liu 
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c 
b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index f96cc85..0b1338d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -196,7 +196,6 @@
struct ixgbe_rx_entry *sw_ring;
uint16_t nb_pkts_recd;
int pos;
-   uint64_t var;
uint8x16_t shuf_msk = {
0xFF, 0xFF,
0xFF, 0xFF,  /* skip 32 bits pkt_type */
@@ -255,6 +254,7 @@
uint64x2_t mbp1, mbp2;
uint8x16_t staterr;
uint16x8_t tmp;
+   uint32_t var = 0;
uint32_t stat;
 
/* B.1 load 1 mbuf point */
@@ -349,11 +349,19 @@
vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
 pkt_mb1);
 
+   stat &= IXGBE_VPMD_DESC_DD_MASK;
+
/* C.4 calc avaialbe number of desc */
-   var =  __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
-   nb_pkts_recd += var;
-   if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
+   if (likely(var != IXGBE_VPMD_DESC_DD_MASK)) {
+   while (stat & 0x01) {
+   ++var;
+   stat = stat >> 8;
+   }
+   nb_pkts_recd += var;
break;
+   } else {
+   nb_pkts_recd += RTE_IXGBE_DESCS_PER_LOOP;
+   }
}
 
/* Update our internal tail pointer */
-- 
1.8.3.1



[dpdk-dev] [PATCH v2 1/2] net/ixgbe: calculate the correct number of received packets in bulk alloc function

2017-02-04 Thread Jianbo Liu
To get better performance, Rx bulk alloc recv function will scan 8 descs
at one time, but the statuses are not consistent on ARM platform because
the memory allocated for Rx descriptors is cacheable hugepages.
This patch calculates the number of received packets by scanning the DD bit
sequentially, and stops when meeting the first packet with DD bit unset.

Signed-off-by: Jianbo Liu 
---
 drivers/net/ixgbe/ixgbe_rxtx.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 36f1c02..613890e 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1460,17 +1460,19 @@ static inline int __attribute__((always_inline))
for (i = 0; i < RTE_PMD_IXGBE_RX_MAX_BURST;
 i += LOOK_AHEAD, rxdp += LOOK_AHEAD, rxep += LOOK_AHEAD) {
/* Read desc statuses backwards to avoid race condition */
-   for (j = LOOK_AHEAD-1; j >= 0; --j)
+   for (j = 0; j < LOOK_AHEAD; j++)
s[j] = rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
 
-   for (j = LOOK_AHEAD - 1; j >= 0; --j)
-   pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
-  lo_dword.data);
+   rte_smp_rmb();
 
/* Compute how many status bits were set */
-   nb_dd = 0;
-   for (j = 0; j < LOOK_AHEAD; ++j)
-   nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
+   for (nb_dd = 0; nb_dd < LOOK_AHEAD &&
+   (s[nb_dd] & IXGBE_RXDADV_STAT_DD); nb_dd++)
+   ;
+
+   for (j = 0; j < nb_dd; j++)
+   pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
+  lo_dword.data);
 
nb_rx += nb_dd;
 
-- 
1.8.3.1



[dpdk-dev] [PATCH v3 1/2] net/ixgbe: calculate the correct number of received packets in bulk alloc function

2017-02-04 Thread Jianbo Liu
To get better performance, Rx bulk alloc recv function will scan 8 descs
at one time, but the statuses are not consistent on ARM platform because
the memory allocated for Rx descriptors is cacheable hugepages.
This patch calculates the number of received packets by scanning the DD bit
sequentially, and stops when meeting the first packet with DD bit unset.

Signed-off-by: Jianbo Liu 
---
 drivers/net/ixgbe/ixgbe_rxtx.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 36f1c02..613890e 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1460,17 +1460,19 @@ static inline int __attribute__((always_inline))
for (i = 0; i < RTE_PMD_IXGBE_RX_MAX_BURST;
 i += LOOK_AHEAD, rxdp += LOOK_AHEAD, rxep += LOOK_AHEAD) {
/* Read desc statuses backwards to avoid race condition */
-   for (j = LOOK_AHEAD-1; j >= 0; --j)
+   for (j = 0; j < LOOK_AHEAD; j++)
s[j] = rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
 
-   for (j = LOOK_AHEAD - 1; j >= 0; --j)
-   pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
-  lo_dword.data);
+   rte_smp_rmb();
 
/* Compute how many status bits were set */
-   nb_dd = 0;
-   for (j = 0; j < LOOK_AHEAD; ++j)
-   nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
+   for (nb_dd = 0; nb_dd < LOOK_AHEAD &&
+   (s[nb_dd] & IXGBE_RXDADV_STAT_DD); nb_dd++)
+   ;
+
+   for (j = 0; j < nb_dd; j++)
+   pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
+  lo_dword.data);
 
nb_rx += nb_dd;
 
-- 
1.8.3.1



[dpdk-dev] [PATCH v3 2/2] net/ixgbe: calculate correct number of received packets for ARM NEON-version vPMD

2017-02-04 Thread Jianbo Liu
vPMD will check 4 descs at one time, but the statuses are not consistent
because the memory allocated for RX descriptors is cacheable hugepage.
This patch calculates the number of received packets by scanning the DD bit
sequentially, and stops when meeting the first packet with DD bit unset.

Signed-off-by: Jianbo Liu 
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 30 +++---
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c 
b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index f96cc85..2a61322 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -196,7 +196,6 @@
struct ixgbe_rx_entry *sw_ring;
uint16_t nb_pkts_recd;
int pos;
-   uint64_t var;
uint8x16_t shuf_msk = {
0xFF, 0xFF,
0xFF, 0xFF,  /* skip 32 bits pkt_type */
@@ -255,15 +254,15 @@
uint64x2_t mbp1, mbp2;
uint8x16_t staterr;
uint16x8_t tmp;
+   uint32_t var = 0;
uint32_t stat;
 
/* B.1 load 1 mbuf point */
mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
 
-   /* Read desc statuses backwards to avoid race condition */
-   /* A.1 load 4 pkts desc */
-   descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
-   rte_rmb();
+   /* A.1 load 1 pkts desc */
+   descs[0] =  vld1q_u64((uint64_t *)(rxdp));
+   rte_smp_rmb();
 
/* B.2 copy 2 mbuf point into rx_pkts  */
vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
@@ -271,10 +270,11 @@
/* B.1 load 1 mbuf point */
mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
 
-   descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
-   /* B.1 load 2 mbuf point */
descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
-   descs[0] =  vld1q_u64((uint64_t *)(rxdp));
+
+   /* A.1 load 2 pkts descs */
+   descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
+   descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
 
/* B.2 copy 2 mbuf point into rx_pkts  */
vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
@@ -349,11 +349,19 @@
vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
 pkt_mb1);
 
+   stat &= IXGBE_VPMD_DESC_DD_MASK;
+
/* C.4 calc avaialbe number of desc */
-   var =  __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
-   nb_pkts_recd += var;
-   if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
+   if (likely(stat != IXGBE_VPMD_DESC_DD_MASK)) {
+   while (stat & 0x01) {
+   ++var;
+   stat = stat >> 8;
+   }
+   nb_pkts_recd += var;
break;
+   } else {
+   nb_pkts_recd += RTE_IXGBE_DESCS_PER_LOOP;
+   }
}
 
/* Update our internal tail pointer */
-- 
1.8.3.1



Re: [dpdk-dev] [PATCH v3 1/2] net/ixgbe: calculate the correct number of received packets in bulk alloc function

2017-02-04 Thread Jianbo Liu
On 5 February 2017 at 00:37, Jianbo Liu  wrote:
> To get better performance, Rx bulk alloc recv function will scan 8 descs
> in one time, but the statuses are not consistent on ARM platform because
> the memory allocated for Rx descriptors is cacheable hugepages.
> This patch is to calculate the number of received packets by scan DD bit
> sequentially, and stops when meeting the first packet with DD bit unset.
>
> Signed-off-by: Jianbo Liu 
> ---
>  drivers/net/ixgbe/ixgbe_rxtx.c | 16 +---
>  1 file changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index 36f1c02..613890e 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c

There is no change in this patch from v2 to v3.
But in the other patch of this patchset, reading desc statuses is changed
to be in order, not backward.


Re: [dpdk-dev] [PATCH v2 1/2] net/ixgbe: calculate the correct number of received packets in bulk alloc function

2017-02-08 Thread Jianbo Liu
On 9 February 2017 at 03:53, Ananyev, Konstantin
 wrote:
>
>
>> -----Original Message-----
>> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Ananyev, Konstantin
>> Sent: Wednesday, February 8, 2017 6:54 PM
>> To: Yigit, Ferruh ; Jianbo Liu 
>> ; dev@dpdk.org; Zhang, Helin ;
>> jerin.ja...@caviumnetworks.com
>> Subject: Re: [dpdk-dev] [PATCH v2 1/2] net/ixgbe: calculate the correct 
>> number of received packets in bulk alloc function
>>
>> Hi Ferruh,
>>
>> >
>> > On 2/4/2017 1:26 PM, Ananyev, Konstantin wrote:
>> > >>
>> > >> To get better performance, Rx bulk alloc recv function will scan 8 descs
>> > >> in one time, but the statuses are not consistent on ARM platform because
>> > >> the memory allocated for Rx descriptors is cacheable hugepages.
>> > >> This patch is to calculate the number of received packets by scan DD bit
>> > >> sequentially, and stops when meeting the first packet with DD bit unset.
>> > >>
>> > >> Signed-off-by: Jianbo Liu 
>> > >> ---
>> > >>  drivers/net/ixgbe/ixgbe_rxtx.c | 16 +---
>> > >>  1 file changed, 9 insertions(+), 7 deletions(-)
>> > >>
>> > >> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c 
>> > >> b/drivers/net/ixgbe/ixgbe_rxtx.c
>> > >> index 36f1c02..613890e 100644
>> > >> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
>> > >> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
>> > >> @@ -1460,17 +1460,19 @@ static inline int __attribute__((always_inline))
>> > >>  for (i = 0; i < RTE_PMD_IXGBE_RX_MAX_BURST;
>> > >>   i += LOOK_AHEAD, rxdp += LOOK_AHEAD, rxep += LOOK_AHEAD) {
>> > >>  /* Read desc statuses backwards to avoid race 
>> > >> condition */
>> > >> -for (j = LOOK_AHEAD-1; j >= 0; --j)
>> > >> +for (j = 0; j < LOOK_AHEAD; j++)
>> > >>  s[j] = 
>> > >> rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
>> > >>
>> > >> -for (j = LOOK_AHEAD - 1; j >= 0; --j)
>> > >> -pkt_info[j] = 
>> > >> rte_le_to_cpu_32(rxdp[j].wb.lower.
>> > >> -   lo_dword.data);
>> > >> +rte_smp_rmb();
>> > >>
>> > >>  /* Compute how many status bits were set */
>> > >> -nb_dd = 0;
>> > >> -for (j = 0; j < LOOK_AHEAD; ++j)
>> > >> -nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
>> > >> +for (nb_dd = 0; nb_dd < LOOK_AHEAD &&
>> > >> +(s[nb_dd] & IXGBE_RXDADV_STAT_DD); 
>> > >> nb_dd++)
>> > >> +;
>> > >> +
>> > >> +for (j = 0; j < nb_dd; j++)
>> > >> +pkt_info[j] = 
>> > >> rte_le_to_cpu_32(rxdp[j].wb.lower.
>> > >> +   lo_dword.data);
>> > >>
>> > >>  nb_rx += nb_dd;
>> > >>
>> > >> --
>> > >
>> > > Acked-by: Konstantin Ananyev 
>> >
>> > Hi Konstantin,
>> >
>> > Is the ack valid for v3 and both patches?
>>
>> No, I didn't look into the second one in details.
>> It is ARM specific, and I left it for people who are more familiar with ARM 
>> than me :)
>> Konstantin
>
> Actually, I had a quick look after your mail.
>
> +   /* A.1 load 1 pkts desc */
> +   descs[0] =  vld1q_u64((uint64_t *)(rxdp));
> +   rte_smp_rmb();
>
> /* B.2 copy 2 mbuf point into rx_pkts  */
> vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
> @@ -271,10 +270,11 @@
> /* B.1 load 1 mbuf point */
> mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
>
> -   descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
> -   /* B.1 load 2 mbuf point */
> descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
> -   descs[0] =  vld1q_u64((uint64_t *)(rxdp));
> +
> +   /* A.1 load 2 pkts descs */
> +   descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
> +   descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
>
> Assuming that on all ARM-NEON platforms 16B reads are atomic,
> I think there is no need for smp_rmb() after the desc[0] read.
> What looks more appropriate to me:

With checking DDs in sequence, it doesn't matter much where the rmb is.
But there is a little performance improvement (0.02%) in my testing
with your suggestion.
So I'll send a new version. Thanks!

>
> descs[0] =  vld1q_u64((uint64_t *)(rxdp));
> descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
> descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
> descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
>
> rte_smp_rmb();
>
> ...
>
> But, as I said would be good if some ARM guys have a look here.
> Konstantin
>
>
>>
>> >
>> > Thanks,
>> > ferruh
>> >
>> > >
>> > >> 1.8.3.1
>> > >
>


[dpdk-dev] [PATCH v4 1/2] net/ixgbe: calculate the correct number of received packets in bulk alloc function

2017-02-08 Thread Jianbo Liu
To get better performance, Rx bulk alloc recv function will scan 8 descs
at one time, but the statuses are not consistent on ARM platform because
the memory allocated for Rx descriptors is cacheable hugepages.
This patch calculates the number of received packets by scanning the DD bit
sequentially, and stops when meeting the first packet with DD bit unset.

Signed-off-by: Jianbo Liu 
---
 drivers/net/ixgbe/ixgbe_rxtx.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 36f1c02..613890e 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1460,17 +1460,19 @@ static inline int __attribute__((always_inline))
for (i = 0; i < RTE_PMD_IXGBE_RX_MAX_BURST;
 i += LOOK_AHEAD, rxdp += LOOK_AHEAD, rxep += LOOK_AHEAD) {
/* Read desc statuses backwards to avoid race condition */
-   for (j = LOOK_AHEAD-1; j >= 0; --j)
+   for (j = 0; j < LOOK_AHEAD; j++)
s[j] = rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
 
-   for (j = LOOK_AHEAD - 1; j >= 0; --j)
-   pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
-  lo_dword.data);
+   rte_smp_rmb();
 
/* Compute how many status bits were set */
-   nb_dd = 0;
-   for (j = 0; j < LOOK_AHEAD; ++j)
-   nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
+   for (nb_dd = 0; nb_dd < LOOK_AHEAD &&
+   (s[nb_dd] & IXGBE_RXDADV_STAT_DD); nb_dd++)
+   ;
+
+   for (j = 0; j < nb_dd; j++)
+   pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
+  lo_dword.data);
 
nb_rx += nb_dd;
 
-- 
1.8.3.1



[dpdk-dev] [PATCH v4 2/2] net/ixgbe: calculate correct number of received packets for ARM NEON-version vPMD

2017-02-08 Thread Jianbo Liu
vPMD will check 4 descs at a time, but the statuses are not consistent
because the memory allocated for RX descriptors is cacheable hugepages.
This patch calculates the number of received packets by scanning the DD bits
sequentially, stopping at the first packet whose DD bit is unset.

Signed-off-by: Jianbo Liu 
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 29 +
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c 
b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index f96cc85..e2715cb 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -196,7 +196,6 @@
struct ixgbe_rx_entry *sw_ring;
uint16_t nb_pkts_recd;
int pos;
-   uint64_t var;
uint8x16_t shuf_msk = {
0xFF, 0xFF,
0xFF, 0xFF,  /* skip 32 bits pkt_type */
@@ -255,26 +254,24 @@
uint64x2_t mbp1, mbp2;
uint8x16_t staterr;
uint16x8_t tmp;
+   uint32_t var = 0;
uint32_t stat;
 
/* B.1 load 1 mbuf point */
mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
 
-   /* Read desc statuses backwards to avoid race condition */
-   /* A.1 load 4 pkts desc */
-   descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
-   rte_rmb();
-
/* B.2 copy 2 mbuf point into rx_pkts  */
vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
 
/* B.1 load 1 mbuf point */
mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
 
-   descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
-   /* B.1 load 2 mbuf point */
-   descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
+   /* A. load 4 pkts descs */
descs[0] =  vld1q_u64((uint64_t *)(rxdp));
+   descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
+   descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
+   descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
+   rte_smp_rmb();
 
/* B.2 copy 2 mbuf point into rx_pkts  */
vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
@@ -349,11 +346,19 @@
vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
 pkt_mb1);
 
+   stat &= IXGBE_VPMD_DESC_DD_MASK;
+
/* C.4 calc avaialbe number of desc */
-   var =  __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
-   nb_pkts_recd += var;
-   if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
+   if (likely(stat != IXGBE_VPMD_DESC_DD_MASK)) {
+   while (stat & 0x01) {
+   ++var;
+   stat = stat >> 8;
+   }
+   nb_pkts_recd += var;
break;
+   } else {
+   nb_pkts_recd += RTE_IXGBE_DESCS_PER_LOOP;
+   }
}
 
/* Update our internal tail pointer */
-- 
1.8.3.1



Re: [dpdk-dev] [PATCH 00/13] Introducing EAL Bus-Device-Driver Model

2016-12-12 Thread Jianbo Liu
Hi Shreyansh,

On 7 December 2016 at 21:10, Shreyansh Jain  wrote:
> On Wednesday 07 December 2016 05:47 PM, David Marchand wrote:
>>
>> Hello Shreyansh,
>>
>> On Wed, Dec 7, 2016 at 10:55 AM, Shreyansh Jain 
>> wrote:
>>>
>>> On Wednesday 07 December 2016 02:22 AM, David Marchand wrote:
>
> 0002~0003: Introducing the basic Bus model and associated test case
> 0005:  Support insertion of device rather than addition to tail



 Patch 2 and 5 could be squashed.
>>>
>>>
>>>
>>> I deliberately kept them separate. I intend to extend the Patch 5 for
>>> hotplugging. But, if I don't end up adding support for that in this
>>> series,
>>> I will merge these two.
>>
>>
>> Fine.
>>
>>
 The constructor priority stuff seems unneeded as long as we use
 explicit reference to a global (or local, did not check) bus symbol
 rather than a runtime lookup.
>>>
>>>
>>>
>>> I didn't understand your point here.
>>> IMO, constructor priority (or some other way to handle this) is
>>> important. I
>>> faced this issue while verifying it at my end when the drivers were
>>> getting
>>> registered before the bus.
>>>
>>> Can you elaborate more on '..use explicit reference to a global...'?
>>
>>
>> The drivers register themselves to a bus using this bus specific api.
>>
>> For pci, this is rte_eal_pci_register().
>> The pci_bus object must be moved to eal_common_pci.c (we can still
>> internally expose for bsd / linux specific implementations).
>> Then, rte_eal_pci_register() can add the pci driver to the pci_bus
>> drivers list even if this pci_bus object is not registered yet to the
>> buses list.
>
>
> So, in eal_common_bus.c
>
> --->8---
>
> struct rte_bus *global_ptr_to_pci_bus = NULL;
>
> struct rte_bus pci_bus = { ... };
>
> rte_eal_pci_register() {
> if (global_ptr_to_pci_bus == NULL)
> rte_eal_bus_register(&pci_bus)
> else
>// continue as if PCI bus is registered
> }
>
> --->8---
>
> so, no RTE_REGISTER_BUS()?
>
> If yes, then RTE_REGISTER_BUS() should also check for an existing
> registration for duplication.
>
> I was banking on a model where bus handlers (or bus drivers) are independent
> entities, just like PMDs. So, we have a bus XYZ without any drivers
> necessarily based on it.
>
> By making registration dependent on driver registration, it becomes implicit
> that buses don't exist without drivers.
> I am not in favor of this - or maybe I lack enough reason for this (about
> how it will make framework/PMD life better).
>
>>
>> And no constructor order issue ?
>>
>>


> 0004:  Add scan and match callbacks for the Bus and updated test
> case



 Why do you push back the bus object in the 'scan' method ?
 This method is bus specific which means that the code "knows" the
 object registered with the callback.
>>>
>>>
>>>
>>> This 'knows' is the grey area for me.
>>> The bus (for example, PCI) after scanning needs to call
>>> rte_eal_bus_add_device() to link the device in bus's device_list.
>>>
>>> Three options:
>>> 1. Have a global reference to "pci" bus (rte_bus) somewhere in eal_pci.c
>>> 2. Call rte_eal_get_bus() every time someone needs the reference.
>>> 3. C++ style, 'this->'.
>>>
>>> I have taken the 3rd path. It simplifies my code to not assume a handle
>>> as
>>> well as not allow for reference fetch calls every now and then.
>>>
>>> As a disadvantage: it means passing this as argument - and some cases
>>> maintaining it as __rte_unused.
>>>
>>> Taking (1) or (2) is no more advantageous than this approach.
>>
>>
>> 1) is the simplest one.
>>
>> When you write a pci_scan method and embed it in you pci_bus object,
>> but this pci_scan method still wonders which bus object it is supposed
>> to work on, this is a bit like Schizophrenia ;-).
>
>
> :)
> This now is linked to the above issue of constructor priority and having a
> global bus reference. I don't personally prefer it.
> I will still give this a serious thought, though.
>

I'm also in favor of (3).

>>
>>
 Is it that you want to have a single scan method used by multiple buses
 ?
>>>
>>>
>>>
>>> Yes, but only as a use case. For example, platform devices are of various
>>> types - what if we have a south-bound bus over a platform bus. In which
>>> case, a hierarchical bus layout is possible.
>>> But, this is far-fetched idea for now.
>>

How to express the hierarchical bus layout as the bus in your design
is more like independent objects to hold drivers and their devices?

>>
>> Well, if you have no usecase at the moment, let's keep it simple, please.
>>
>
> Ok.
>
>>

> 0006:  Integrate bus scan/match with EAL, without any effective
> driver



 Hard to find a right balance in patch splitting, but patch 4 and 6 are
 linked, I would squash them into one.
>>>
>>>
>>>
>>> Yes, it is hard and sometimes there is simply no strong rationale for
>>> splitting or merging. This is one of those cases.
>>> My idea was that one p

Re: [dpdk-dev] [PATCH 08/28] eal/arm64: define smp barrier definition for arm64

2016-12-15 Thread Jianbo Liu
On 14 December 2016 at 09:55, Jerin Jacob
 wrote:
> dmb instruction based barrier is used for smp version of memory barrier.
>
> Signed-off-by: Jerin Jacob 
> ---
>  lib/librte_eal/common/include/arch/arm/rte_atomic_64.h | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h 
> b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> index bc7de64..78ebea2 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> @@ -82,11 +82,11 @@ static inline void rte_rmb(void)
> dsb(ld);
>  }
>
> -#define rte_smp_mb() rte_mb()
> +#define rte_smp_mb() dmb(ish)
>
> -#define rte_smp_wmb() rte_wmb()
> +#define rte_smp_wmb() dmb(ishst)
>
> -#define rte_smp_rmb() rte_rmb()
> +#define rte_smp_rmb() dmb(ishld)
>

rte_*mb are inline functions, while rte_smp_*mb are macro. As they are
all derived from dsb/dmb, can you keep them consistent?

>  #ifdef __cplusplus
>  }
> --
> 2.5.5
>


Re: [dpdk-dev] [PATCH 23/28] net/ixgbe: use eal I/O device memory read/write API

2016-12-15 Thread Jianbo Liu
On 14 December 2016 at 09:55, Jerin Jacob
 wrote:
> From: Santosh Shukla 
>
> Replace the raw I/O device memory read/write access with eal
> abstraction for I/O device memory read/write access to fix
> portability issues across different architectures.
>
> Signed-off-by: Santosh Shukla 
> Signed-off-by: Jerin Jacob 
> CC: Helin Zhang 
> CC: Konstantin Ananyev 
> ---
>  drivers/net/ixgbe/base/ixgbe_osdep.h | 13 +
>  1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h 
> b/drivers/net/ixgbe/base/ixgbe_osdep.h
> index 77f0af5..9d16c21 100644
> --- a/drivers/net/ixgbe/base/ixgbe_osdep.h
> +++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
> @@ -44,6 +44,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #include "../ixgbe_logs.h"
>  #include "../ixgbe_bypass_defines.h"
> @@ -121,16 +122,20 @@ typedef int   bool;
>
>  #define prefetch(x) rte_prefetch0(x)
>
> -#define IXGBE_PCI_REG(reg) (*((volatile uint32_t *)(reg)))
> +#define IXGBE_PCI_REG(reg) ({  \
> +   uint32_t __val; \
> +   __val = rte_readl(reg); \
> +   __val;  \
> +})
>
>  static inline uint32_t ixgbe_read_addr(volatile void* addr)
>  {
> return rte_le_to_cpu_32(IXGBE_PCI_REG(addr));
>  }
>
> -#define IXGBE_PCI_REG_WRITE(reg, value) do { \
> -   IXGBE_PCI_REG((reg)) = (rte_cpu_to_le_32(value)); \
> -} while(0)
> +#define IXGBE_PCI_REG_WRITE(reg, value) ({ \
> +   rte_writel(rte_cpu_to_le_32(value), reg);   \
> +})
>

memory barrier operation is put inside IXGBE_PCI_REG_READ/WRITE in
your change, but I found rte_*mb is called before these macros in some
places.
Can you remove all these redundant calls? And please do the same
checking for other drivers.

>  #define IXGBE_PCI_REG_ADDR(hw, reg) \
> ((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
> --
> 2.5.5
>


Re: [dpdk-dev] [PATCH 13/28] eal/arm64: override I/O device read/write access for arm64

2016-12-15 Thread Jianbo Liu
On 14 December 2016 at 09:55, Jerin Jacob
 wrote:
> Override the generic I/O device memory read/write access and implement it
> using armv8 instructions for arm64.
>
> Signed-off-by: Jerin Jacob 
> ---
>  lib/librte_eal/common/include/arch/arm/rte_io.h|   4 +
>  lib/librte_eal/common/include/arch/arm/rte_io_64.h | 183 
> +
>  2 files changed, 187 insertions(+)
>  create mode 100644 lib/librte_eal/common/include/arch/arm/rte_io_64.h
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_io.h 
> b/lib/librte_eal/common/include/arch/arm/rte_io.h
> index 74c1f2c..9593b42 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_io.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_io.h
> @@ -38,7 +38,11 @@
>  extern "C" {
>  #endif
>
> +#ifdef RTE_ARCH_64
> +#include "rte_io_64.h"
> +#else
>  #include "generic/rte_io.h"
> +#endif
>
>  #ifdef __cplusplus
>  }
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_io_64.h 
> b/lib/librte_eal/common/include/arch/arm/rte_io_64.h
> new file mode 100644
> index 000..09e7a89
> --- /dev/null
> +++ b/lib/librte_eal/common/include/arch/arm/rte_io_64.h
> @@ -0,0 +1,183 @@
> +/*
> + *   BSD LICENSE
> + *
> + *   Copyright (C) Cavium networks Ltd. 2016.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + *   notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + *   notice, this list of conditions and the following disclaimer in
> + *   the documentation and/or other materials provided with the
> + *   distribution.
> + * * Neither the name of Cavium networks nor the names of its
> + *   contributors may be used to endorse or promote products derived
> + *   from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _RTE_IO_ARM64_H_
> +#define _RTE_IO_ARM64_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include 
> +
> +#define RTE_OVERRIDE_IO_H
> +
> +#include "generic/rte_io.h"
> +#include "rte_atomic_64.h"
> +
> +static inline __attribute__((always_inline)) uint8_t
> +__rte_arm64_readb(const volatile void *addr)
> +{
> +   uint8_t val;
> +
> +   asm volatile(
> +   "ldrb %w[val], [%x[addr]]"
> +   : [val] "=r" (val)
> +   : [addr] "r" (addr));
> +   return val;
> +}
> +
> +static inline __attribute__((always_inline)) uint16_t
> +__rte_arm64_readw(const volatile void *addr)
> +{
> +   uint16_t val;
> +
> +   asm volatile(
> +   "ldrh %w[val], [%x[addr]]"
> +   : [val] "=r" (val)
> +   : [addr] "r" (addr));
> +   return val;
> +}
> +
> +static inline __attribute__((always_inline)) uint32_t
> +__rte_arm64_readl(const volatile void *addr)
> +{
> +   uint32_t val;
> +
> +   asm volatile(
> +   "ldr %w[val], [%x[addr]]"
> +   : [val] "=r" (val)
> +   : [addr] "r" (addr));
> +   return val;
> +}
> +
> +static inline __attribute__((always_inline)) uint64_t
> +__rte_arm64_readq(const volatile void *addr)
> +{
> +   uint64_t val;
> +
> +   asm volatile(
> +   "ldr %x[val], [%x[addr]]"
> +   : [val] "=r" (val)
> +   : [addr] "r" (addr));
> +   return val;
> +}
> +
> +static inline __attribute__((always_inline)) void
> +__rte_arm64_writeb(uint8_t val, volatile void *addr)
> +{
> +   asm volatile(
> +   "strb %w[val], [%x[addr]]"
> +   :
> +   : [val] "r" (val), [addr] "r" (addr));
> +}
> +
> +static inline __attribute__((always_inline)) void
> +__rte_arm64_writew(uint16_t val, volatile void *addr)
> +{
> +   asm volatile(
> +   "strh %w[val], [%x[addr]]"
> +   :
> +   : [val] "r" (val), [addr] "r" (addr));
> +}
> +
> +

Re: [dpdk-dev] [PATCH 13/28] eal/arm64: override I/O device read/write access for arm64

2016-12-15 Thread Jianbo Liu
On 15 December 2016 at 18:04, Jerin Jacob
 wrote:
> On Thu, Dec 15, 2016 at 05:53:05PM +0800, Jianbo Liu wrote:
>> On 14 December 2016 at 09:55, Jerin Jacob
>>  wrote:
>> > Override the generic I/O device memory read/write access and implement it
>> > using armv8 instructions for arm64.
>> >
>> > Signed-off-by: Jerin Jacob 
>> > ---
>> >  lib/librte_eal/common/include/arch/arm/rte_io.h|   4 +
>> >  lib/librte_eal/common/include/arch/arm/rte_io_64.h | 183 
>> > +
>> >  2 files changed, 187 insertions(+)
>> >  create mode 100644 lib/librte_eal/common/include/arch/arm/rte_io_64.h
>> >
>> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_io.h 
>> > b/lib/librte_eal/common/include/arch/arm/rte_io.h
>> > index 74c1f2c..9593b42 100644
>> > --- a/lib/librte_eal/common/include/arch/arm/rte_io.h
>> > +++ b/lib/librte_eal/common/include/arch/arm/rte_io.h
>> > @@ -38,7 +38,11 @@
>> >  extern "C" {
>> >  #endif
>> >
>> > +#ifdef RTE_ARCH_64
>> > +#include "rte_io_64.h"
>> > +#else
>> >  #include "generic/rte_io.h"
>> > +#endif
>> >
>> >  #ifdef __cplusplus
>> >  }
>> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_io_64.h 
>> > b/lib/librte_eal/common/include/arch/arm/rte_io_64.h
>> > new file mode 100644
>> > index 000..09e7a89
>> > --- /dev/null
>> > +++ b/lib/librte_eal/common/include/arch/arm/rte_io_64.h
>> > @@ -0,0 +1,183 @@
>> > +/*
>> > + *   BSD LICENSE
>> > + *
>> > + *   Copyright (C) Cavium networks Ltd. 2016.
>> > + *
>> > + *   Redistribution and use in source and binary forms, with or without
>> > + *   modification, are permitted provided that the following conditions
>> > + *   are met:
>> > + *
>> > + * * Redistributions of source code must retain the above copyright
>> > + *   notice, this list of conditions and the following disclaimer.
>> > + * * Redistributions in binary form must reproduce the above copyright
>> > + *   notice, this list of conditions and the following disclaimer in
>> > + *   the documentation and/or other materials provided with the
>> > + *   distribution.
>> > + * * Neither the name of Cavium networks nor the names of its
>> > + *   contributors may be used to endorse or promote products derived
>> > + *   from this software without specific prior written permission.
>> > + *
>> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
>> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
>> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
>> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
>> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
>> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
>> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
>> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
>> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> > + */
>> > +
>> > +#ifndef _RTE_IO_ARM64_H_
>> > +#define _RTE_IO_ARM64_H_
>> > +
>> > +#ifdef __cplusplus
>> > +extern "C" {
>> > +#endif
>> > +
>> > +#include 
>> > +
>> > +#define RTE_OVERRIDE_IO_H
>> > +
>> > +#include "generic/rte_io.h"
>> > +#include "rte_atomic_64.h"
>> > +
>> > +static inline __attribute__((always_inline)) uint8_t
>> > +__rte_arm64_readb(const volatile void *addr)
>> > +{
>> > +   uint8_t val;
>> > +
>> > +   asm volatile(
>> > +   "ldrb %w[val], [%x[addr]]"
>> > +   : [val] "=r" (val)
>> > +   : [addr] "r" (addr));
>> > +   return val;
>> > +}
>> > +
>> > +static inline __attribute__((always_inline)) uint16_t
>> > +__rte_arm64_readw(const volatile void *addr)
>> > +{
>> > +   uint16_t val;
>>

Re: [dpdk-dev] [PATCH 13/28] eal/arm64: override I/O device read/write access for arm64

2016-12-16 Thread Jianbo Liu
On 15 December 2016 at 19:08, Jerin Jacob
 wrote:
> On Thu, Dec 15, 2016 at 06:17:32PM +0800, Jianbo Liu wrote:
>> On 15 December 2016 at 18:04, Jerin Jacob
>>  wrote:
>> > On Thu, Dec 15, 2016 at 05:53:05PM +0800, Jianbo Liu wrote:
>> >> On 14 December 2016 at 09:55, Jerin Jacob
>> >>  wrote:
>> >> > Override the generic I/O device memory read/write access and implement 
>> >> > it
>> >> > using armv8 instructions for arm64.
>> >> >
>> >> > Signed-off-by: Jerin Jacob 
>> >> > ---
>> >> >  lib/librte_eal/common/include/arch/arm/rte_io.h|   4 +
>> >> >  lib/librte_eal/common/include/arch/arm/rte_io_64.h | 183 
>> >> > +
>> >> >  2 files changed, 187 insertions(+)
>> >> >  create mode 100644 lib/librte_eal/common/include/arch/arm/rte_io_64.h
>> >> >
>> >> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_io.h 
>> >> > b/lib/librte_eal/common/include/arch/arm/rte_io.h
>> >> > index 74c1f2c..9593b42 100644
>> >> > --- a/lib/librte_eal/common/include/arch/arm/rte_io.h
>> >> > +++ b/lib/librte_eal/common/include/arch/arm/rte_io.h
>> >> > @@ -38,7 +38,11 @@
>> >> >  extern "C" {
>> >> >  #endif
>> >> >
>> >> > +#ifdef RTE_ARCH_64
>> >> > +#include "rte_io_64.h"
>> >> > +#else
>> >> >  #include "generic/rte_io.h"
>> >> > +#endif
>> >> >
>> >> >  #ifdef __cplusplus
>> >> >  }
>> >> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_io_64.h 
>> >> > b/lib/librte_eal/common/include/arch/arm/rte_io_64.h
>> >> > new file mode 100644
>> >> > index 000..09e7a89
>> >> > --- /dev/null
>> >> > +++ b/lib/librte_eal/common/include/arch/arm/rte_io_64.h
>> >> > @@ -0,0 +1,183 @@
>> >> > +/*
>> >> > + *   BSD LICENSE
>> >> > + *
>> >> > + *   Copyright (C) Cavium networks Ltd. 2016.
>> >> > + *
>> >> > + *   Redistribution and use in source and binary forms, with or without
>> >> > + *   modification, are permitted provided that the following conditions
>> >> > + *   are met:
>> >> > + *
>> >> > + * * Redistributions of source code must retain the above copyright
>> >> > + *   notice, this list of conditions and the following disclaimer.
>> >> > + * * Redistributions in binary form must reproduce the above 
>> >> > copyright
>> >> > + *   notice, this list of conditions and the following disclaimer 
>> >> > in
>> >> > + *   the documentation and/or other materials provided with the
>> >> > + *   distribution.
>> >> > + * * Neither the name of Cavium networks nor the names of its
>> >> > + *   contributors may be used to endorse or promote products 
>> >> > derived
>> >> > + *   from this software without specific prior written permission.
>> >> > + *
>> >> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 
>> >> > CONTRIBUTORS
>> >> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>> >> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
>> >> > FOR
>> >> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
>> >> > COPYRIGHT
>> >> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
>> >> > INCIDENTAL,
>> >> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
>> >> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 
>> >> > USE,
>> >> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
>> >> > ANY
>> >> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 
>> >> > TORT
>> >> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 
>> >> > USE
>> >> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
>> >> > DAMAGE.
>> >> > + */
>> >> > +
>> >> >

[dpdk-dev] [PATCH 2/2] net/ixgbe: calculate correct number of received packets for ARM NEON-version vPMD

2016-12-18 Thread Jianbo Liu
vPMD will check 4 descriptors at a time, but the statuses are not consistent
because the memory allocated for RX descriptors is cacheable hugepages.
This patch calculates the number of received packets by scanning the DD bits
sequentially, stopping at the first packet whose DD bit is unset.

Signed-off-by: Jianbo Liu 
---
 drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c 
b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index f96cc85..0b1338d 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
@@ -196,7 +196,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
struct ixgbe_rx_entry *sw_ring;
uint16_t nb_pkts_recd;
int pos;
-   uint64_t var;
uint8x16_t shuf_msk = {
0xFF, 0xFF,
0xFF, 0xFF,  /* skip 32 bits pkt_type */
@@ -255,6 +254,7 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
uint64x2_t mbp1, mbp2;
uint8x16_t staterr;
uint16x8_t tmp;
+   uint32_t var = 0;
uint32_t stat;
 
/* B.1 load 1 mbuf point */
@@ -349,11 +349,19 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
 pkt_mb1);
 
+   stat &= IXGBE_VPMD_DESC_DD_MASK;
+
/* C.4 calc avaialbe number of desc */
-   var =  __builtin_popcount(stat & IXGBE_VPMD_DESC_DD_MASK);
-   nb_pkts_recd += var;
-   if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
+   if (likely(var != IXGBE_VPMD_DESC_DD_MASK)) {
+   while (stat & 0x01) {
+   ++var;
+   stat = stat >> 8;
+   }
+   nb_pkts_recd += var;
break;
+   } else {
+   nb_pkts_recd += RTE_IXGBE_DESCS_PER_LOOP;
+   }
}
 
/* Update our internal tail pointer */
-- 
2.4.11



[dpdk-dev] [PATCH 1/2] net/ixgbe: calculate the correct number of received packets in bulk alloc function

2016-12-18 Thread Jianbo Liu
To get better performance, Rx bulk alloc recv function will scan 8 descriptors
in one time, but the statuses are not consistent on ARM platform because
the memory allocated for Rx descriptors is cacheable hugepages.
This patch is to calculate the number of received packets by scanning DD bit
sequentially, and stops when meeting the first packet with DD bit unset.

Signed-off-by: Jianbo Liu 
---
 drivers/net/ixgbe/ixgbe_rxtx.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index b2d9f45..2866bdb 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -1402,17 +1402,21 @@ ixgbe_rx_scan_hw_ring(struct ixgbe_rx_queue *rxq)
for (i = 0; i < RTE_PMD_IXGBE_RX_MAX_BURST;
 i += LOOK_AHEAD, rxdp += LOOK_AHEAD, rxep += LOOK_AHEAD) {
/* Read desc statuses backwards to avoid race condition */
-   for (j = LOOK_AHEAD-1; j >= 0; --j)
+   for (j = LOOK_AHEAD - 1; j >= 0; --j) {
s[j] = rte_le_to_cpu_32(rxdp[j].wb.upper.status_error);
-
-   for (j = LOOK_AHEAD - 1; j >= 0; --j)
pkt_info[j] = rte_le_to_cpu_32(rxdp[j].wb.lower.
   lo_dword.data);
+   }
+
+   rte_smp_rmb();
 
/* Compute how many status bits were set */
nb_dd = 0;
for (j = 0; j < LOOK_AHEAD; ++j)
-   nb_dd += s[j] & IXGBE_RXDADV_STAT_DD;
+   if (s[j] & IXGBE_RXDADV_STAT_DD)
+   ++nb_dd;
+   else
+   break;
 
nb_rx += nb_dd;
 
-- 
2.4.11



Re: [dpdk-dev] [PATCH 2/2] net/ixgbe: calculate correct number of received packets for ARM NEON-version vPMD

2016-12-21 Thread Jianbo Liu
Hi Jerin,

On 21 December 2016 at 18:08, Jerin Jacob
 wrote:
> On Mon, Dec 19, 2016 at 11:39:18AM +0530, Jianbo Liu wrote:
>
> Hi Jianbo,
>
>> vPMD will check 4 descriptors in one time, but the statuses are not 
>> consistent
>> because the memory allocated for RX descriptors is cacheable hugepage.
> Is it different in X86 case ?i.e Is x86 creating non cacheable hugepages?
> I am just looking at what it takes to fix similar issues for all drivers wrt 
> armv8.
>
> Are you able to reproduce this issue on any armv8 platform? If so, could
> you please share the platform details and commands to reproduce this issue?
>

I have tested on Huawei D03 and Softiron with Intel X540, same issue
for both of them.
The setup is very simple: loopback 2 ports, then run testpmd.


Re: [dpdk-dev] [PATCH 2/2] net/ixgbe: calculate correct number of received packets for ARM NEON-version vPMD

2016-12-21 Thread Jianbo Liu
On 21 December 2016 at 19:03, Bruce Richardson
 wrote:
> On Wed, Dec 21, 2016 at 03:38:51PM +0530, Jerin Jacob wrote:
>> On Mon, Dec 19, 2016 at 11:39:18AM +0530, Jianbo Liu wrote:
>>
>> Hi Jianbo,
>>
>> > vPMD will check 4 descriptors in one time, but the statuses are not 
>> > consistent
>> > because the memory allocated for RX descriptors is cacheable hugepage.
>> Is it different in X86 case ?i.e Is x86 creating non cacheable hugepages?
>
> This is not a problem on IA, because the instruction ordering rules on
> IA guarantee that the reads will be done in the correct program order,
> and we never get stale cache data.
>

Yes, I think it's an issue for ARM arch.
It's because more than one cacheline-sized data (4/8 descriptors can
be in two cachelines) will be read at one time in bulk alloc RX or
vPMD.
There is the same issue for i40e, I'll send the same patch later.


Re: [dpdk-dev] [PATCH 23/28] net/ixgbe: use eal I/O device memory read/write API

2016-12-22 Thread Jianbo Liu
Hi Santosh,

On 22 December 2016 at 20:36, Santosh Shukla
 wrote:
> Hi Jiangbo,
>
> On Thu, Dec 15, 2016 at 08:40:19PM -0800, Santosh Shukla wrote:
>> On Thu, Dec 15, 2016 at 04:37:12PM +0800, Jianbo Liu wrote:
>> > On 14 December 2016 at 09:55, Jerin Jacob
>> >  wrote:
>> > > From: Santosh Shukla 
>> > >
>> >
>> > memory barrier operation is put inside IXGBE_PCI_REG_READ/WRITE in
>> > your change, but I found rte_*mb is called before these macros in some
>> > places.
>> > Can you remove all these redundant calls? And please do the same
>> > checking for other drivers.
>> >
>>
>> Ok.
>>
>> Thinking of adding _relaxed_rd/wr style macro agnostic to arch for ixgbe case
>> in particular. Such that for those code incident:
>> x86 case> first default barrier + relaxed call.
>> arm case> first default barrier + relaxed call.
>>
>> Does that make sense to you? If so then will take care in v2.
>>
>> Santosh.
>
> We spend time looking at drivers code where double barrier
> may happen. Most of them are in driver init path,
> configuration/control path code. So keeping double
> barrier won't impact performance.
>
> We plan to replace only fast path code with _relaxed
> style API's. That way we won't impact each driver
> performance and we'll have the clean port.
>
> Does it make sense? Thought?
>

Yes, please continue your work.


Re: [dpdk-dev] [PATCH v2 07/29] eal/arm64: fix memory barrier definition for arm64

2017-01-02 Thread Jianbo Liu
On 27 December 2016 at 17:49, Jerin Jacob
 wrote:
> dsb instruction based barrier is used for non smp
> version of memory barrier.
>
> Fixes: d708f01b7102 ("eal/arm: add atomic operations for ARMv8")
>
> CC: Jianbo Liu 
> CC: sta...@dpdk.org
> Signed-off-by: Jerin Jacob 
> ---
>  lib/librte_eal/common/include/arch/arm/rte_atomic_64.h | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h 
> b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> index d854aac..bc7de64 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> @@ -43,7 +43,8 @@ extern "C" {
>
>  #include "generic/rte_atomic.h"
>
> -#define dmb(opt)  do { asm volatile("dmb " #opt : : : "memory"); } while (0)
> +#define dsb(opt)  { asm volatile("dsb " #opt : : : "memory"); }
> +#define dmb(opt)  { asm volatile("dmb " #opt : : : "memory"); }
>
>  /**
>   * General memory barrier.
> @@ -54,7 +55,7 @@ extern "C" {
>   */
>  static inline void rte_mb(void)
>  {
> -   dmb(ish);
> +   dsb(sy);
>  }
>
>  /**
> @@ -66,7 +67,7 @@ static inline void rte_mb(void)
>   */
>  static inline void rte_wmb(void)
>  {
> -   dmb(ishst);
> +   dsb(st);
>  }
>
>  /**
> @@ -78,7 +79,7 @@ static inline void rte_wmb(void)
>   */
>  static inline void rte_rmb(void)
>  {
> -   dmb(ishld);
> +   dsb(ld);
>  }
>
>  #define rte_smp_mb() rte_mb()
> --
> 2.5.5
>

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH v2 09/29] eal/arm64: define I/O device memory barriers for arm64

2017-01-02 Thread Jianbo Liu
On 27 December 2016 at 17:49, Jerin Jacob
 wrote:
> CC: Jianbo Liu 
> Signed-off-by: Jerin Jacob 
> ---
>  lib/librte_eal/common/include/arch/arm/rte_atomic_64.h | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h 
> b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> index 78ebea2..ef0efc7 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> @@ -88,6 +88,12 @@ static inline void rte_rmb(void)
>
>  #define rte_smp_rmb() dmb(ishld)
>
> +#define rte_io_mb() rte_mb()
> +
> +#define rte_io_wmb() rte_wmb()
> +
> +#define rte_io_rmb() rte_rmb()
> +

I think it's better to use outer shareable dmb for io barrier, instead of dsb.


Re: [dpdk-dev] [PATCH v2 14/29] eal/arm64: change barrier definitions to macros

2017-01-02 Thread Jianbo Liu
On 27 December 2016 at 17:49, Jerin Jacob
 wrote:
> Change rte_?wb definitions to macros in order to

use rte_*mb?

> keep consistent with other barrier definitions in
> the file.
>
> Suggested-by: Jianbo Liu 
> Signed-off-by: Jerin Jacob 
> ---
>  .../common/include/arch/arm/rte_atomic_64.h| 36 
> ++
>  1 file changed, 3 insertions(+), 33 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h 
> b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> index ef0efc7..dc3a0f3 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> @@ -46,41 +46,11 @@ extern "C" {
>  #define dsb(opt)  { asm volatile("dsb " #opt : : : "memory"); }
>  #define dmb(opt)  { asm volatile("dmb " #opt : : : "memory"); }
>
> -/**
> - * General memory barrier.
> - *
> - * Guarantees that the LOAD and STORE operations generated before the
> - * barrier occur before the LOAD and STORE operations generated after.
> - * This function is architecture dependent.
> - */
> -static inline void rte_mb(void)
> -{
> -   dsb(sy);
> -}
> +#define rte_mb() dsb(sy)
>
> -/**
> - * Write memory barrier.
> - *
> - * Guarantees that the STORE operations generated before the barrier
> - * occur before the STORE operations generated after.
> - * This function is architecture dependent.
> - */
> -static inline void rte_wmb(void)
> -{
> -   dsb(st);
> -}
> +#define rte_wmb() dsb(st)
>
> -/**
> - * Read memory barrier.
> - *
> - * Guarantees that the LOAD operations generated before the barrier
> - * occur before the LOAD operations generated after.
> - * This function is architecture dependent.
> - */

How about keeping the comments for all these macros?

> -static inline void rte_rmb(void)
> -{
> -   dsb(ld);
> -}
> +#define rte_rmb() dsb(ld)
>
>  #define rte_smp_mb() dmb(ish)
>
> --
> 2.5.5
>


Re: [dpdk-dev] [PATCH v2 09/29] eal/arm64: define I/O device memory barriers for arm64

2017-01-04 Thread Jianbo Liu
On 4 January 2017 at 18:01, Jerin Jacob  wrote:
> On Tue, Jan 03, 2017 at 03:48:32PM +0800, Jianbo Liu wrote:
>> On 27 December 2016 at 17:49, Jerin Jacob
>>  wrote:
>> > CC: Jianbo Liu 
>> > Signed-off-by: Jerin Jacob 
>> > ---
>> >  lib/librte_eal/common/include/arch/arm/rte_atomic_64.h | 6 ++
>> >  1 file changed, 6 insertions(+)
>> >
>> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h 
>> > b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> > index 78ebea2..ef0efc7 100644
>> > --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> > +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> > @@ -88,6 +88,12 @@ static inline void rte_rmb(void)
>> >
>> >  #define rte_smp_rmb() dmb(ishld)
>> >
>> > +#define rte_io_mb() rte_mb()
>> > +
>> > +#define rte_io_wmb() rte_wmb()
>> > +
>> > +#define rte_io_rmb() rte_rmb()
>> > +
>>
>> I think it's better to use outer shareable dmb for io barrier, instead of 
>> dsb.
>
> Its is difficult to generalize. AFAIK, from the IO barrier perspective
> dsb would be the right candidate. But just for the DMA barrier between IO may
> be outer sharable dmb is enough. In-terms of performance implication, the
> fastpath code(door bell write) has been changed to relaxed write in all
> the drivers in this patchset and rte_io_* will be only
> used by rte_[read/write]8/16/32/64 which will be in slow-path.
> So, IMO, it better stick with dsb and its safe from the complete IO barrier
> perspective.

If so, why not use *mb() directly?

>
> At least on ThunderX, I couldn't see any performance difference between
> using dsb(st) and dmb(oshst) for dma write barrier before the doorbell 
> register
> write in fastpath. In case there are platforms which has such performance 
> difference,
> may be could add rte_dma_wmb() and rte_dma_rmb() in future like Linux kernel
> dma_wmb() and dma_rmb().(But i couldn't  see all the driver are using it,
> though)
>

But there is no io_*mb() in the kernel, so you want to be different?


Re: [dpdk-dev] [PATCH v2 09/29] eal/arm64: define I/O device memory barriers for arm64

2017-01-04 Thread Jianbo Liu
On 5 January 2017 at 14:24, Jerin Jacob  wrote:
> On Thu, Jan 05, 2017 at 01:31:44PM +0800, Jianbo Liu wrote:
>> On 4 January 2017 at 18:01, Jerin Jacob  
>> wrote:
>> > On Tue, Jan 03, 2017 at 03:48:32PM +0800, Jianbo Liu wrote:
>> >> On 27 December 2016 at 17:49, Jerin Jacob
>> >>  wrote:
>> >> > CC: Jianbo Liu 
>> >> > Signed-off-by: Jerin Jacob 
>> >> > ---
>> >> >  lib/librte_eal/common/include/arch/arm/rte_atomic_64.h | 6 ++
>> >> >  1 file changed, 6 insertions(+)
>> >> >
>> >> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h 
>> >> > b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> >> > index 78ebea2..ef0efc7 100644
>> >> > --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> >> > +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
>> >> > @@ -88,6 +88,12 @@ static inline void rte_rmb(void)
>> >> >
>> >> >  #define rte_smp_rmb() dmb(ishld)
>> >> >
>> >> > +#define rte_io_mb() rte_mb()
>> >> > +
>> >> > +#define rte_io_wmb() rte_wmb()
>> >> > +
>> >> > +#define rte_io_rmb() rte_rmb()
>> >> > +
>> >>
>> >> I think it's better to use outer shareable dmb for io barrier, instead of 
>> >> dsb.
>> >
>> > Its is difficult to generalize. AFAIK, from the IO barrier perspective
>> > dsb would be the right candidate. But just for the DMA barrier between IO 
>> > may
>> > be outer sharable dmb is enough. In-terms of performance implication, the
>> > fastpath code(door bell write) has been changed to relaxed write in all
>> > the drivers in this patchset and rte_io_* will be only
>> > used by rte_[read/write]8/16/32/64 which will be in slow-path.
>> > So, IMO, it better stick with dsb and its safe from the complete IO barrier
>> > perspective.
>>
>> If so, why not use *mb() directly?
>
> Adding David Marchand, EAL Maintainer.
>
> Instead of rte_io_?. I thought, IO specific constraints can be abstracted
> here in rte_io_*. Apart from arm, there other arch like "arc" has similar
> constraints. IMHO, no harm in keeping that abstraction.
>
> Thoughts ?
>
> http://lxr.free-electrons.com/ident?i=__iormb
>
>>
>> >
>> > At least on ThunderX, I couldn't see any performance difference between
>> > using dsb(st) and dmb(oshst) for dma write barrier before the doorbell 
>> > register
>> > write in fastpath. In case there are platforms which has such performance 
>> > difference,
>> > may be could add rte_dma_wmb() and rte_dma_rmb() in future like Linux 
>> > kernel
>> > dma_wmb() and dma_rmb().(But i couldn't  see all the driver are using it,
>> > though)
>> >
>>
>> But there is no io_*mb() in the kernel, so you want to be different?
>
> It is their for arm,arm64,arc architectures in Linux kernel. Please check 
> writel
> implementation for arm64
>
> http://lxr.free-electrons.com/source/arch/arm64/include/asm/io.h#L143
>

Yes, I know. But I'm afraid it will be mixed up with dma_*mb by someone else.


[dpdk-dev] [PATCH v5 0/8] accelerate examples/l3fwd with NEON on ARM64 platform

2017-07-04 Thread Jianbo Liu
v5:
  - rebase to master
  Please apply after "move gcc version definition to common header"
 http://www.dpdk.org/ml/archives/dev/2017-July/070031.html

v4:
  - add vcopyq_laneq_u32 for older version of gcc

v3:
  - remove unnecessary prefetch for rte_mbuf
  - fix typo in git log
  - Ashwin's suggestions for performance on ThunderX

v2:
  - change name of l3fwd_em_sse.h to l3fwd_em_sequential.h
  - add the times of hash multi-lookup for different Archs
  - performance tuning on ThunderX: prefetching, set NO_HASH_MULTI_LOOKUP ...

Jianbo Liu (8):
  examples/l3fwd: extract arch independent code from multi hash lookup
  examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  examples/l3fwd: extract common code from multi packet send
  examples/l3fwd: rearrange the code for lpm_l3fwd
  arch/arm: add vcopyq_laneq_u32 for old version of gcc
  examples/l3fwd: add neon support for l3fwd
  examples/l3fwd: add the times of hash multi-lookup for different Archs
  examples/l3fwd: change the guard macro name for header file

 examples/l3fwd/l3fwd_common.h  | 293 +
 examples/l3fwd/l3fwd_em.c  |   8 +-
 examples/l3fwd/l3fwd_em_hlm.h  | 218 +++
 examples/l3fwd/l3fwd_em_hlm_neon.h |  74 ++
 examples/l3fwd/l3fwd_em_hlm_sse.h  | 276 +--
 .../{l3fwd_em_sse.h => l3fwd_em_sequential.h}  |  24 +-
 examples/l3fwd/l3fwd_lpm.c |  87 +-
 examples/l3fwd/l3fwd_lpm.h |  26 +-
 examples/l3fwd/l3fwd_lpm_neon.h| 193 ++
 examples/l3fwd/l3fwd_lpm_sse.h |  66 -
 examples/l3fwd/l3fwd_neon.h| 259 ++
 examples/l3fwd/l3fwd_sse.h | 261 +-
 lib/librte_eal/common/include/arch/arm/rte_vect.h  |   9 +
 13 files changed, 1165 insertions(+), 629 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (88%)
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

-- 
1.8.3.1



[dpdk-dev] [PATCH v5 1/8] examples/l3fwd: extract arch independent code from multi hash lookup

2017-07-04 Thread Jianbo Liu
Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h | 302 ++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 276 +-
 3 files changed, 308 insertions(+), 272 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 000..9fb5ff6
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static __rte_always_inline void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+   uint8_t portid, uint16_t dst_port[8])
+{
+   int32_t ret[8];
+   union ipv4_5tuple_host key[8];
+
+   get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+   get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+   get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+   get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+   get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+   get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+   get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+   get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+   const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+   &key[4], &key[5], &key[6], &key[7]};
+
+   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+   dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[0]]);
+   dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[1]]);
+   dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[2]]);
+   dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[3]]);
+   dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[4]]);
+   dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[5]]);
+   dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[6]]);
+   dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[7]]);
+
+   if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+   (enabled_port_mask & 1 << dst_port[0]) == 0)
+   dst_port[0] = portid;
+
+   if (dst_port[1] >= RTE_MAX_ETHPORTS ||
+

[dpdk-dev] [PATCH v5 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd

2017-07-04 Thread Jianbo Liu
Some common code can be used by other architectures, so move it to l3fwd_lpm.c

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_lpm.c | 83 ++
 examples/l3fwd/l3fwd_lpm.h | 26 +
 examples/l3fwd/l3fwd_lpm_sse.h | 66 -
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..ff8d10b 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+   uint32_t next_hop;
+   struct rte_lpm *ipv4_l3fwd_lookup_struct =
+   (struct rte_lpm *)lookup_struct;
+
+   return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+   rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+   &next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+   uint32_t next_hop;
+   struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+   (struct rte_lpm6 *)lookup_struct;
+
+   return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+   ((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+   &next_hop) == 0) ?  next_hop : portid);
+}
+
+static __rte_always_inline uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+   uint8_t portid)
+{
+   struct ipv6_hdr *ipv6_hdr;
+   struct ipv4_hdr *ipv4_hdr;
+   struct ether_hdr *eth_hdr;
+
+   if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+   return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+qconf->ipv4_lookup_struct);
+   } else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+   return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+qconf->ipv6_lookup_struct);
+   }
+
+   return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static __rte_always_inline uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf 
*pkt,
+   uint32_t dst_ipv4, uint8_t portid)
+{
+   uint32_t next_hop;
+   struct ipv6_hdr *ipv6_hdr;
+   struct ether_hdr *eth_hdr;
+
+   if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+   return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+  dst_ipv4, &next_hop) == 0)
+  ? next_hop : portid);
+
+   } else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+   return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+   ipv6_hdr->dst_addr, &next_hop) == 0)
+   ? next_hop : portid);
+
+   }
+
+   return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 4d77b58..55c3e83 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-   uint32_t next_hop;
-   struct rte_lpm *ipv4_l3fwd_lookup_struct =
-   (struct rte_lpm *)lookup_struct;
-
-   return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-   rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-   &next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-   uint32_t next_hop;
-   struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-   (struct rte_lpm6 *)lookup_struct;
-
-   return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-   ((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-   &next_hop) == 0) ?  next_hop : portid);
-}
-
 static __rte_always_inline void
 l3fwd_lpm_simple_forward(str

[dpdk-dev] [PATCH v5 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc

2017-07-04 Thread Jianbo Liu
Implement vcopyq_laneq_u32 if gcc version is lower than 7.

Signed-off-by: Jianbo Liu 
---
 lib/librte_eal/common/include/arch/arm/rte_vect.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h 
b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..d9fb4d0 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -78,6 +78,15 @@
 }
 #endif
 
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 7)
+static inline uint32x4_t
+vcopyq_laneq_u32(uint32x4_t a, const int lane_a,
+uint32x4_t b, const int lane_b)
+{
+   return vsetq_lane_u32(vgetq_lane_u32(b, lane_b), a, lane_a);
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.8.3.1



[dpdk-dev] [PATCH v5 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h

2017-07-04 Thread Jianbo Liu
The l3fwd_em_sse.h is enabled by NO_HASH_MULTI_LOOKUP.
Renaming it because it's only for sequential hash lookup,
and doesn't include any x86 SSE instructions.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c| 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..ba844b2 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_sequential.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h 
b/examples/l3fwd/l3fwd_em_sequential.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_sequential.h
-- 
1.8.3.1



[dpdk-dev] [PATCH v5 3/8] examples/l3fwd: extract common code from multi packet send

2017-07-04 Thread Jianbo Liu
Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_common.h | 293 ++
 examples/l3fwd/l3fwd_sse.h| 261 +
 2 files changed, 297 insertions(+), 257 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 000..2867365
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#define IPV4_MIN_VER_IHL 0x45
+#define IPV4_MAX_VER_IHL 0x4f
+#define IPV4_MAX_VER_IHL_DIFF   (IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define IPV4_MIN_LEN_BE (sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static __rte_always_inline void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+   uint8_t ihl;
+
+   if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+   ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+   ipv4_hdr->time_to_live--;
+   ipv4_hdr->hdr_checksum++;
+
+   if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+   ((uint8_t)ipv4_hdr->total_length == 0 &&
+   ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+   dp[0] = BAD_PORT;
+
+   }
+}
+
+#else
+#define rfc1812_process(mb, dp, ptype)  do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define GRPSZ   (1 << FWDSTEP)
+#define GRPMSK  (GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx) do { \
+   if (likely((dlp) == (dcp)[(idx)])) { \
+   (lp)[0]++;   \
+   } else { \
+   (dlp) = (dcp)[idx];  \
+   (lp) = (pn) + (idx); \
+   (lp)[0] = 1; \
+   } 

[dpdk-dev] [PATCH v5 6/8] examples/l3fwd: add neon support for l3fwd

2017-07-04 Thread Jianbo Liu
Use ARM NEON intrinsics to accelerate l3 forwarding.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c|   4 +-
 examples/l3fwd/l3fwd_em_hlm.h|  17 ++-
 examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++
 examples/l3fwd/l3fwd_em_sequential.h |  18 ++-
 examples/l3fwd/l3fwd_lpm.c   |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h  | 193 ++
 examples/l3fwd/l3fwd_neon.h  | 259 +++
 7 files changed, 563 insertions(+), 6 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index ba844b2..da96cfd 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sequential.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
if (nb_rx == 0)
continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
l3fwd_em_send_packets(nb_rx, pkts_burst,
portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 9fb5ff6..aa3e561 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static __rte_always_inline void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
@@ -238,7 +243,7 @@
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
uint8_t portid, struct lcore_conf *qconf)
 {
-   int32_t j;
+   int32_t i, j, pos;
uint16_t dst_port[MAX_PKT_BURST];
 
/*
@@ -247,6 +252,11 @@
 */
int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
 
+   for (j = 0; j < 8 && j < nb_rx; j++) {
+   rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+  struct ether_hdr *) + 1);
+   }
+
for (j = 0; j < n; j += 8) {
 
uint32_t pkt_type =
@@ -263,6 +273,11 @@
uint32_t tcp_or_udp = pkt_type &
(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
+   for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+   rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+  struct ether_hdr *) + 1);
+   }
+
if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h 
b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INT

[dpdk-dev] [PATCH v5 8/8] examples/l3fwd: change the guard macro name for header file

2017-07-04 Thread Jianbo Liu
As l3fwd_em_sse.h is renamed to l3fwd_em_sequential.h, change the macro
to __L3FWD_EM_SEQUENTIAL_H__ to maintain consistency.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em_sequential.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_sequential.h 
b/examples/l3fwd/l3fwd_em_sequential.h
index 4baccf1..6b34733 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -31,8 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __L3FWD_EM_SSE_H__
-#define __L3FWD_EM_SSE_H__
+#ifndef __L3FWD_EM_SEQUENTIAL_H__
+#define __L3FWD_EM_SEQUENTIAL_H__
 
 /**
  * @file
@@ -123,4 +123,4 @@
 
send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
-#endif /* __L3FWD_EM_SSE_H__ */
+#endif /* __L3FWD_EM_SEQUENTIAL_H__ */
-- 
1.8.3.1



[dpdk-dev] [PATCH v5 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs

2017-07-04 Thread Jianbo Liu
Add a new macro that defines how many hash lookups are done at one time
for each architecture; this makes the code more concise.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em_hlm.h | 241 +-
 1 file changed, 71 insertions(+), 170 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index aa3e561..707c7fc 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -43,148 +43,65 @@
 #include "l3fwd_em_hlm_neon.h"
 #endif
 
+#ifdef RTE_ARCH_ARM64
+#define EM_HASH_LOOKUP_COUNT 16
+#else
+#define EM_HASH_LOOKUP_COUNT 8
+#endif
+
+
 static __rte_always_inline void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-   uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv4xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+   uint8_t portid, uint16_t dst_port[])
 {
-   int32_t ret[8];
-   union ipv4_5tuple_host key[8];
-
-   get_ipv4_5tuple(m[0], mask0.x, &key[0]);
-   get_ipv4_5tuple(m[1], mask0.x, &key[1]);
-   get_ipv4_5tuple(m[2], mask0.x, &key[2]);
-   get_ipv4_5tuple(m[3], mask0.x, &key[3]);
-   get_ipv4_5tuple(m[4], mask0.x, &key[4]);
-   get_ipv4_5tuple(m[5], mask0.x, &key[5]);
-   get_ipv4_5tuple(m[6], mask0.x, &key[6]);
-   get_ipv4_5tuple(m[7], mask0.x, &key[7]);
-
-   const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-   &key[4], &key[5], &key[6], &key[7]};
-
-   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-   dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[0]]);
-   dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[1]]);
-   dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[2]]);
-   dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[3]]);
-   dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[4]]);
-   dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[5]]);
-   dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[6]]);
-   dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[7]]);
-
-   if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[0]) == 0)
-   dst_port[0] = portid;
-
-   if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[1]) == 0)
-   dst_port[1] = portid;
-
-   if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[2]) == 0)
-   dst_port[2] = portid;
-
-   if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[3]) == 0)
-   dst_port[3] = portid;
-
-   if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[4]) == 0)
-   dst_port[4] = portid;
-
-   if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[5]) == 0)
-   dst_port[5] = portid;
-
-   if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[6]) == 0)
-   dst_port[6] = portid;
-
-   if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[7]) == 0)
-   dst_port[7] = portid;
+   int i;
+   int32_t ret[EM_HASH_LOOKUP_COUNT];
+   union ipv4_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+   const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+   for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+   get_ipv4_5tuple(m[i], mask0.x, &key[i]);
+   key_array[i] = &key[i];
+   }
+
+   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0],
+EM_HASH_LOOKUP_COUNT, ret);
+
+   for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+   dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[i]]);
 
+   if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+   (enabled_port_mask & 1 << dst_port[i]) == 0)
+   dst_port[i] = portid;
+   }
 }
 
 static __rte_always_inline void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-   uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv6xN(struct lcore_conf *qconf, s

Re: [dpdk-dev] [PATCH] examples/ip_pipeline: use crc32 in hash functions for arm64

2017-07-04 Thread Jianbo Liu
On 4 July 2017 at 21:55, De Lara Guarch, Pablo
 wrote:
>
>
>> -Original Message-
>> From: Thomas Monjalon [mailto:tho...@monjalon.net]
>> Sent: Tuesday, July 4, 2017 12:26 AM
>> To: Dumitrescu, Cristian ; De Lara Guarch,
>> Pablo 
>> Cc: dev@dpdk.org; Jianbo Liu ;
>> jerin.ja...@caviumnetworks.com; ashwin.sek...@caviumnetworks.com
>> Subject: Re: [dpdk-dev] [PATCH] examples/ip_pipeline: use crc32 in hash
>> functions for arm64
>>
>> 04/07/2017 01:19, Dumitrescu, Cristian:
>> > From: Thomas Monjalon [mailto:tho...@monjalon.net]
>> > > 18/05/2017 11:09, Jianbo Liu:
>> > > > Implement the same hash functions with crc32 on arm platform.
>> > > >
>> > > > Signed-off-by: Jianbo Liu 
>> > > > ---
>> > > >  examples/ip_pipeline/pipeline/hash_func.h   |   2 +
>> > > >  examples/ip_pipeline/pipeline/hash_func_arm64.h | 245
>> > > 
>> > > >  2 files changed, 247 insertions(+)  create mode 100644
>> > > > examples/ip_pipeline/pipeline/hash_func_arm64.h
>> > >
>> > > I don't understand why this code is in an example.
>> > > We have some CRC code in librte_hash, librte_net and ip_pipeline.
>> > > Cristian, Jianbo,
>> > > does it make sense to move these functions somewhere else?
>> > >
>> >
>> > I think example apps are a great way to propose new hash functions.
>> > IMO we should encourage the definition/exploration of new hash
>> functions in our example apps.
>> >
>> > These functions are examples of how fast hash functions can be built
>> using special CPU instructions.
>> > They have much better performance than e.g. jhash, but their
>> > properties are largely unknown, as no rigorous study on their
>> > properties (such as uniform distribution) has been conducted. I have
>> > seen them providing good performance  for the data set I have been
>> using, but I have no extensive data to support their maturity level.
>> >
>> > If somebody is willing to invest the effort in proving them, I would
>> > be more than happy to see them moved to a library like librte_hash.
>> > Pablo as maintainer has the choice (I think it is not the first time
>> > we discuss bout these hash funcs :) )
>> >
>> > As mentioned in one of our deprecation notices, I am actively working
>> > (not ready for 17.8 unfortunately) to add a key mask parameter to these
>> functions, so more work on these hash functions is likely to take place.
>>
>> OK thanks for the explanation.
>> I still think we do not need to prove hash for integrating them.
>> I would be interested to read Pablo's opinion.
>
> If these functions are used as hash functions, I would place them in rte_hash.
>
> The case where we placed the CRC function in librte_net was because that
> was not used as a hash function, so it made sense to me placing it there,
> but in this case, it looks like it is, so I think rte_hash is a valid place
> (although someone would need to integrate it with the existing CRC hash 
> function in that library).
>

I think Cristian's explanation justifies using the special hash functions here.
They may also have better performance than the standard functions in the library.

Thanks!
Jianbo


Re: [dpdk-dev] [PATCH] eal/armv7: emulate vaddvq u16 variant

2017-07-09 Thread Jianbo Liu
On 8 July 2017 at 00:26, Jerin Jacob  wrote:
> vaddvq_u16() is not available for armv7.
> Emulate the vaddvq_u16() using armv7 NEON intrinsics.
>
> Signed-off-by: Jerin Jacob 
> ---
>  lib/librte_eal/common/include/arch/arm/rte_vect.h | 11 +++
>  1 file changed, 11 insertions(+)
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h 
> b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> index 0670ca2ee..69fd428f3 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> @@ -77,6 +77,17 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
>
> return vld1q_u8(rte_ret.u8);
>  }
> +
> +static inline uint16_t
> +vaddvq_u16(uint16x8_t a)
> +{
> +   uint32x4_t m = vpaddlq_u16(a);
> +   uint64x2_t n = vpaddlq_u32(m);
> +   uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
> +
> +   return vget_lane_u32((uint32x2_t)o, 0);
> +}
> +
>  #endif
>
>  #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 7)
> --
> 2.13.2
>

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH] eal/armv7: emulate vaddvq u16 variant

2017-07-09 Thread Jianbo Liu
On 9 July 2017 at 01:08, Thomas Monjalon  wrote:
> 07/07/2017 18:26, Jerin Jacob:
>> vaddvq_u16() is not available for armv7.
>> Emulate the vaddvq_u16() using armv7 NEON intrinsics.
>
> After implementing this function, another missing function appears:
>
> lib/librte_sched/rte_sched.c:1747:7: error:
> implicit declaration of function ‘vminvq_u32’

But sched_vector is disabled in defconfig_arm-armv7a-linuxapp-gcc:
CONFIG_RTE_SCHED_VECTOR=n


Re: [dpdk-dev] [PATCH] test: Fix memory corruption issues which fails the link_bonding test.

2017-07-10 Thread Jianbo Liu
_SLAVES; i++) {
> -   for (j = 0; j < MAX_PKT_BURST; j++) {
> -   if (pkt_burst[i][j] != NULL) {
> -   rte_pktmbuf_free(pkt_burst[i][j]);
> -   pkt_burst[i][j] = NULL;
> -   }
> -   }
> -   }
> -
> /* Clean up and remove slaves from bonded device */
> return remove_slaves_and_stop_bonded_device();
>  }
> @@ -4527,18 +4491,6 @@ struct rte_fdir_conf fdir_conf = {
> "(%d) port_stats.ipackets not as expected\n",
> test_params->bonded_port_id);
>
> -   /* free mbufs */
> -
> -   for (i = 0; i < 
> TEST_ADAPTIVE_TRANSMIT_LOAD_BALANCING_RX_BURST_SLAVE_COUNT; i++) {
> -   for (j = 0; j < MAX_PKT_BURST; j++) {
> -   if (pkt_burst[i][j] != NULL) {
> -   rte_pktmbuf_free(pkt_burst[i][j]);
> -   pkt_burst[i][j] = NULL;
> -   }
> -   }
> -   }
> -
> -
> /* Clean up and remove slaves from bonded device */
> return remove_slaves_and_stop_bonded_device();
>  }
> --
> 1.8.3.1
>

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH] ARMv8: Use built-in data types for unsupported poly64/128 types for GCC version lower than 4.9.0.

2017-07-12 Thread Jianbo Liu
On 12 July 2017 at 18:50, Herbert Guan  wrote:
> Fixes: 3c4b4024c2 (arch/arm: add vcopyq_laneq_u32 for old gcc)
>
> Signed-off-by: Herbert Guan 
> ---
>  lib/librte_eal/common/include/arch/arm/rte_vect.h | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h 
> b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> index 7fec25e..782350d 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> @@ -101,6 +101,13 @@
>
>  #if defined(RTE_ARCH_ARM64)
>  #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 7)
> +
> +#if (GCC_VERSION < 40900)
> +typedef uint64_t poly64_t;
> +typedef uint64x2_t poly64x2_t;
> +typedef uint8_t poly128_t __attribute__((vector_size(16), aligned(16)));
> +#endif
> +
>  /* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 
> */
>  static inline uint64x2_t
>  vreinterpretq_u64_p128(poly128_t x)
> --
> 1.8.3.1
>

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH] test: fix the parameter issue of test case

2017-07-13 Thread Jianbo Liu
On 13 July 2017 at 16:00, Herbert Guan  wrote:
> When test case "test_balance_l23_tx_burst_ipv4_toggle_ip_addr" is
> calling balance_l23_tx_burst(), the ip_addr instead of mac_addr
> should be toggled according to the test name.
>
> Signed-off-by: Herbert Guan 
> ---
>  test/test/test_link_bonding.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/test/test/test_link_bonding.c b/test/test/test_link_bonding.c
> index aa2a1a2..12dd91d 100644
> --- a/test/test/test_link_bonding.c
> +++ b/test/test/test_link_bonding.c
> @@ -2789,7 +2789,7 @@ struct rte_fdir_conf fdir_conf = {
>  static int
>  test_balance_l23_tx_burst_ipv4_toggle_ip_addr(void)
>  {
> -   return balance_l23_tx_burst(0, 1, 1, 0);
> +   return balance_l23_tx_burst(0, 1, 0, 1);
>  }
>
>  static int
> --
> 1.8.3.1
>

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH v1] mempool/dpaa2: add DPAA2 hardware offloaded mempool

2017-03-21 Thread Jianbo Liu
Hi Hemant,

The 03/17/2017 18:17, Hemant Agrawal wrote:
> DPAA2 Hardware Mempool handlers allow enqueue/dequeue from NXP's
> QBMAN hardware block.
> CONFIG_RTE_MBUF_DEFAULT_MEMPOOL_OPS is set to 'dpaa2', if the pool
> is enabled.
>
> This memory pool currently supports packet mbuf type blocks only.

Do you plan to support multi-process for this hardware mempool?

>
> Signed-off-by: Hemant Agrawal 
> ---
>  MAINTAINERS|   1 +
>  config/common_base |   5 +
>  config/defconfig_arm64-dpaa2-linuxapp-gcc  |   8 +
>  drivers/Makefile   |   1 +
>  drivers/bus/Makefile   |   2 +
>  drivers/mempool/Makefile   |  40 +++
>  drivers/mempool/dpaa2/Makefile |  72 
>  drivers/mempool/dpaa2/dpaa2_hw_mempool.c   | 374 
> +
>  drivers/mempool/dpaa2/dpaa2_hw_mempool.h   |  91 +
>  .../mempool/dpaa2/rte_mempool_dpaa2_version.map|   8 +
>  10 files changed, 602 insertions(+)
>  create mode 100644 drivers/mempool/Makefile
>  create mode 100644 drivers/mempool/dpaa2/Makefile
>  create mode 100644 drivers/mempool/dpaa2/dpaa2_hw_mempool.c
>  create mode 100644 drivers/mempool/dpaa2/dpaa2_hw_mempool.h
>  create mode 100644 drivers/mempool/dpaa2/rte_mempool_dpaa2_version.map
>
IMPORTANT NOTICE: The contents of this email and any attachments are 
confidential and may also be privileged. If you are not the intended recipient, 
please notify the sender immediately and do not disclose the contents to any 
other person, use it for any purpose, or store or copy the information in any 
medium. Thank you.


Re: [dpdk-dev] [PATCH] net/i40e: fix incorrect packet index reference

2017-03-05 Thread Jianbo Liu
On 4 March 2017 at 13:00, Jerin Jacob  wrote:
> Fixes: ae0eb310f253 ("net/i40e: implement vector PMD for ARM")
>
> CC: sta...@dpdk.org
> Signed-off-by: Jerin Jacob 
> Signed-off-by: Sunil Kulkarni 
> ---
>  drivers/net/i40e/i40e_rxtx_vec_neon.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c 
> b/drivers/net/i40e/i40e_rxtx_vec_neon.c
> index 011c54e..d235daa 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
> @@ -205,7 +205,7 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf 
> **rx_pkts)
> for (i = 0; i < 4; i++) {
> tmp = vreinterpretq_u8_u64(vshrq_n_u64(descs[i], 30));
> ptype = vgetq_lane_u8(tmp, 8);
> -   rx_pkts[0]->packet_type = i40e_rxd_pkt_type_mapping(ptype);
> +   rx_pkts[i]->packet_type = i40e_rxd_pkt_type_mapping(ptype);
> }
>
>  }

Acked-by: Jianbo Liu 


[dpdk-dev] [PATCH v7 0/7] vhost: optimize mergeable Rx path

2016-10-18 Thread Jianbo Liu
On 14 October 2016 at 17:34, Yuanhan Liu  wrote:
> This is a new set of patches to optimize the mergeable Rx code path.
> No refactoring (rewrite) was made this time. It just applies some
> findings from Zhihong (kudos to him!) that could improve the mergeable
> Rx path on the old code.
..

> ---
> Yuanhan Liu (4):
>   vhost: simplify mergeable Rx vring reservation
>   vhost: use last avail idx for avail ring reservation
>   vhost: prefetch avail ring
>   vhost: retrieve avail head once
>
> Zhihong Wang (3):
>   vhost: remove useless volatile
>   vhost: optimize cache access
>   vhost: shadow used ring update
>
>  lib/librte_vhost/vhost.c  |  13 ++-
>  lib/librte_vhost/vhost.h  |   5 +-
>  lib/librte_vhost/vhost_user.c |  23 +++--
>  lib/librte_vhost/virtio_net.c | 193 
> +-
>  4 files changed, 149 insertions(+), 85 deletions(-)
>

Reviewed-by: Jianbo Liu 


[dpdk-dev] [PATCH v3 00/15] Introduce SoC device/driver framework for EAL

2016-09-18 Thread Jianbo Liu
On 9 September 2016 at 16:43, Shreyansh Jain  wrote:
> Introduction:
> =
>
> This patch set is direct derivative of Jan's original series [1],[2].
>
>  - As this deviates substantially from original series, if need be I can
>post it as a separate patch rather than v2. Please suggest.
>  - Also, there are comments on original v1 ([4]) which are _not_
>incorporated in this series as they refer to section no more in new
>version.
>  - This v3 version is based on the rte_driver/device patchset v9 [10].
>That series introduced device structures (rte_driver/rte_device)
>generalizing devices into PCI, VDEV, XXX. For the purpose of this
>patchset, XXX=>SOC.
>
> Aim:
> 
>
> As of now EAL is primarly focused on PCI initialization/probing.
>
>  rte_eal_init()
>   |- rte_eal_pci_init(): Find PCI devices from sysfs
>   |- ...
>   |- rte_eal_memzone_init()
>   |- ...
>   `- rte_eal_pci_probe(): Driver<=>Device initialization
>
> This patchset introduces SoC framework which would enable SoC drivers and
> drivers to be plugged into EAL, very similar to how PCI drivers/devices are
> done today.
>
> This is a stripped down version of PCI framework which allows the SoC PMDs
> to implement their own routines for detecting devices and linking devices to
> drivers.
>
> 1) Changes to EAL
>  rte_eal_init()
>   |- rte_eal_pci_init(): Find PCI devices from sysfs
>   |- rte_eal_soc_init(): Calls PMDs->scan_fn
>   |- ...
>   |- rte_eal_memzone_init()
>   |- ...
>   |- rte_eal_pci_probe(): Driver<=>Device initialization, PMD->devinit()
>   `- rte_eal_soc_probe(): Calls PMDs->match_fn and PMDs->devinit();
>
> 2) New device/driver structures:
>   - rte_soc_driver (inheriting rte_driver)
>   - rte_soc_device (inheriting rte_device)
>   - rte_eth_dev and eth_driver embedded rte_soc_device and rte_soc_driver,
> respectively.
>
> 3) The SoC PMDs need to:
>  - define rte_soc_driver with necessary scan and match callbacks
>  - Register themselves using DRIVER_REGISTER_SOC()
>  - Implement respective bus scanning in the scan callbacks to add necessary
>devices to SoC device list
>  - Implement necessary eth_dev_init/uninint for ethernet instances
>
> 4) Design considerations that are same as PCI:
>  - SoC initialization is being done through rte_eal_init(), just after PCI
>initialization is done.
>  - As in case of PCI, probe is done after rte_eal_pci_probe() to link the
>devices detected with the drivers registered.
>  - Device attach/detach functions are available and have been designed on
>the lines of PCI framework.
>  - PMDs register using DRIVER_REGISTER_SOC, very similar to
>DRIVER_REGISTER_PCI for PCI devices.
>  - Linked list of SoC driver and devices exists independent of the other
>driver/device list, but inheriting rte_driver/rte_driver, these are also
>part of a global list.
>
> 5) Design considerations that are different from PCI:
>  - Each driver implements its own scan and match function. PCI uses the BDF
>format to read the device from sysfs, but this _may_not_ be a case for a
>SoC ethernet device.
>= This is an important change from initial proposal by Jan in [2]. Unlike
>his attempt to use /sys/bus/platform, this patch relies on the PMD to

There could be a lot of redundant code if each PMD driver has a scan
function of its own.
I think Jan's implementation is common to many platform drivers.

>detect the devices. This is because SoC may require specific or
>additional info for device detection. Further, SoC may have embedded

Can you give us a more precise definition of an SoC driver? Does it
include the drivers in ARM servers?

>devices/MACs which require initialization which cannot be covered through
>sysfs parsing.

I think it can be done in devinit, not in the scan function. devinit can
be different for each driver.

>= PCI based PMDs rely on EAL's capability to detect devices. This
>proposal puts the onus on PMD to detect devices, add to soc_device_list
>and wait for Probe. Matching, of device<=>driver is again PMD's callback.
>


[dpdk-dev] [PATCH v3 00/15] Introduce SoC device/driver framework for EAL

2016-09-18 Thread Jianbo Liu
On 18 September 2016 at 15:22, Jan Viktorin  wrote:
> On Sun, 18 Sep 2016 13:58:50 +0800
> Jianbo Liu  wrote:
>
>> On 9 September 2016 at 16:43, Shreyansh Jain  
>> wrote:
>> > Introduction:
>> > =
>> >
>> > This patch set is direct derivative of Jan's original series [1],[2].
>> >
>> >  - As this deviates substantially from original series, if need be I can
>> >post it as a separate patch rather than v2. Please suggest.
>> >  - Also, there are comments on original v1 ([4]) which are _not_
>> >incorporated in this series as they refer to section no more in new
>> >version.
>> >  - This v3 version is based on the rte_driver/device patchset v9 [10].
>> >That series introduced device structures (rte_driver/rte_device)
>> >generalizing devices into PCI, VDEV, XXX. For the purpose of this
>> >patchset, XXX=>SOC.
>
> [...]
>
>> >
>> > 5) Design considerations that are different from PCI:
>> >  - Each driver implements its own scan and match function. PCI uses the BDF
>> >format to read the device from sysfs, but this _may_not_ be a case for a
>> >SoC ethernet device.
>> >= This is an important change from initial proposal by Jan in [2]. 
>> > Unlike
>> >his attempt to use /sys/bus/platform, this patch relies on the PMD to
>>
>> It could be many redundant code if Each PMD driver has the scan
>> function if its own.
>> I think Jan's implementation is common to many platform drivers.
>
> I personally can find a use case for having a custom scan function.
> However, we should at least provide a default implementation. Probably,
> both the scan and match functions should be used to _override_ a default
> behaviour. So, only drivers that require to scan devices in a specific
> way would provide a custom function for this.
>
And for each platform/product

> I agree, that this can sometimes lead to code duplication. Moreover, it
> opens door for a very non-standard, unsecure and wrong-by-design
> approaches. I'd like more to provide one or more scan implementations
> in EAL and do not put this responsibility on PMDs.
>
>>
>> >detect the devices. This is because SoC may require specific or
>> >additional info for device detection. Further, SoC may have embedded
>
> Can you provide an example for "additional info for device detection"?
>
>>
>> Can you give us more precise definition about SoC driver? Does it
>> include the driver in ARM server?
>
> I am sorry but I don't understand this question.
>
> What you mean by a "driver in ARM server"? Do you mean a kernel driver?
>
> There is no "SoC driver" in the text so what definition are asking for?
>
This patchset introduces rte_soc_driver, which inherits from rte_driver.
I want to know which devices can use this SoC driver/device framework.
Is it for devices in ARM servers, or in embedded systems from
different vendors?
Also, this framework is too generalized: if we don't try to understand
"soc" in rte_soc_driver, we could even use it for PCI devices. :)

Thanks!
Jianbo


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-21 Thread Jianbo Liu
Hi Maxime,

On 22 August 2016 at 16:11, Maxime Coquelin  
wrote:
> Hi Zhihong,
>
> On 08/19/2016 07:43 AM, Zhihong Wang wrote:
>>
>> This patch set optimizes the vhost enqueue function.
>>
...

>
> My setup consists of one host running a guest.
> The guest generates as much 64bytes packets as possible using

Have you tested with other packet sizes?
My testing shows that performance drops when the packet size is more than 256.

> pktgen-dpdk. The hosts forwards received packets back to the guest
> using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
> physical CPUs.
>
> I tested it with and without your v1 patch, with and without
> rx-mergeable feature turned ON.
> Results are the average of 8 runs of 60 seconds:
>
> Rx-Mergeable ON : 7.72Mpps
> Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
> Rx-Mergeable OFF: 10.52Mpps
> Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
>
> Regards,
> Maxime


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-21 Thread Jianbo Liu
On 21 September 2016 at 17:27, Wang, Zhihong  wrote:
>
>
>> -Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu at linaro.org]
>> Sent: Wednesday, September 21, 2016 4:50 PM
>> To: Maxime Coquelin 
>> Cc: Wang, Zhihong ; dev at dpdk.org;
>> yuanhan.liu at linux.intel.com
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> Hi Maxime,
>>
>> On 22 August 2016 at 16:11, Maxime Coquelin
>>  wrote:
>> > Hi Zhihong,
>> >
>> > On 08/19/2016 07:43 AM, Zhihong Wang wrote:
>> >>
>> >> This patch set optimizes the vhost enqueue function.
>> >>
>> ...
>>
>> >
>> > My setup consists of one host running a guest.
>> > The guest generates as much 64bytes packets as possible using
>>
>> Have you tested with other different packet size?
>> My testing shows that performance is dropping when packet size is more
>> than 256.
>
>
> Hi Jianbo,
>
> Thanks for reporting this.
>
>  1. Are you running the vector frontend with mrg_rxbuf=off?
>
>  2. Could you please specify what CPU you're running? Is it Haswell
> or Ivy Bridge?
>
>  3. How many percentage of drop are you seeing?
>
> This is expected by me because I've already found the root cause and
> the way to optimize it, but since it missed the v0 deadline and
> requires changes in eal/memcpy, I postpone it to the next release.
>
> After the upcoming optimization the performance for packets larger
> than 256 will be improved, and the new code will be much faster than
> the current code.
>

Sorry, I tested on an ARM server, but I wonder if there is the same
issue for x86 platform.

>> > pktgen-dpdk. The hosts forwards received packets back to the guest
>> > using testpmd on vhost pmd interface. Guest's vCPUs are pinned to
>> > physical CPUs.
>> >
>> > I tested it with and without your v1 patch, with and without
>> > rx-mergeable feature turned ON.
>> > Results are the average of 8 runs of 60 seconds:
>> >
>> > Rx-Mergeable ON : 7.72Mpps
>> > Rx-Mergeable ON + "vhost: optimize enqueue" v1: 9.19Mpps
>> > Rx-Mergeable OFF: 10.52Mpps
>> > Rx-Mergeable OFF + "vhost: optimize enqueue" v1: 10.60Mpps
>> >


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-22 Thread Jianbo Liu
On 22 September 2016 at 10:29, Yuanhan Liu  
wrote:
> On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
>> >> > My setup consists of one host running a guest.
>> >> > The guest generates as much 64bytes packets as possible using
>> >>
>> >> Have you tested with other different packet size?
>> >> My testing shows that performance is dropping when packet size is more
>> >> than 256.
>> >
>> >
>> > Hi Jianbo,
>> >
>> > Thanks for reporting this.
>> >
>> >  1. Are you running the vector frontend with mrg_rxbuf=off?
>> >
Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.

>> >  2. Could you please specify what CPU you're running? Is it Haswell
>> > or Ivy Bridge?
>> >
It's an ARM server.

>> >  3. How many percentage of drop are you seeing?
The testing result:
size (bytes)    improvement (%)
64              3.92
128             11.51
256             24.16
512             -13.79
1024            -22.51
1500            -12.22
A correction is that performance is dropping if byte size is larger than 512.

>> >
>> > This is expected by me because I've already found the root cause and
>> > the way to optimize it, but since it missed the v0 deadline and
>> > requires changes in eal/memcpy, I postpone it to the next release.
>> >
>> > After the upcoming optimization the performance for packets larger
>> > than 256 will be improved, and the new code will be much faster than
>> > the current code.
>> >
>>
>> Sorry, I tested on an ARM server, but I wonder if there is the same
>> issue for x86 platform.
>
> Would you please provide more details? Say, answer the two left
> questions from Zhihong?
>
> Thanks.
>
> --yliu


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-22 Thread Jianbo Liu
On 22 September 2016 at 14:58, Wang, Zhihong  wrote:
>
>
>> -Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu at linaro.org]
>> Sent: Thursday, September 22, 2016 1:48 PM
>> To: Yuanhan Liu 
>> Cc: Wang, Zhihong ; Maxime Coquelin
>> ; dev at dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> On 22 September 2016 at 10:29, Yuanhan Liu 
>> wrote:
>> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
>> >> >> > My setup consists of one host running a guest.
>> >> >> > The guest generates as much 64bytes packets as possible using
>> >> >>
>> >> >> Have you tested with other different packet size?
>> >> >> My testing shows that performance is dropping when packet size is
>> more
>> >> >> than 256.
>> >> >
>> >> >
>> >> > Hi Jianbo,
>> >> >
>> >> > Thanks for reporting this.
>> >> >
>> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
>> >> >
>> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
>>
>> >> >  2. Could you please specify what CPU you're running? Is it Haswell
>> >> > or Ivy Bridge?
>> >> >
>> It's an ARM server.
>>
>> >> >  3. How many percentage of drop are you seeing?
>> The testing result:
>> size (bytes)    improvement (%)
>> 64              3.92
>> 128             11.51
>> 256             24.16
>> 512             -13.79
>> 1024            -22.51
>> 1500            -12.22
>> A correction is that performance is dropping if byte size is larger than 512.
>
>
> Jianbo,
>
> Could you please verify does this patch really cause enqueue perf to drop?
>
> You can test the enqueue path only by set guest to do rxonly, and compare
> the mpps by show port stats all in the guest.
>
>
Tested with testpmd, host: txonly, guest: rxonly
size (bytes)    improvement (%)
64              4.12
128             6
256             2.65
512             -1.12
1024            -7.02


[dpdk-dev] [PATCH v6 2/6] vhost: rewrite enqueue

2016-09-22 Thread Jianbo Liu
On 20 September 2016 at 10:00, Zhihong Wang  wrote:
> This patch implements the vhost logic from scratch into a single function
> to improve maintainability. This is the baseline version of the new code,
> more optimization will be added in the following patches in this patch set.
>
> In the existing code there're 2 callbacks for vhost enqueue:
>
>  *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.
>
>  *  virtio_dev_rx for mrg_rxbuf turned off cases.
>
> Having 2 callback paths increases maintenance effort. Also, the performance
> of the existing code is not optimal, especially when the mrg_rxbuf feature
> turned on.
>
> Signed-off-by: Zhihong Wang 
> ---
> Changes in v6:
.

> -/*
> - * Returns -1 on fail, 0 on success
> - */
> -static inline int
> -reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
> -   uint16_t *end, struct buf_vector *buf_vec)
> +uint16_t
> +rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> +   struct rte_mbuf **pkts, uint16_t count)
>  {
> -   uint16_t cur_idx;
> +   struct vhost_virtqueue *vq;
> +   struct virtio_net *dev;
> +   uint32_t is_mrg_rxbuf = 0;
> +   uint32_t pkt_idx  = 0;
> +   uint32_t pkt_left = count;

Is pkt_left really needed?

> uint16_t avail_idx;
> -   uint32_t allocated = 0;
> -   uint32_t vec_idx = 0;
> -   uint16_t tries = 0;



[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-22 Thread Jianbo Liu
On 22 September 2016 at 18:04, Wang, Zhihong  wrote:
>
>
>> -Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu at linaro.org]
>> Sent: Thursday, September 22, 2016 5:02 PM
>> To: Wang, Zhihong 
>> Cc: Yuanhan Liu ; Maxime Coquelin
>> ; dev at dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> On 22 September 2016 at 14:58, Wang, Zhihong 
>> wrote:
>> >
>> >
>> >> -Original Message-
>> >> From: Jianbo Liu [mailto:jianbo.liu at linaro.org]
>> >> Sent: Thursday, September 22, 2016 1:48 PM
>> >> To: Yuanhan Liu 
>> >> Cc: Wang, Zhihong ; Maxime Coquelin
>> >> ; dev at dpdk.org
>> >> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>> >>
>> >> On 22 September 2016 at 10:29, Yuanhan Liu
>> 
>> >> wrote:
>> >> > On Wed, Sep 21, 2016 at 08:54:11PM +0800, Jianbo Liu wrote:
>> >> >> >> > My setup consists of one host running a guest.
>> >> >> >> > The guest generates as much 64bytes packets as possible using
>> >> >> >>
>> >> >> >> Have you tested with other different packet size?
>> >> >> >> My testing shows that performance is dropping when packet size is
>> >> more
>> >> >> >> than 256.
>> >> >> >
>> >> >> >
>> >> >> > Hi Jianbo,
>> >> >> >
>> >> >> > Thanks for reporting this.
>> >> >> >
>> >> >> >  1. Are you running the vector frontend with mrg_rxbuf=off?
>> >> >> >
>> >> Yes, my testing is mrg_rxbuf=off, but not vector frontend PMD.
>> >>
>> >> >> >  2. Could you please specify what CPU you're running? Is it Haswell
>> >> >> > or Ivy Bridge?
>> >> >> >
>> >> It's an ARM server.
>> >>
>> >> >> >  3. How many percentage of drop are you seeing?
>> >> The testing result:
>> >> size (bytes) improvement (%)
>> >> 64   3.92
>> >> 128 11.51
>> >> 256  24.16
>> >> 512  -13.79
>> >> 1024 -22.51
>> >> 1500 -12.22
>> >> A correction is that performance is dropping if byte size is larger than 
>> >> 512.
>> >
>> >
>> > Jianbo,
>> >
>> > Could you please verify does this patch really cause enqueue perf to drop?
>> >
>> > You can test the enqueue path only by set guest to do rxonly, and compare
>> > the mpps by show port stats all in the guest.
>> >
>> >
>> Tested with testpmd, host: txonly, guest: rxonly
>> size (bytes) improvement (%)
>> 64    4.12
>> 128   6
>> 256   2.65
>> 512   -1.12
>> 1024 -7.02
>
>
>
> I think your number is little bit hard to understand for me, this patch's
> optimization contains 2 parts:
>
>  1. ring operation: works for both mrg_rxbuf on and off
>
>  2. remote write ordering: works for mrg_rxbuf=on only
>
> So, for mrg_rxbuf=off, if this patch is good for 64B packets, then it
> shouldn't do anything bad for larger packets.
>
> This is the gain on x86 platform: host iofwd between nic and vhost,
> guest rxonly.
>
> nic2vm  enhancement
> 64  21.83%
> 128 16.97%
> 256 6.34%
> 512 0.01%
> 1024    0.00%
>
I bootup a VM with 2 virtual port, and stress the traffic between them.
First, I stressed with pktgen-dpdk in VM, and did iofwd in host.
Then, as you told, I did rxonly in VM, and txonly in host.

> I suspect there's some complication in ARM's micro-arch.
>
> Could you try v6 and apply all patches except the last one:
> [PATCH v6 6/6] vhost: optimize cache access
>
> And see if there's still perf drop?
>
The last patch can improve the performance. The drop is actually
caused by the second patch.

Jianbo


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-23 Thread Jianbo Liu
On 23 September 2016 at 10:56, Wang, Zhihong  wrote:
.
> This is expected because the 2nd patch is just a baseline and all optimization
> patches are organized in the rest of this patch set.
>
> I think you can do bottleneck analysis on ARM to see what's slowing down the
> perf, there might be some micro-arch complications there, mostly likely in
> memcpy.
>
> Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
>
> Could you publish the mrg_rxbuf=on data also? Since it's more widely used
> in terms of spec integrity.
>
I don't think it will be helpful for you, considering the differences
between x86 and arm.
So please move on with this patchset...

Thanks!
Jianbo


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-26 Thread Jianbo Liu
Hi Thomas,

On 23 September 2016 at 21:41, Thomas Monjalon
 wrote:
> 2016-09-23 18:41, Jianbo Liu:
>> On 23 September 2016 at 10:56, Wang, Zhihong  
>> wrote:
>> .
>> > This is expected because the 2nd patch is just a baseline and all 
>> > optimization
>> > patches are organized in the rest of this patch set.
>> >
>> > I think you can do bottleneck analysis on ARM to see what's slowing down 
>> > the
>> > perf, there might be some micro-arch complications there, mostly likely in
>> > memcpy.
>> >
>> > Do you use glibc's memcpy? I suggest to hand-crafted it on your own.
>> >
>> > Could you publish the mrg_rxbuf=on data also? Since it's more widely used
>> > in terms of spec integrity.
>> >
>> I don't think it will be helpful for you, considering the differences
>> between x86 and arm.
>> So please move on with this patchset...
>
> Jianbo,
> I don't understand.
> You said that the 2nd patch is a regression:
> -   volatile uint16_t   last_used_idx;
> +   uint16_t last_used_idx;
>
No, I meant "vhost: rewrite enqueue".

> And the overall series leads to a performance regression
> for packets > 512 B, right?
> But we don't know whether you have tested the v6 or not.
Yes, I tested v6, and found performance regression for size >=512B.

>
> Zhihong talked about some improvements possible in rte_memcpy.
> ARM64 is using libc memcpy in rte_memcpy.
>
> Now you seem to give up.
> Does it mean you accept having a regression in 16.11 release?
> Are you working on rte_memcpy?
This patchset actually improves performance according to Zhihong's
result on x86 platform. And I also get improvement at least with
small-size packets on ARM.
I don't want to give up, but I need more time to find out the reason
for the regression. I think rte_memcpy definitely is one of the ways
to improve performance, but could it be the reason?


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-26 Thread Jianbo Liu
On 25 September 2016 at 13:41, Wang, Zhihong  wrote:
>
>
>> -Original Message-
>> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
>> Sent: Friday, September 23, 2016 9:41 PM
>> To: Jianbo Liu 
>> Cc: dev at dpdk.org; Wang, Zhihong ; Yuanhan Liu
>> ; Maxime Coquelin
>> 

> This patch does help in ARM for small packets like 64B sized ones,
> this actually proves the similarity between x86 and ARM in terms
> of caching optimization in this patch.
>
> My estimation is based on:
>
>  1. The last patch are for mrg_rxbuf=on, and since you said it helps
> perf, we can ignore it for now when we discuss mrg_rxbuf=off
>
>  2. Vhost enqueue perf =
> Ring overhead + Virtio header overhead + Data memcpy overhead
>
>  3. This patch helps small packets traffic, which means it helps
> ring + virtio header operations
>
>  4. So, when you say perf drop when packet size larger than 512B,
> this is most likely caused by memcpy in ARM not working well
> with this patch
>
> I'm not saying glibc's memcpy is not good enough, it's just that
> this is a rather special use case. And since we see specialized
> memcpy + this patch give better performance than other combinations
> significantly on x86, we suggest to hand-craft a specialized memcpy
> for it.
>
> Of course on ARM this is still just my speculation, and we need to
> either prove it or find the actual root cause.
>
> It can be **REALLY HELPFUL** if you could help to test this patch on
> ARM for mrg_rxbuf=on cases to see if this patch is in fact helpful
> to ARM at all, since mrg_rxbuf=on the more widely used cases.
>
Actually it's worse than mrg_rxbuf=off.


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-26 Thread Jianbo Liu
On 26 September 2016 at 13:25, Wang, Zhihong  wrote:
>
>
>> -Original Message-----
>> From: Jianbo Liu [mailto:jianbo.liu at linaro.org]
>> Sent: Monday, September 26, 2016 1:13 PM
>> To: Wang, Zhihong 
>> Cc: Thomas Monjalon ; dev at dpdk.org; Yuanhan
>> Liu ; Maxime Coquelin
>> 
>> Subject: Re: [dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue
>>
>> On 25 September 2016 at 13:41, Wang, Zhihong 
>> wrote:
>> >
>> >
>> >> -Original Message-
>> >> From: Thomas Monjalon [mailto:thomas.monjalon at 6wind.com]
>> >> Sent: Friday, September 23, 2016 9:41 PM
>> >> To: Jianbo Liu 
>> >> Cc: dev at dpdk.org; Wang, Zhihong ; Yuanhan 
>> >> Liu
>> >> ; Maxime Coquelin
>> >> 
>> 
>> > This patch does help in ARM for small packets like 64B sized ones,
>> > this actually proves the similarity between x86 and ARM in terms
>> > of caching optimization in this patch.
>> >
>> > My estimation is based on:
>> >
>> >  1. The last patch are for mrg_rxbuf=on, and since you said it helps
>> > perf, we can ignore it for now when we discuss mrg_rxbuf=off
>> >
>> >  2. Vhost enqueue perf =
>> > Ring overhead + Virtio header overhead + Data memcpy overhead
>> >
>> >  3. This patch helps small packets traffic, which means it helps
>> > ring + virtio header operations
>> >
>> >  4. So, when you say perf drop when packet size larger than 512B,
>> > this is most likely caused by memcpy in ARM not working well
>> > with this patch
>> >
>> > I'm not saying glibc's memcpy is not good enough, it's just that
>> > this is a rather special use case. And since we see specialized
>> > memcpy + this patch give better performance than other combinations
>> > significantly on x86, we suggest to hand-craft a specialized memcpy
>> > for it.
>> >
>> > Of course on ARM this is still just my speculation, and we need to
>> > either prove it or find the actual root cause.
>> >
>> > It can be **REALLY HELPFUL** if you could help to test this patch on
>> > ARM for mrg_rxbuf=on cases to see if this patch is in fact helpful
>> > to ARM at all, since mrg_rxbuf=on the more widely used cases.
>> >
>> Actually it's worse than mrg_rxbuf=off.
>
> I mean compare the perf of original vs. original + patch with
> mrg_rxbuf turned on. Is there any perf improvement?
>
Yes, orig + patch + on is better than orig + on, but orig + patch + on
is worse than orig + patch + off.


[dpdk-dev] [PATCH v3 0/5] vhost: optimize enqueue

2016-09-26 Thread Jianbo Liu
On 26 September 2016 at 13:37, Luke Gorrie  wrote:
> On 22 September 2016 at 11:01, Jianbo Liu  wrote:
>>
>> Tested with testpmd, host: txonly, guest: rxonly
>> size (bytes) improvement (%)
>> 64    4.12
>> 128   6
>> 256   2.65
>> 512   -1.12
>> 1024 -7.02
>
>
> Have you considered testing with more diverse workloads e.g. mixed packet
> sizes that are not always multiples of the cache line & register sizes?
>
No. Can testpmd stress performance with mixed packet sizes?


Re: [dpdk-dev] [PATCH v2 3/6] eal/arm64: rte pause implementation for arm64

2017-06-05 Thread Jianbo Liu
On 5 June 2017 at 16:58, Jerin Jacob  wrote:
> CC: Jianbo Liu 
> Signed-off-by: Jerin Jacob 
> ---
> v2:
> - Removed YIELD instruction comment, as it is implementation
> specific (Jianbo)
> ---
>  lib/librte_eal/common/include/arch/arm/rte_pause.h |  4 ++
>  .../common/include/arch/arm/rte_pause_64.h | 52 
> ++
>  2 files changed, 56 insertions(+)
>  create mode 100644 lib/librte_eal/common/include/arch/arm/rte_pause_64.h
>
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause.h 
> b/lib/librte_eal/common/include/arch/arm/rte_pause.h
> index 0fe88aba9..9b79405e6 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_pause.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_pause.h
> @@ -37,7 +37,11 @@
>  extern "C" {
>  #endif
>
> +#ifdef RTE_ARCH_64
> +#include <rte_pause_64.h>
> +#else
>  #include 
> +#endif
>
>  #ifdef __cplusplus
>  }
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h 
> b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> new file mode 100644
> index 0..4101553e2
> --- /dev/null
> +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> @@ -0,0 +1,52 @@
> +/*
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2017 Cavium. All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + *   notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + *   notice, this list of conditions and the following disclaimer in
> + *   the documentation and/or other materials provided with the
> + *   distribution.
> + * * Neither the name of Cavium nor the names of its
> + *   contributors may be used to endorse or promote products derived
> + *   from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _RTE_PAUSE_ARM64_H_
> +#define _RTE_PAUSE_ARM64_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include 
> +#include "generic/rte_pause.h"
> +
> +static inline void rte_pause(void)
> +{
> +   asm volatile("yield" ::: "memory");
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PAUSE_ARM64_H_ */
> --
> 2.13.0
>

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH v7 0/3] net/i40e: configurable PTYPE mapping

2017-04-04 Thread Jianbo Liu
On 5 April 2017 at 04:22, Thomas Monjalon  wrote:
> 2017-04-04 10:30, Ferruh Yigit:
>> On 4/4/2017 2:55 AM, Qi Zhang wrote:
>> > The patch set create new APIs that help to change the mapping from hardware
>> > defined packet type to software defined packet type for i40e NICs.
>> > Keep these APIs private is because currently they are only meaningful for
>> > devices that support dynamic PTYPE configuration, which may not be a 
>> > general
>> > device feature.
> [...]
>> > Qi Zhang (3):
>> >   net/i40e: enable per dev PTYPE mapping table
>> >   net/i40e: configurable PTYPE mapping
>> >   app/testpmd: add CL for ptype mapping configure
>>
>> Series applied to dpdk-next-net/master, thanks.
>
> It does not compile for ARM (not tested for POWER):
> drivers/net/i40e/i40e_rxtx_vec_neon.c: In function 
> '_recv_raw_pkts_vec':
> 229:2: error: unknown type name 'uint32'
> uint32 *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
>

It must be uint32_t.

> Given that
> - it changes NEON and Altivec code
> - it does not compile on ARM
> - there is no Ack from NEON or Altivec maintainers (they were not 
> Cc'ed)
> I really doubt it has been tested.
> That's why it won't be in RC1.
>
> If NEON and Altivec maintainers agree, we can give it a chance for RC2.
>

Other than the above error on ARM:
Acked-by: Jianbo Liu 

> PS: please use --in-reply-to to let us check the discussion history.


Re: [dpdk-dev] [PATCH 0/2] reduce writes in i40e driver

2017-04-05 Thread Jianbo Liu
On 3 April 2017 at 22:39, Bruce Richardson  wrote:
> this set is based upon Olivier's mbuf rework patchset, and makes some
> improvement to the i40e driver taking account of the rework. It also
> removes a build-time option that seems unnecessary.
>
> Bruce Richardson (2):
>   net/i40e: eliminate mbuf write on rearm
>   net/i40e: remove option to disable offload flags
>
>  config/common_base  |  1 -
>  doc/guides/nics/i40e.rst|  5 
>  drivers/net/i40e/i40e_rxtx_vec_common.h |  8 --
>  drivers/net/i40e/i40e_rxtx_vec_neon.c   | 11 
>  drivers/net/i40e/i40e_rxtx_vec_sse.c| 50 
> -
>  5 files changed, 24 insertions(+), 51 deletions(-)
>
> --
> 2.9.3
>


Re: [dpdk-dev] [PATCH 0/2] reduce writes in i40e driver

2017-04-05 Thread Jianbo Liu
On 3 April 2017 at 22:39, Bruce Richardson  wrote:
> this set is based upon Olivier's mbuf rework patchset, and makes some
> improvement to the i40e driver taking account of the rework. It also
> removes a build-time option that seems unnecessary.
>
> Bruce Richardson (2):
>   net/i40e: eliminate mbuf write on rearm
>   net/i40e: remove option to disable offload flags
>
>  config/common_base  |  1 -
>  doc/guides/nics/i40e.rst|  5 
>  drivers/net/i40e/i40e_rxtx_vec_common.h |  8 --
>  drivers/net/i40e/i40e_rxtx_vec_neon.c   | 11 
>  drivers/net/i40e/i40e_rxtx_vec_sse.c| 50 
> -
>  5 files changed, 24 insertions(+), 51 deletions(-)

Acked-by: Jianbo Liu 

And I'll send a patch to do the same change for i40e neon implementation.


[dpdk-dev] [PATCH] net/i40e: sync the changes for vector PMD between x86 and arm64

2017-04-06 Thread Jianbo Liu
Porting two changes from x86 SSE implementation.
net/i40e: fix checksum flag in x86 vector Rx
net/i40e: eliminate mbuf write on rearm

Signed-off-by: Jianbo Liu 
---
 drivers/net/i40e/i40e_rxtx_vec_neon.c | 68 +--
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c 
b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index 902fb1f..ca6b1f4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -57,7 +57,6 @@
uint64x2_t dma_addr0, dma_addr1;
uint64x2_t zero = vdupq_n_u64(0);
uint64_t paddr;
-   uint8x8_t p;
 
rxdp = rxq->rx_ring + rxq->rxrearm_start;
 
@@ -77,27 +76,17 @@
return;
}
 
-   p = vld1_u8((uint8_t *)&rxq->mbuf_initializer);
-
/* Initialize the mbufs in vector, process 2 mbufs in one loop */
for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
mb0 = rxep[0].mbuf;
mb1 = rxep[1].mbuf;
 
-/* Flush mbuf with pkt template.
-* Data to be rearmed is 6 bytes long.
-* Though, RX will overwrite ol_flags that are coming next
-* anyway. So overwrite whole 8 bytes with one load:
-* 6 bytes of rearm_data plus first 2 bytes of ol_flags.
-*/
-   vst1_u8((uint8_t *)&mb0->rearm_data, p);
paddr = mb0->buf_physaddr + RTE_PKTMBUF_HEADROOM;
dma_addr0 = vdupq_n_u64(paddr);
 
/* flush desc with pa dma_addr */
vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
 
-   vst1_u8((uint8_t *)&mb1->rearm_data, p);
paddr = mb1->buf_physaddr + RTE_PKTMBUF_HEADROOM;
dma_addr1 = vdupq_n_u64(paddr);
vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
@@ -117,9 +106,12 @@
 }
 
 static inline void
-desc_to_olflags_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts)
+desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4],
+ struct rte_mbuf **rx_pkts)
 {
uint32x4_t vlan0, vlan1, rss, l3_l4e;
+   const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
+   uint64x2_t rearm0, rearm1, rearm2, rearm3;
 
/* mask everything except RSS, flow director and VLAN flags
 * bit2 is for VLAN tag, bit11 for flow director indication
@@ -128,6 +120,20 @@
const uint32x4_t rss_vlan_msk = {
0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};
 
+   const uint32x4_t cksum_mask = {
+   PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+   PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+   PKT_RX_EIP_CKSUM_BAD,
+   PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+   PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+   PKT_RX_EIP_CKSUM_BAD,
+   PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+   PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+   PKT_RX_EIP_CKSUM_BAD,
+   PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+   PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+   PKT_RX_EIP_CKSUM_BAD};
+
/* map rss and vlan type to rss hash and vlan flag */
const uint8x16_t vlan_flags = {
0, 0, 0, 0,
@@ -142,14 +148,16 @@
0, 0, 0, 0};
 
const uint8x16_t l3_l4e_flags = {
-   0,
-   PKT_RX_IP_CKSUM_BAD,
-   PKT_RX_L4_CKSUM_BAD,
-   PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD,
-   PKT_RX_EIP_CKSUM_BAD,
-   PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD,
-   PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
-   PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | 
PKT_RX_IP_CKSUM_BAD,
+   (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+   PKT_RX_IP_CKSUM_BAD >> 1,
+   (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+   (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+   (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
+   (PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+   (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
+PKT_RX_L4_CKSUM_BAD) >> 1,
+   (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+PKT_RX_IP_CKSUM_BAD) >> 1,
0, 0, 0, 0, 0, 0, 0, 0};
 
vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
@@ -169,15 +177,23 @@
 

Re: [dpdk-dev] [PATCH] config: set cache line as 128B in the generic arm64 config

2017-04-27 Thread Jianbo Liu
On 27 April 2017 at 00:29, Jerin Jacob  wrote:
> armv8 implementations may have 64B or 128B cache line.
> Setting to the maximum available cache line size in generic config to
> address minimum DMA alignment across all arm64 implementations.
>
> Increasing the cacheline size has no negative impact to cache invalidation
> on systems with a smaller cache line.
>
> The need for the minimum DMA alignment has impact on functional aspects
> of the platform so default config should cater the functional aspects.
>
> There is an impact on memory usage with this scheme, but that's not too
> important for the single image arm64 distribution use case.
>
> The arm64 linux kernel followed the similar approach for single
> arm64 image use case.
> http://lxr.free-electrons.com/source/arch/arm64/include/asm/cache.h
>
> Signed-off-by: Jerin Jacob 
> ---
>  config/defconfig_arm64-armv8a-linuxapp-gcc | 5 +
>  config/defconfig_arm64-dpaa2-linuxapp-gcc  | 1 +
>  config/defconfig_arm64-xgene1-linuxapp-gcc | 1 +
>  3 files changed, 7 insertions(+)
>
> diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc 
> b/config/defconfig_arm64-armv8a-linuxapp-gcc
> index 65888cef1..0faa2d3a3 100644
> --- a/config/defconfig_arm64-armv8a-linuxapp-gcc
> +++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
> @@ -42,6 +42,11 @@ CONFIG_RTE_FORCE_INTRINSICS=y
>  CONFIG_RTE_TOOLCHAIN="gcc"
>  CONFIG_RTE_TOOLCHAIN_GCC=y
>
> +#Maximum available cache line size in arm64 implementations. Setting to 
> maximum
> +#available cache line size in generic config to address minimum DMA alignment
> +#across all arm64 implementations.
> +CONFIG_RTE_CACHE_LINE_SIZE=128
> +
>  CONFIG_RTE_EAL_IGB_UIO=n
>
>  CONFIG_RTE_LIBRTE_FM10K_PMD=n
> diff --git a/config/defconfig_arm64-dpaa2-linuxapp-gcc 
> b/config/defconfig_arm64-dpaa2-linuxapp-gcc
> index 010cb8092..314a0eceb 100644
> --- a/config/defconfig_arm64-dpaa2-linuxapp-gcc
> +++ b/config/defconfig_arm64-dpaa2-linuxapp-gcc
> @@ -41,6 +41,7 @@ CONFIG_RTE_ARCH_ARM_TUNE="cortex-a57+fp+simd"
>  #
>  CONFIG_RTE_MAX_LCORE=8
>  CONFIG_RTE_MAX_NUMA_NODES=1
> +CONFIG_RTE_CACHE_LINE_SIZE=64
>
>  CONFIG_RTE_PKTMBUF_HEADROOM=256
>
> diff --git a/config/defconfig_arm64-xgene1-linuxapp-gcc 
> b/config/defconfig_arm64-xgene1-linuxapp-gcc
> index f096166b7..d8e544728 100644
> --- a/config/defconfig_arm64-xgene1-linuxapp-gcc
> +++ b/config/defconfig_arm64-xgene1-linuxapp-gcc
> @@ -32,3 +32,4 @@
>  #include "defconfig_arm64-armv8a-linuxapp-gcc"
>
>  CONFIG_RTE_MACHINE="xgene1"
> +CONFIG_RTE_CACHE_LINE_SIZE=64
> --

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH v2] hash: add neon support for thash

2017-04-27 Thread Jianbo Liu
On 27 April 2017 at 20:33, Ashwin Sekhar T K
 wrote:
> Verified the changes with thash_autotest unit test case
>
> Signed-off-by: Ashwin Sekhar T K 
> ---
> v2:
> * Slightly modified the content of the commit message body
> * Added prefix [dpdk-dev] to the email subject line
>
>  lib/librte_hash/rte_thash.h | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/lib/librte_hash/rte_thash.h b/lib/librte_hash/rte_thash.h
> index a4886a8..60d58c6 100644
> --- a/lib/librte_hash/rte_thash.h
> +++ b/lib/librte_hash/rte_thash.h
> @@ -56,7 +56,7 @@ extern "C" {
>  #include 
>  #include 
>
> -#ifdef __SSE3__
> +#if defined(__SSE3__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  #include 
>  #endif
>
> @@ -176,6 +176,11 @@ rte_thash_load_v6_addrs(const struct ipv6_hdr *orig, 
> union rte_thash_tuple *targ
> ipv6 = _mm_loadu_si128((const __m128i *)orig->dst_addr);
> *(__m128i *)targ->v6.dst_addr =
> _mm_shuffle_epi8(ipv6, rte_thash_ipv6_bswap_mask);
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +   uint8x16_t ipv6 = vld1q_u8((uint8_t const *)orig->src_addr);
> +   vst1q_u8((uint8_t *)targ->v6.src_addr, vrev32q_u8(ipv6));
> +   ipv6 = vld1q_u8((uint8_t const *)orig->dst_addr);
> +   vst1q_u8((uint8_t *)targ->v6.dst_addr, vrev32q_u8(ipv6));
>  #else
> int i;
> for (i = 0; i < 4; i++) {
> --
> 2.7.4
>

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH] sched: enable neon optimizations

2017-04-27 Thread Jianbo Liu
On 27 April 2017 at 21:00, Ashwin Sekhar T K
 wrote:
> * Enabled CONFIG_RTE_SCHED_VECTOR for arm64
> * Verified the changes with sched_autotest unit test case
>
> Signed-off-by: Ashwin Sekhar T K 
> ---
>  config/defconfig_arm64-armv8a-linuxapp-gcc |  2 +-
>  lib/librte_sched/rte_sched.c   | 22 ++
>  2 files changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc 
> b/config/defconfig_arm64-armv8a-linuxapp-gcc
> index 65888ce..021044a 100644
> --- a/config/defconfig_arm64-armv8a-linuxapp-gcc
> +++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
> @@ -48,4 +48,4 @@ CONFIG_RTE_LIBRTE_FM10K_PMD=n
>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>  CONFIG_RTE_LIBRTE_AVP_PMD=n
>
> -CONFIG_RTE_SCHED_VECTOR=n
> +CONFIG_RTE_SCHED_VECTOR=y

It's enough to remove this line only, I don't think you must enable it
explicitly in the armv8a common config.

> diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
> index 614705d..4ba476a 100644
> --- a/lib/librte_sched/rte_sched.c
> +++ b/lib/librte_sched/rte_sched.c
> @@ -58,6 +58,8 @@
>
>  #if defined(__SSE4__)
>  #define SCHED_VECTOR_SSE4
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#define SCHED_VECTOR_NEON
>  #endif
>
>  #endif
> @@ -1732,6 +1734,26 @@ grinder_pipe_exists(struct rte_sched_port *port, 
> uint32_t base_pipe)
> return 1;
>  }
>
> +#elif defined(SCHED_VECTOR_NEON)
> +
> +static inline int
> +grinder_pipe_exists(struct rte_sched_port *port, uint32_t base_pipe)
> +{
> +   uint32x4_t index, pipes;
> +   uint32_t *pos = (uint32_t *)port->grinder_base_bmp_pos;
> +
> +   index = vmovq_n_u32(base_pipe);
> +   pipes = vld1q_u32(pos);
> +   if (!vminvq_u32(veorq_u32(pipes, index)))
> +   return 1;
> +
> +   pipes = vld1q_u32(pos + 4);
> +   if (!vminvq_u32(veorq_u32(pipes, index)))
> +   return 1;
> +
> +   return 0;
> +}
> +
>  #else
>
>  static inline int
> --
> 2.7.4
>


Re: [dpdk-dev] [PATCH] sched: enable neon optimizations

2017-04-27 Thread Jianbo Liu
On 28 April 2017 at 13:27, Sekhar, Ashwin  wrote:
> On Friday 28 April 2017 09:20 AM, Jianbo Liu wrote:
>> On 27 April 2017 at 21:00, Ashwin Sekhar T K
>>  wrote:
>>> * Enabled CONFIG_RTE_SCHED_VECTOR for arm64
>>> * Verified the changes with sched_autotest unit test case
>>>
>>> Signed-off-by: Ashwin Sekhar T K 
>>> ---
>>>  config/defconfig_arm64-armv8a-linuxapp-gcc |  2 +-
>>>  lib/librte_sched/rte_sched.c   | 22 ++
>>>  2 files changed, 23 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc 
>>> b/config/defconfig_arm64-armv8a-linuxapp-gcc
>>> index 65888ce..021044a 100644
>>> --- a/config/defconfig_arm64-armv8a-linuxapp-gcc
>>> +++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
>>> @@ -48,4 +48,4 @@ CONFIG_RTE_LIBRTE_FM10K_PMD=n
>>>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>>>  CONFIG_RTE_LIBRTE_AVP_PMD=n
>>>
>>> -CONFIG_RTE_SCHED_VECTOR=n
>>> +CONFIG_RTE_SCHED_VECTOR=y
>>
>> It's enough to remove this line only, I don't think you must enable it
>> explicitly in the armv8a common config.
>>
> Tried removing this line from armv8a config. But in that case
> RTE_SCHED_VECTOR doesn't get defined.
> ./config/common_base has "CONFIG_RTE_SCHED_VECTOR=n" as the default
> setting. So enabling explicitly is required.
>

I know it must be enabled to use your enhancement. But I meant to keep
the same as common_base (or other default configs) if there is no
other strange reason to enable it.

Thanks!
Jianbo


Re: [dpdk-dev] [PATCH v2] sched: add neon optimizations

2017-04-28 Thread Jianbo Liu
On 28 April 2017 at 14:15, Ashwin Sekhar T K
 wrote:
> * Removed setting CONFIG_RTE_SCHED_VECTOR=n from armv8a config
>   so that the setting from common_base is taken as the default
>   setting for armv8a
> * Verified the changes with sched_autotest unit test case
>
> Signed-off-by: Ashwin Sekhar T K 
> ---
> v2:
> * Removed line CONFIG_RTE_SCHED_VECTOR=n from 
> config/defconfig_arm64-armv8a-linuxapp-gcc
> * Modified the commit message and body to reflect the changes
>
>  config/defconfig_arm64-armv8a-linuxapp-gcc |  2 --
>  lib/librte_sched/rte_sched.c   | 22 ++
>  2 files changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc 
> b/config/defconfig_arm64-armv8a-linuxapp-gcc
> index 65888ce..6415f46 100644
> --- a/config/defconfig_arm64-armv8a-linuxapp-gcc
> +++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
> @@ -47,5 +47,3 @@ CONFIG_RTE_EAL_IGB_UIO=n
>  CONFIG_RTE_LIBRTE_FM10K_PMD=n
>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>  CONFIG_RTE_LIBRTE_AVP_PMD=n
> -
> -CONFIG_RTE_SCHED_VECTOR=n
> diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
> index 614705d..4ba476a 100644
> --- a/lib/librte_sched/rte_sched.c
> +++ b/lib/librte_sched/rte_sched.c
> @@ -58,6 +58,8 @@
>
>  #if defined(__SSE4__)
>  #define SCHED_VECTOR_SSE4
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#define SCHED_VECTOR_NEON
>  #endif
>
>  #endif
> @@ -1732,6 +1734,26 @@ grinder_pipe_exists(struct rte_sched_port *port, 
> uint32_t base_pipe)
> return 1;
>  }
>
> +#elif defined(SCHED_VECTOR_NEON)
> +
> +static inline int
> +grinder_pipe_exists(struct rte_sched_port *port, uint32_t base_pipe)
> +{
> +   uint32x4_t index, pipes;
> +   uint32_t *pos = (uint32_t *)port->grinder_base_bmp_pos;
> +
> +   index = vmovq_n_u32(base_pipe);
> +   pipes = vld1q_u32(pos);
> +   if (!vminvq_u32(veorq_u32(pipes, index)))
> +   return 1;
> +
> +   pipes = vld1q_u32(pos + 4);
> +   if (!vminvq_u32(veorq_u32(pipes, index)))
> +   return 1;
> +
> +   return 0;
> +}
> +
>  #else
>
>  static inline int
> --
> 2.7.4
>

Acked-by: Jianbo Liu 


Re: [dpdk-dev] [PATCH v2] efd: support lookup using neon intrinsics

2017-04-28 Thread Jianbo Liu
On 27 April 2017 at 20:44, Ashwin Sekhar T K
 wrote:
> * Added file lib/librte_efd/rte_efd_arm64.h to hold arm64
>   specific definitions
> * Verified the changes with efd_autotest unit test case
>
> Signed-off-by: Ashwin Sekhar T K 
> ---
> v2:
> * Slightly modified the content of the commit message body
> * Added prefix [dpdk-dev] to the email subject line
>
>  MAINTAINERS|  1 +
>  lib/librte_efd/rte_efd.c   | 22 
>  lib/librte_efd/rte_efd_arm64.h | 76 
> ++
>  3 files changed, 99 insertions(+)
>  create mode 100644 lib/librte_efd/rte_efd_arm64.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index b6495d2..7d708ae 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
>  F: lib/librte_acl/acl_run_neon.*
>  F: lib/librte_lpm/rte_lpm_neon.h
>  F: lib/librte_hash/rte*_arm64.h
> +F: lib/librte_efd/rte*_arm64.h
>  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
>  F: drivers/net/i40e/i40e_rxtx_vec_neon.c
>  F: drivers/net/virtio/virtio_rxtx_simple_neon.c
> diff --git a/lib/librte_efd/rte_efd.c b/lib/librte_efd/rte_efd.c
> index f601d62..4d9a088 100644
> --- a/lib/librte_efd/rte_efd.c
> +++ b/lib/librte_efd/rte_efd.c
> @@ -53,6 +53,8 @@
>  #include "rte_efd.h"
>  #if defined(RTE_ARCH_X86)
>  #include "rte_efd_x86.h"
> +#elif defined(RTE_ARCH_ARM64)
> +#include "rte_efd_arm64.h"
>  #endif
>
>  #define EFD_KEY(key_idx, table) (table->keys + ((key_idx) * table->key_len))
> @@ -103,6 +105,7 @@ allocated memory
>  enum efd_lookup_internal_function {
> EFD_LOOKUP_SCALAR = 0,
> EFD_LOOKUP_AVX2,
> +   EFD_LOOKUP_NEON,

Should it be included in "if defined(RTE_ARCH_ARM64)"?

> EFD_LOOKUP_NUM
>  };
>
> @@ -674,6 +677,16 @@ rte_efd_create(const char *name, uint32_t max_num_rules, 
> uint32_t key_len,
> table->lookup_fn = EFD_LOOKUP_AVX2;
> else
>  #endif
> +#if defined(RTE_ARCH_ARM64)
> +   /*
> +* For less than or equal to 16 bits, scalar function performs better
> +* than vectorised version
> +*/
> +   if (RTE_EFD_VALUE_NUM_BITS > 16 &&
> +   rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
> +   table->lookup_fn = EFD_LOOKUP_NEON;
> +   else
> +#endif
> table->lookup_fn = EFD_LOOKUP_SCALAR;
>
> /*
> @@ -1271,6 +1284,15 @@ efd_lookup_internal(const struct 
> efd_online_group_entry * const group,
> group->lookup_table,
> hash_val_a,
> hash_val_b);
> +   break;
> +#endif
> +#if defined(RTE_ARCH_ARM64)
> +   case EFD_LOOKUP_NEON:
> +   return efd_lookup_internal_neon(group->hash_idx,
> +   group->lookup_table,
> +   hash_val_a,
> +   hash_val_b);
> +   break;
>  #endif
> case EFD_LOOKUP_SCALAR:
> /* Fall-through */
> diff --git a/lib/librte_efd/rte_efd_arm64.h b/lib/librte_efd/rte_efd_arm64.h
> new file mode 100644
> index 000..cc93411
> --- /dev/null
> +++ b/lib/librte_efd/rte_efd_arm64.h
> @@ -0,0 +1,76 @@
> +/*
> + *   BSD LICENSE
> + *
> + *   Copyright (C) Cavium networks Ltd. 2017.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + *   notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + *   notice, this list of conditions and the following disclaimer in
> + *   the documentation and/or other materials provided with the
> + *   distribution.
> + * * Neither the name of Cavium networks nor the names of its
> + *   contributors may be used to endorse or promote products derived
> + *   from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILI

Re: [dpdk-dev] [PATCH v2] efd: support lookup using neon intrinsics

2017-05-01 Thread Jianbo Liu
On 28 April 2017 at 18:38, Sekhar, Ashwin  wrote:
> On Friday 28 April 2017 03:36 PM, Jianbo Liu wrote:
>> On 27 April 2017 at 20:44, Ashwin Sekhar T K
>>  wrote:
>>> * Added file lib/librte_efd/rte_efd_arm64.h to hold arm64
>>>   specific definitions
>>> * Verified the changes with efd_autotest unit test case
>>>
>>> Signed-off-by: Ashwin Sekhar T K 
>>> ---
>>> v2:
>>> * Slightly modified the content of the commit message body
>>> * Added prefix [dpdk-dev] to the email subject line
>>>
>>>  MAINTAINERS|  1 +
>>>  lib/librte_efd/rte_efd.c   | 22 
>>>  lib/librte_efd/rte_efd_arm64.h | 76 
>>> ++
>>>  3 files changed, 99 insertions(+)
>>>  create mode 100644 lib/librte_efd/rte_efd_arm64.h
>>>
>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>> index b6495d2..7d708ae 100644
>>> --- a/MAINTAINERS
>>> +++ b/MAINTAINERS
>>> @@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
>>>  F: lib/librte_acl/acl_run_neon.*
>>>  F: lib/librte_lpm/rte_lpm_neon.h
>>>  F: lib/librte_hash/rte*_arm64.h
>>> +F: lib/librte_efd/rte*_arm64.h
>>>  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
>>>  F: drivers/net/i40e/i40e_rxtx_vec_neon.c
>>>  F: drivers/net/virtio/virtio_rxtx_simple_neon.c
>>> diff --git a/lib/librte_efd/rte_efd.c b/lib/librte_efd/rte_efd.c
>>> index f601d62..4d9a088 100644
>>> --- a/lib/librte_efd/rte_efd.c
>>> +++ b/lib/librte_efd/rte_efd.c
>>> @@ -53,6 +53,8 @@
>>>  #include "rte_efd.h"
>>>  #if defined(RTE_ARCH_X86)
>>>  #include "rte_efd_x86.h"
>>> +#elif defined(RTE_ARCH_ARM64)
>>> +#include "rte_efd_arm64.h"
>>>  #endif
>>>
>>>  #define EFD_KEY(key_idx, table) (table->keys + ((key_idx) * 
>>> table->key_len))
>>> @@ -103,6 +105,7 @@ allocated memory
>>>  enum efd_lookup_internal_function {
>>> EFD_LOOKUP_SCALAR = 0,
>>> EFD_LOOKUP_AVX2,
>>> +   EFD_LOOKUP_NEON,
>>
>> Should it be included in "if defined(RTE_ARCH_ARM64)"?
>>
> The enum can be wrapped under "if defined(RTE_ARCH_ARM64)" with no
> issues, as all its usages are also under "if defined(RTE_ARCH_ARM64)".
> I followed EFD_LOOKUP_AVX2 and defined EFD_LOOKUP_NEON on the same lines.
> Please advise on whether this change is to be made. Will follow your advice.

Yes, please do that.


[dpdk-dev] [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup

2017-05-02 Thread Jianbo Liu
Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h | 302 ++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 280 +--
 3 files changed, 309 insertions(+), 275 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 000..636dea4
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+   uint8_t portid, uint16_t dst_port[8])
+{
+   int32_t ret[8];
+   union ipv4_5tuple_host key[8];
+
+   get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+   get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+   get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+   get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+   get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+   get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+   get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+   get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+   const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+   &key[4], &key[5], &key[6], &key[7]};
+
+   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+   dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[0]]);
+   dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[1]]);
+   dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[2]]);
+   dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[3]]);
+   dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[4]]);
+   dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[5]]);
+   dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[6]]);
+   dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[7]]);
+
+   if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+   (enabled_port_mask & 1 << dst_port[0]) == 0)
+   dst_port[0] = portid;
+
+   if (dst_port[

[dpdk-dev] [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h

2017-05-02 Thread Jianbo Liu
The l3fwd_em_sse.h is enabled by NO_HASH_MULTI_LOOKUP.
Renaming it because it's only for single hash lookup,
and doesn't include any x86 SSE instructions.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c| 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_single.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_single.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..cccf797 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_single.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_single.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_single.h
-- 
1.8.3.1



[dpdk-dev] [PATCH 4/5] examples/l3fwd: rearrange the code for lpm_l3fwd

2017-05-02 Thread Jianbo Liu
Signed-off-by: Jianbo Liu 

Some common code can be used by other ARCHs, move to l3fwd_lpm.c
---
 examples/l3fwd/l3fwd_lpm.c | 83 ++
 examples/l3fwd/l3fwd_lpm.h | 26 +
 examples/l3fwd/l3fwd_lpm_sse.h | 66 -
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..fc554fc 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+   uint32_t next_hop;
+   struct rte_lpm *ipv4_l3fwd_lookup_struct =
+   (struct rte_lpm *)lookup_struct;
+
+   return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+   rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+   &next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+   uint32_t next_hop;
+   struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+   (struct rte_lpm6 *)lookup_struct;
+
+   return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+   ((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+   &next_hop) == 0) ?  next_hop : portid);
+}
+
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+   uint8_t portid)
+{
+   struct ipv6_hdr *ipv6_hdr;
+   struct ipv4_hdr *ipv4_hdr;
+   struct ether_hdr *eth_hdr;
+
+   if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+   return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+qconf->ipv4_lookup_struct);
+   } else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+   return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+qconf->ipv6_lookup_struct);
+   }
+
+   return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+   uint32_t dst_ipv4, uint8_t portid)
+{
+   uint32_t next_hop;
+   struct ipv6_hdr *ipv6_hdr;
+   struct ether_hdr *eth_hdr;
+
+   if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+   return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+  dst_ipv4, &next_hop) == 0)
+  ? next_hop : portid);
+
+   } else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+   return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+   ipv6_hdr->dst_addr, &next_hop) == 0)
+   ? next_hop : portid);
+
+   }
+
+   return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 258a82f..4865d90 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-   uint32_t next_hop;
-   struct rte_lpm *ipv4_l3fwd_lookup_struct =
-   (struct rte_lpm *)lookup_struct;
-
-   return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-   rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-   &next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-   uint32_t next_hop;
-   struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-   (struct rte_lpm6 *)lookup_struct;
-
-   return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-   ((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-   &next_hop) == 0) ?  next_hop : portid);
-}
-
 static inline __attribute__((a

[dpdk-dev] [PATCH 5/5] examples/l3fwd: add neon support for l3fwd

2017-05-02 Thread Jianbo Liu
Use ARM NEON intrinsics to accelerate l3 forwarding.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd.h |   4 -
 examples/l3fwd/l3fwd_em.c  |   4 +-
 examples/l3fwd/l3fwd_em_hlm.h  |   5 +
 examples/l3fwd/l3fwd_em_hlm_neon.h |  74 +++
 examples/l3fwd/l3fwd_em_single.h   |   4 +
 examples/l3fwd/l3fwd_lpm.c |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h| 157 ++
 examples/l3fwd/l3fwd_neon.h| 259 +
 8 files changed, 504 insertions(+), 7 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 011ba14..c45589a 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -40,10 +40,6 @@
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
 
-#if !defined(NO_HASH_MULTI_LOOKUP) && defined(RTE_MACHINE_CPUFLAG_NEON)
-#define NO_HASH_MULTI_LOOKUP 1
-#endif
-
 #define MAX_PKT_BURST 32
 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
 
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index cccf797..ac1e2e0 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_single.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
if (nb_rx == 0)
continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
l3fwd_em_send_packets(nb_rx, pkts_burst,
portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..3329c1a 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_NEON_H__
+#define __L3FWD_EM_HLM_NEON_H__
+
+#include <arm_neon.h>
+
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+   union ipv4_5tuple_host *key)
+{
+   int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0, int32_t *,
+   sizeof(struct ether_hdr) +
+   offsetof(struct ipv4_hdr, time_to_live)));
+
+   key->xmm = vandq_s32(t

[dpdk-dev] [PATCH 3/5] examples/l3fwd: extract common code from multi packet send

2017-05-02 Thread Jianbo Liu
Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_common.h | 293 ++
 examples/l3fwd/l3fwd_sse.h| 255 +---
 2 files changed, 297 insertions(+), 251 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 000..d7a1fdf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#define IPV4_MIN_VER_IHL 0x45
+#define IPV4_MAX_VER_IHL 0x4f
+#define IPV4_MAX_VER_IHL_DIFF (IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define IPV4_MIN_LEN_BE (sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static inline __attribute__((always_inline)) void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+   uint8_t ihl;
+
+   if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+   ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+   ipv4_hdr->time_to_live--;
+   ipv4_hdr->hdr_checksum++;
+
+   if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+   ((uint8_t)ipv4_hdr->total_length == 0 &&
+   ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+   dp[0] = BAD_PORT;
+
+   }
+}
+
+#else
+#define rfc1812_process(mb, dp, ptype) do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define GRPSZ (1 << FWDSTEP)
+#define GRPMSK (GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx) do { \
+   if (likely((dlp) == (dcp)[(idx)])) { \
+   (lp)[0]++;   \
+   } else { \
+   (dlp) = (dcp)[idx];  \
+   (lp) = (pn) + (idx

Re: [dpdk-dev] [PATCH v3] efd: support lookup using neon intrinsics

2017-05-02 Thread Jianbo Liu
On 2 May 2017 at 14:41, Jerin Jacob  wrote:
> -Original Message-
>> Date: Mon,  1 May 2017 22:59:53 -0700
>> From: Ashwin Sekhar T K 
>> To: byron.mar...@intel.com, pablo.de.lara.gua...@intel.com,
>>  jerin.ja...@caviumnetworks.com, jianbo@linaro.org
>> Cc: dev@dpdk.org, Ashwin Sekhar T K 
>> Subject: [dpdk-dev] [PATCH v3] efd: support lookup using neon intrinsics
>> X-Mailer: git-send-email 2.13.0.rc1
>>
>> * Added file lib/librte_efd/rte_efd_arm64.h to hold arm64
>>   specific definitions
>> * Verified the changes with efd_autotest unit test case
>>
>> Signed-off-by: Ashwin Sekhar T K 
>> ---
>> v2:
>> * Slightly modified the content of the commit message body
>> * Added prefix [dpdk-dev] to the email subject line
>>
>> v3:
>> * Moved enum 'EFD_LOOKUP_NEON' under '#if defined(RTE_ARCH_ARM64)'
>>
>>  MAINTAINERS|  1 +
>>  lib/librte_efd/rte_efd.c   | 24 +
>>  lib/librte_efd/rte_efd_arm64.h | 76 
>> ++
>>  3 files changed, 101 insertions(+)
>>  create mode 100644 lib/librte_efd/rte_efd_arm64.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index b6495d2..7d708ae 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
>>  F: lib/librte_acl/acl_run_neon.*
>>  F: lib/librte_lpm/rte_lpm_neon.h
>>  F: lib/librte_hash/rte*_arm64.h
>> +F: lib/librte_efd/rte*_arm64.h
>>  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
>>  F: drivers/net/i40e/i40e_rxtx_vec_neon.c
>>  F: drivers/net/virtio/virtio_rxtx_simple_neon.c
>> diff --git a/lib/librte_efd/rte_efd.c b/lib/librte_efd/rte_efd.c
>> index f601d62..5cc6283 100644
>> --- a/lib/librte_efd/rte_efd.c
>> +++ b/lib/librte_efd/rte_efd.c
>> @@ -53,6 +53,8 @@
>>  #include "rte_efd.h"
>>  #if defined(RTE_ARCH_X86)
>>  #include "rte_efd_x86.h"
>> +#elif defined(RTE_ARCH_ARM64)
>> +#include "rte_efd_arm64.h"
>>  #endif
>>
>>  #define EFD_KEY(key_idx, table) (table->keys + ((key_idx) * table->key_len))
>> @@ -103,6 +105,9 @@ allocated memory
>>  enum efd_lookup_internal_function {
>>   EFD_LOOKUP_SCALAR = 0,
>>   EFD_LOOKUP_AVX2,
>> +#if defined(RTE_ARCH_ARM64)
>> + EFD_LOOKUP_NEON,
>> +#endif
>
> I think, we can remove this ifdef to
> - Make code looks clean
> - In future, in some case a new enum value gets added then the value
> will be different for each build.
>

But the enum items are the same for each ARCH.
Besides, the ifdef could be considered an explanation of that enum. If
someone knows nothing about arm/neon, they can ignore it entirely after
seeing the ifdef.

> Any valid point to keep under RTE_ARCH_ARM64?
>
>>   EFD_LOOKUP_NUM
>>  };


Re: [dpdk-dev] [PATCH 5/5] examples/l3fwd: add neon support for l3fwd

2017-05-02 Thread Jianbo Liu
Hi Ashwin,

On 2 May 2017 at 19:47, Sekhar, Ashwin  wrote:
> Hi Jianbo,
>
> I tested your neon changes on thunderx. I am seeing a performance
> regression of ~10% for LPM case and ~20% for EM case with your changes.
> Did you see improvement on any arm64 platform with these changes. If
> yes, how much was the improvement?

Thanks for your reviewing and testing.
For some reason, I have not done much with the performance testing.
I'll send a new version later after tuning the performance.

Thanks!
Jianbo

>
> FYI, I had also tried vectorizing the l3fwd app with neon. Few of the
> optimizations that I can suggest that helped in my case.
>
> * Packet data prefetch is missing in the x86 sse version compared to
> the scalar version (l3fwd_lpm_send_packets vs
> l3fwd_lpm_no_opt_send_packets) . I couldn't understand why this was not
> done in x86. But adding the prefetch was improving performance for
> thunderx.
>
> * Offsets to some packet elements like eth_hdr, ip header, packet type
> etc. are recalculated in different functions. Calculating them once,
> caching them and passing them directly to different functions was
> improving performance.
>
> * There are 3 different loops in l3fwd_lpm_send_packets where we
> iterate over the packets. One each for processx4_step1 and
> processx4_step2 and one in send_packets_multi. Unifying these loops
> were also helping.
>
> Thanks and Regards
> Ashwin
>


Re: [dpdk-dev] [PATCH 5/5] examples/l3fwd: add neon support for l3fwd

2017-05-04 Thread Jianbo Liu
Hi Ashwin,

On 3 May 2017 at 13:24, Jianbo Liu  wrote:
> Hi Ashwin,
>
> On 2 May 2017 at 19:47, Sekhar, Ashwin  wrote:
>> Hi Jianbo,
>>
>> I tested your neon changes on thunderx. I am seeing a performance
>> regression of ~10% for LPM case and ~20% for EM case with your changes.
>> Did you see improvement on any arm64 platform with these changes. If
>> yes, how much was the improvement?
>
> Thanks for your reviewing and testing.
> For some reason, I have not done much with the performance testing.
> I'll send a new version later after tuning the performance.
>

Can you tell me how did you test?
My testing shows that EM case is much better, while LPM is almost the
same as before.

Thanks!
Jianbo


Re: [dpdk-dev] [PATCH 5/5] examples/l3fwd: add neon support for l3fwd

2017-05-04 Thread Jianbo Liu
On 5 May 2017 at 12:24, Sekhar, Ashwin  wrote:
> On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:
>> Hi Ashwin,
>>
>> On 3 May 2017 at 13:24, Jianbo Liu  wrote:
>> >
>> > Hi Ashwin,
>> >
>> > On 2 May 2017 at 19:47, Sekhar, Ashwin 
>> > wrote:
>> > >
>> > > Hi Jianbo,
>> > >
>> > > I tested your neon changes on thunderx. I am seeing a performance
>> > > regression of ~10% for LPM case and ~20% for EM case with your
>> > > changes.
>> > > Did you see improvement on any arm64 platform with these changes.
>> > > If
>> > > yes, how much was the improvement?
>> > Thanks for your reviewing and testing.
>> > For some reason, I have not done much with the performance testing.
>> > I'll send a new version later after tuning the performance.
>> >
>> Can you tell me how did you test?
> Built with following commands.
> make config T=arm64-thunderx-linuxapp-gcc
> make -j32
>
> Tested LPM with
> sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 
> --config="(0,0,10)"
>
> Tested EM with
> sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p 0x1 
> --config="(0,0,10)" -E
>

Only one port? What's the network topology, and lpm/em rules? How did
you stress traffic...?

>> My testing shows that EM case is much better, while LPM is almost the
>> same as before.
> Could you please tell on which arm64 processor/platform you tested.
> Also how much was the percentage increase in performance for EM ?
>

I'm sorry, but I can't tell you which arm64 platform I tested on. But I
can get a ThunderX and replicate your testing environment if you can
tell me more...

Thanks!
Jianbo


[dpdk-dev] [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform

2017-05-09 Thread Jianbo Liu
v2:
  - change name of l3fwd_em_sse.h to l3fwd_em_sequential.h
  - add the times of hash multi-lookup for different Archs
  - performance tuning on ThunderX: prefetching, set NO_HASH_MULTI_LOOKUP ...

Jianbo Liu (7):
  examples/l3fwd: extract arch independent code from multi hash lookup
  examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  examples/l3fwd: extract common code from multi packet send
  examples/l3fwd: rearrange the code for lpm_l3fwd
  examples/l3fwd: add neon support for l3fwd
  examples/l3fwd: add the times of hash multi-lookup for different Archs
  examples/l3fwd: change the guard macro name for header file

 examples/l3fwd/l3fwd_common.h  | 293 +
 examples/l3fwd/l3fwd_em.c  |   8 +-
 examples/l3fwd/l3fwd_em_hlm.h  | 220 
 examples/l3fwd/l3fwd_em_hlm_neon.h |  74 ++
 examples/l3fwd/l3fwd_em_hlm_sse.h  | 280 +---
 .../{l3fwd_em_sse.h => l3fwd_em_sequential.h}  |  26 +-
 examples/l3fwd/l3fwd_lpm.c |  87 +-
 examples/l3fwd/l3fwd_lpm.h |  26 +-
 examples/l3fwd/l3fwd_lpm_neon.h| 165 
 examples/l3fwd/l3fwd_lpm_sse.h |  66 -
 examples/l3fwd/l3fwd_neon.h| 259 ++
 examples/l3fwd/l3fwd_sse.h | 255 +-
 12 files changed, 1133 insertions(+), 626 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (86%)
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

-- 
1.8.3.1



[dpdk-dev] [PATCH v2 3/7] examples/l3fwd: extract common code from multi packet send

2017-05-09 Thread Jianbo Liu
Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_common.h | 293 ++
 examples/l3fwd/l3fwd_sse.h| 255 +---
 2 files changed, 297 insertions(+), 251 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 000..d7a1fdf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#defineIPV4_MIN_VER_IHL0x45
+#defineIPV4_MAX_VER_IHL0x4f
+#defineIPV4_MAX_VER_IHL_DIFF   (IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#defineIPV4_MIN_LEN_BE (sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static inline __attribute__((always_inline)) void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+   uint8_t ihl;
+
+   if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+   ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+   ipv4_hdr->time_to_live--;
+   ipv4_hdr->hdr_checksum++;
+
+   if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+   ((uint8_t)ipv4_hdr->total_length == 0 &&
+   ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+   dp[0] = BAD_PORT;
+
+   }
+}
+
+#else
+#definerfc1812_process(mb, dp, ptype)  do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#defineGRPSZ   (1 << FWDSTEP)
+#defineGRPMSK  (GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx) do { \
+   if (likely((dlp) == (dcp)[(idx)])) { \
+   (lp)[0]++;   \
+   } else { \
+   (dlp) = (dcp)[idx];  \
+   (lp) = (pn) + (idx

[dpdk-dev] [PATCH v2 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h

2017-05-09 Thread Jianbo Liu
The l3fwd_em_sse.h is enabled by NO_HASH_LOOKUP_MULTI.
Renaming it because it's only for sequential hash lookup,
and doesn't include any x86 SSE instructions.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c| 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..ba844b2 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_sequential.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h 
b/examples/l3fwd/l3fwd_em_sequential.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_sequential.h
-- 
1.8.3.1



[dpdk-dev] [PATCH v2 1/7] examples/l3fwd: extract arch independent code from multi hash lookup

2017-05-09 Thread Jianbo Liu
Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h | 302 ++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 280 +--
 3 files changed, 309 insertions(+), 275 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 000..636dea4
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+   uint8_t portid, uint16_t dst_port[8])
+{
+   int32_t ret[8];
+   union ipv4_5tuple_host key[8];
+
+   get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+   get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+   get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+   get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+   get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+   get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+   get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+   get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+   const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+   &key[4], &key[5], &key[6], &key[7]};
+
+   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+   dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[0]]);
+   dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[1]]);
+   dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[2]]);
+   dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[3]]);
+   dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[4]]);
+   dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[5]]);
+   dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[6]]);
+   dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[7]]);
+
+   if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+   (enabled_port_mask & 1 << dst_port[0]) == 0)
+   dst_port[0] = portid;
+
+   if (dst_port[

[dpdk-dev] [PATCH v2 7/7] examples/l3fwd: change the guard micro name for header file

2017-05-09 Thread Jianbo Liu
As l3fwd_em_sse.h is renamed to l3fwd_em_sequential.h, change the macro
to __L3FWD_EM_SEQUENTIAL_H__ to maintain consistency.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em_sequential.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_sequential.h 
b/examples/l3fwd/l3fwd_em_sequential.h
index c3df473..63c5c12 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -31,8 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __L3FWD_EM_SSE_H__
-#define __L3FWD_EM_SSE_H__
+#ifndef __L3FWD_EM_SEQUENTIAL_H__
+#define __L3FWD_EM_SEQUENTIAL_H__
 
 /**
  * @file
@@ -125,4 +125,4 @@ static inline __attribute__((always_inline)) uint16_t
 
send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
-#endif /* __L3FWD_EM_SSE_H__ */
+#endif /* __L3FWD_EM_SEQUENTIAL_H__ */
-- 
1.8.3.1



[dpdk-dev] [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd

2017-05-09 Thread Jianbo Liu
Use ARM NEON intrinsics to accelerate l3 forwarding.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c|   4 +-
 examples/l3fwd/l3fwd_em_hlm.h|  19 ++-
 examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++
 examples/l3fwd/l3fwd_em_sequential.h |  20 ++-
 examples/l3fwd/l3fwd_lpm.c   |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h  | 165 ++
 examples/l3fwd/l3fwd_neon.h  | 259 +++
 7 files changed, 539 insertions(+), 6 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index ba844b2..da96cfd 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sequential.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
if (nb_rx == 0)
continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
l3fwd_em_send_packets(nb_rx, pkts_burst,
portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..4ec600a 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
@@ -238,7 +243,7 @@ static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
uint8_t portid, struct lcore_conf *qconf)
 {
-   int32_t j;
+   int32_t i, j, pos;
uint16_t dst_port[MAX_PKT_BURST];
 
/*
@@ -247,6 +252,12 @@ static inline __attribute__((always_inline)) uint16_t
 */
int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
 
+   for (j = 0; j < 8 && j < nb_rx; j++) {
+   rte_prefetch0(pkts_burst[j]);
+   rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+  struct ether_hdr *) + 1);
+   }
+
for (j = 0; j < n; j += 8) {
 
uint32_t pkt_type =
@@ -263,6 +274,12 @@ static inline __attribute__((always_inline)) uint16_t
uint32_t tcp_or_udp = pkt_type &
(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
+   for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+   rte_prefetch0(pkts_burst[pos]);
+   rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+  struct ether_hdr *) + 1);
+   }
+
if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h 
b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPY

[dpdk-dev] [PATCH v2 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs

2017-05-09 Thread Jianbo Liu
Add a new macro to define how many hash lookups are done at one time; this
makes the code more concise.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em_hlm.h | 241 +-
 1 file changed, 71 insertions(+), 170 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 4ec600a..10a9c95 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -43,148 +43,65 @@
 #include "l3fwd_em_hlm_neon.h"
 #endif
 
+#ifdef RTE_ARCH_ARM64
+#define EM_HASH_LOOKUP_COUNT 16
+#else
+#define EM_HASH_LOOKUP_COUNT 8
+#endif
+
+
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-   uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv4xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+   uint8_t portid, uint16_t dst_port[])
 {
-   int32_t ret[8];
-   union ipv4_5tuple_host key[8];
-
-   get_ipv4_5tuple(m[0], mask0.x, &key[0]);
-   get_ipv4_5tuple(m[1], mask0.x, &key[1]);
-   get_ipv4_5tuple(m[2], mask0.x, &key[2]);
-   get_ipv4_5tuple(m[3], mask0.x, &key[3]);
-   get_ipv4_5tuple(m[4], mask0.x, &key[4]);
-   get_ipv4_5tuple(m[5], mask0.x, &key[5]);
-   get_ipv4_5tuple(m[6], mask0.x, &key[6]);
-   get_ipv4_5tuple(m[7], mask0.x, &key[7]);
-
-   const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-   &key[4], &key[5], &key[6], &key[7]};
-
-   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-   dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[0]]);
-   dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[1]]);
-   dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[2]]);
-   dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[3]]);
-   dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[4]]);
-   dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[5]]);
-   dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[6]]);
-   dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[7]]);
-
-   if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[0]) == 0)
-   dst_port[0] = portid;
-
-   if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[1]) == 0)
-   dst_port[1] = portid;
-
-   if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[2]) == 0)
-   dst_port[2] = portid;
-
-   if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[3]) == 0)
-   dst_port[3] = portid;
-
-   if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[4]) == 0)
-   dst_port[4] = portid;
-
-   if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[5]) == 0)
-   dst_port[5] = portid;
-
-   if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[6]) == 0)
-   dst_port[6] = portid;
-
-   if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[7]) == 0)
-   dst_port[7] = portid;
+   int i;
+   int32_t ret[EM_HASH_LOOKUP_COUNT];
+   union ipv4_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+   const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+   for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+   get_ipv4_5tuple(m[i], mask0.x, &key[i]);
+   key_array[i] = &key[i];
+   }
+
+   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0],
+EM_HASH_LOOKUP_COUNT, ret);
+
+   for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+   dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[i]]);
 
+   if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+   (enabled_port_mask & 1 << dst_port[i]) == 0)
+   dst_port[i] = portid;
+   }
 }
 
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-   uint8_t portid, uint16_t dst_port[8])
+

[dpdk-dev] [PATCH v2 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd

2017-05-09 Thread Jianbo Liu
Signed-off-by: Jianbo Liu 

Some common code can be used by other architectures, so move it to l3fwd_lpm.c.
---
 examples/l3fwd/l3fwd_lpm.c | 83 ++
 examples/l3fwd/l3fwd_lpm.h | 26 +
 examples/l3fwd/l3fwd_lpm_sse.h | 66 -
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..fc554fc 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+   uint32_t next_hop;
+   struct rte_lpm *ipv4_l3fwd_lookup_struct =
+   (struct rte_lpm *)lookup_struct;
+
+   return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+   rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+   &next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+   uint32_t next_hop;
+   struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+   (struct rte_lpm6 *)lookup_struct;
+
+   return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+   ((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+   &next_hop) == 0) ?  next_hop : portid);
+}
+
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+   uint8_t portid)
+{
+   struct ipv6_hdr *ipv6_hdr;
+   struct ipv4_hdr *ipv4_hdr;
+   struct ether_hdr *eth_hdr;
+
+   if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+   return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+qconf->ipv4_lookup_struct);
+   } else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+   return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+qconf->ipv6_lookup_struct);
+   }
+
+   return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf 
*pkt,
+   uint32_t dst_ipv4, uint8_t portid)
+{
+   uint32_t next_hop;
+   struct ipv6_hdr *ipv6_hdr;
+   struct ether_hdr *eth_hdr;
+
+   if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+   return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+  dst_ipv4, &next_hop) == 0)
+  ? next_hop : portid);
+
+   } else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+   return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+   ipv6_hdr->dst_addr, &next_hop) == 0)
+   ? next_hop : portid);
+
+   }
+
+   return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 258a82f..4865d90 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-   uint32_t next_hop;
-   struct rte_lpm *ipv4_l3fwd_lookup_struct =
-   (struct rte_lpm *)lookup_struct;
-
-   return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-   rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-   &next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-   uint32_t next_hop;
-   struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-   (struct rte_lpm6 *)lookup_struct;
-
-   return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-   ((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-   &next_hop) == 0) ?  next_hop : portid);
-}
-
 static inline __attribute__((a

Re: [dpdk-dev] [PATCH 5/5] examples/l3fwd: add neon support for l3fwd

2017-05-09 Thread Jianbo Liu
Hi Ashwin,

On 9 May 2017 at 16:10, Sekhar, Ashwin  wrote:
> On Fri, 2017-05-05 at 13:43 +0800, Jianbo Liu wrote:
>> On 5 May 2017 at 12:24, Sekhar, Ashwin 
>> wrote:
>> >
>> > On Thu, 2017-05-04 at 16:42 +0800, Jianbo Liu wrote:
>> > >
>> > > Hi Ashwin,
>> > >
>> > > On 3 May 2017 at 13:24, Jianbo Liu  wrote:
>> > > >
>> > > >
>> > > > Hi Ashwin,
>> > > >
>> > > > On 2 May 2017 at 19:47, Sekhar, Ashwin > > > > m>
>> > > > wrote:
>> > > > >
>> > > > >
>> > > > > Hi Jianbo,
>> > > > >
>> > > > > I tested your neon changes on thunderx. I am seeing a
>> > > > > performance
>> > > > > regression of ~10% for LPM case and ~20% for EM case with
>> > > > > your
>> > > > > changes.
>> > > > > Did you see improvement on any arm64 platform with these
>> > > > > changes.
>> > > > > If
>> > > > > yes, how much was the improvement?
>> > > > Thanks for your reviewing and testing.
>> > > > For some reason, I have not done much with the performance
>> > > > testing.
>> > > > I'll send a new version later after tuning the performance.
>> > > >
>> > > Can you tell me how did you test?
>> > Built with following commands.
>> > make config T=arm64-thunderx-linuxapp-gcc
>> > make -j32
>> >
>> > Tested LPM with
>> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p
>> > 0x1 --config="(0,0,10)"
>> >
>> > Tested EM with
>> > sudo ./examples/l3fwd/build/l3fwd -l 9,10  --master-lcore 9  -- -p
>> > 0x1 --config="(0,0,10)" -E
>> >
>> Only one port? What's the network topology, and lpm/em rules? How did
>> you stress traffic...?
> port - 1 topology: DUT connected back to back to traffic generator.
>
> We are using the default rules in the C code. flow generation is:
> src.ip.min 192.168.18.1
> src.ip.max 192.168.18.90
> src.ip.inc 1
>
> Also, Please let us know the topology that you are using.

I used two ports with one rule to forward packets from one to the other.
Sent v2, please try this new version.

Thanks!
Jianbo


Re: [dpdk-dev] [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd

2017-05-10 Thread Jianbo Liu
Hi Ashwin,

On 10 May 2017 at 23:00, Sekhar, Ashwin  wrote:
> Hi Jianbo,
>
> Thanks for version v2. Addition of the prefetch instructions is
> definitely helping performance on ThunderX. But still performance is
> slightly less than that of scalar.
>
> I tried few small tweaks which helped improve performance on my
> Thunderx setup. For details see comments inline.
>
>
> On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote:
>> Use ARM NEON intrinsics to accelerate l3 fowarding.
>>
>> Signed-off-by: Jianbo Liu 
>> ---
>>  examples/l3fwd/l3fwd_em.c|   4 +-
>>  examples/l3fwd/l3fwd_em_hlm.h|  19 ++-
>>  examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++
>>  examples/l3fwd/l3fwd_em_sequential.h |  20 ++-
>>  examples/l3fwd/l3fwd_lpm.c   |   4 +-
>>  examples/l3fwd/l3fwd_lpm_neon.h  | 165 ++
>>  examples/l3fwd/l3fwd_neon.h  | 259
>> +++
>>  7 files changed, 539 insertions(+), 6 deletions(-)
>>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
>>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
>>  create mode 100644 examples/l3fwd/l3fwd_neon.h
>>
>> [...]
>> diff --git a/examples/l3fwd/l3fwd_em_hlm.h
>> b/examples/l3fwd/l3fwd_em_hlm.h
>> index 636dea4..4ec600a 100644
>> --- a/examples/l3fwd/l3fwd_em_hlm.h
>> +++ b/examples/l3fwd/l3fwd_em_hlm.h
>> @@ -35,8 +35,13 @@
>>  #ifndef __L3FWD_EM_HLM_H__
>>  #define __L3FWD_EM_HLM_H__
>>
>> +#if defined(__SSE4_1__)
>>  #include "l3fwd_sse.h"
>>  #include "l3fwd_em_hlm_sse.h"
>> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
>> +#include "l3fwd_neon.h"
>> +#include "l3fwd_em_hlm_neon.h"
>> +#endif
>>
>>  static inline __attribute__((always_inline)) void
>>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf
>> *m[8],
>> @@ -238,7 +243,7 @@ static inline __attribute__((always_inline))
>> uint16_t
>>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>>   uint8_t portid, struct lcore_conf *qconf)
>>  {
>> - int32_t j;
>> + int32_t i, j, pos;
>>   uint16_t dst_port[MAX_PKT_BURST];
>>
>>   /*
>> @@ -247,6 +252,12 @@ static inline __attribute__((always_inline))
>> uint16_t
>>*/
>>   int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
>>
>> + for (j = 0; j < 8 && j < nb_rx; j++) {
>> + rte_prefetch0(pkts_burst[j]);
> The above prefetch of rte_mbuf struct is unnecessary. With this we wont
> see any performance improvement as the contents of rte_mbuf (buf_addr
> and data_off) is used in right next instruction. Removing the above
> prefetch and similar prefetches at multiple places was improving
> performance on my ThunderX setup.

Yes, will remove them.

>
>> + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
>> +struct ether_hdr *) +
>> 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1. In
> process_packet in l3fwd_neon.h, eth_header is accessed in
>

But the IP headers are used right away in each 8/FWDSTEP loop.
Since the IP headers are accessed first, we should prefetch eth_hdr + 1 first.
After all nb_rx packets are handled in the small loop above, their
Ethernet headers are then accessed again in processx4_step3.
I'm not sure prefetching eth_hdr still helps if we prefetch it
in the first step, as the cache may already be filled with new data by that
time.

>> + }
>> +
>>   for (j = 0; j < n; j += 8) {
>>
>>   uint32_t pkt_type =
>> @@ -263,6 +274,12 @@ static inline __attribute__((always_inline))
>> uint16_t
>>   uint32_t tcp_or_udp = pkt_type &
>>   (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
>>
>> + for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++,
>> pos++) {
>> + rte_prefetch0(pkts_burst[pos]);
> The above prefetch of rte_mbuf struct is unnecessary.
>
>> + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po
>> s],
>> +struct
>> ether_hdr *) + 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1
>
>> + }
>> +
>>   if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
>>
>>   em_get_dst_port_ipv4x8(qconf,
>> &pkts_burst[j], portid,
>>
>> [...]
>



>> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h
>&

Re: [dpdk-dev] [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd

2017-05-10 Thread Jianbo Liu
On 11 May 2017 at 12:27, Sekhar, Ashwin  wrote:
>
> On Thu, 2017-05-11 at 04:14 +, Sekhar, Ashwin wrote:
> ...
>> > > Combining all the above comments, I made some changes on top of
>> > > your
>> > > patch. These changes are giving 3-4% improvement over your
>> > > version.
>> > >
>> > > You may find the changes at
>> > > https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587faf
>> > > c
>> > >
>> > Is the correct in Line 103/104, you only process one packets in the
>> > last FWDSTEP packets?
>> Its doing processx4_* there. So its processing 4 packets.
>>
>> >
>> > Actually, I don't like your change in l3fwd_lpm_send_packets,
>> > making
>> > the simple logic complicated. And I don't think it can help to
>> > improve
>> > performance. :-)
>> Its not making it complicated. The number of lines of code may be
>> higher by may be 10 lines, but the conditions of the loops are
>> simplified which reduces the number of branch instructions and helps
>> the processor to go through them faster.

I suspect there is not much improvement we can get.

>>
>> If possible, please try it out on your machine.

OK, I'll test. If there is no performance regression, I'll adopt your suggestion in v3.

>
> Missed out one point.
> Since 2 loops are form "for (i = 0; i < FWDSTEP; i++)" i.e. looping for
> constant number of iterations, compiler will easily unroll them.
>
> Thanks
> Ashwin
>> >
>> >
>> > >
>> > >
>> > > Please check it out and let me know your comments.
>> > >
>> > > Thanks
>> > > Ashwin


[dpdk-dev] [PATCH v3 3/7] examples/l3fwd: extract common code from multi packet send

2017-05-11 Thread Jianbo Liu
Keep x86 related code in l3fwd_sse.h, and move common code to
l3fwd_common.h, which will be used by other Archs.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_common.h | 293 ++
 examples/l3fwd/l3fwd_sse.h| 255 +---
 2 files changed, 297 insertions(+), 251 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
new file mode 100644
index 000..d7a1fdf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_common.h
@@ -0,0 +1,293 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#defineIPV4_MIN_VER_IHL0x45
+#defineIPV4_MAX_VER_IHL0x4f
+#defineIPV4_MAX_VER_IHL_DIFF   (IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#defineIPV4_MIN_LEN_BE (sizeof(struct ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static inline __attribute__((always_inline)) void
+rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+   uint8_t ihl;
+
+   if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+   ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+   ipv4_hdr->time_to_live--;
+   ipv4_hdr->hdr_checksum++;
+
+   if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+   ((uint8_t)ipv4_hdr->total_length == 0 &&
+   ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+   dp[0] = BAD_PORT;
+
+   }
+}
+
+#else
+#definerfc1812_process(mb, dp, ptype)  do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#defineGRPSZ   (1 << FWDSTEP)
+#defineGRPMSK  (GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx) do { \
+   if (likely((dlp) == (dcp)[(idx)])) { \
+   (lp)[0]++;   \
+   } else { \
+   (dlp) = (dcp)[idx];  \
+   (lp) = (pn) + (idx

[dpdk-dev] [PATCH v3 1/7] examples/l3fwd: extract arch independent code from multi hash lookup

2017-05-11 Thread Jianbo Liu
Extract common code from l3fwd_em_hlm_sse.h, and add to the new file
l3fwd_em_hlm.h.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c |   2 +-
 examples/l3fwd/l3fwd_em_hlm.h | 302 ++
 examples/l3fwd/l3fwd_em_hlm_sse.h | 280 +--
 3 files changed, 309 insertions(+), 275 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9cc4460..939a16d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -332,7 +332,7 @@ struct ipv6_l3fwd_em_route {
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
-#include "l3fwd_em_hlm_sse.h"
+#include "l3fwd_em_hlm.h"
 #endif
 #else
 #include "l3fwd_em.h"
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
new file mode 100644
index 000..636dea4
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -0,0 +1,302 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_H__
+#define __L3FWD_EM_HLM_H__
+
+#include "l3fwd_sse.h"
+#include "l3fwd_em_hlm_sse.h"
+
+static inline __attribute__((always_inline)) void
+em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
+   uint8_t portid, uint16_t dst_port[8])
+{
+   int32_t ret[8];
+   union ipv4_5tuple_host key[8];
+
+   get_ipv4_5tuple(m[0], mask0.x, &key[0]);
+   get_ipv4_5tuple(m[1], mask0.x, &key[1]);
+   get_ipv4_5tuple(m[2], mask0.x, &key[2]);
+   get_ipv4_5tuple(m[3], mask0.x, &key[3]);
+   get_ipv4_5tuple(m[4], mask0.x, &key[4]);
+   get_ipv4_5tuple(m[5], mask0.x, &key[5]);
+   get_ipv4_5tuple(m[6], mask0.x, &key[6]);
+   get_ipv4_5tuple(m[7], mask0.x, &key[7]);
+
+   const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
+   &key[4], &key[5], &key[6], &key[7]};
+
+   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
+
+   dst_port[0] = (uint8_t) ((ret[0] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[0]]);
+   dst_port[1] = (uint8_t) ((ret[1] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[1]]);
+   dst_port[2] = (uint8_t) ((ret[2] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[2]]);
+   dst_port[3] = (uint8_t) ((ret[3] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[3]]);
+   dst_port[4] = (uint8_t) ((ret[4] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[4]]);
+   dst_port[5] = (uint8_t) ((ret[5] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[5]]);
+   dst_port[6] = (uint8_t) ((ret[6] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[6]]);
+   dst_port[7] = (uint8_t) ((ret[7] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[7]]);
+
+   if (dst_port[0] >= RTE_MAX_ETHPORTS ||
+   (enabled_port_mask & 1 << dst_port[0]) == 0)
+   dst_port[0] = portid;
+
+   if (dst_port[

[dpdk-dev] [PATCH v3 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd

2017-05-11 Thread Jianbo Liu
Signed-off-by: Jianbo Liu 

Some common code can be used by other ARCHs, move to l3fwd_lpm.c
---
 examples/l3fwd/l3fwd_lpm.c | 83 ++
 examples/l3fwd/l3fwd_lpm.h | 26 +
 examples/l3fwd/l3fwd_lpm_sse.h | 66 -
 3 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index f621269..fc554fc 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -104,6 +104,89 @@ struct ipv6_l3fwd_lpm_route {
 struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
 
+static inline uint16_t
+lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
+{
+   uint32_t next_hop;
+   struct rte_lpm *ipv4_l3fwd_lookup_struct =
+   (struct rte_lpm *)lookup_struct;
+
+   return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
+   rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
+   &next_hop) == 0) ? next_hop : portid);
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
+{
+   uint32_t next_hop;
+   struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
+   (struct rte_lpm6 *)lookup_struct;
+
+   return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
+   ((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
+   &next_hop) == 0) ?  next_hop : portid);
+}
+
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
+   uint8_t portid)
+{
+   struct ipv6_hdr *ipv6_hdr;
+   struct ipv4_hdr *ipv4_hdr;
+   struct ether_hdr *eth_hdr;
+
+   if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+
+   return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+qconf->ipv4_lookup_struct);
+   } else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+   return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+qconf->ipv6_lookup_struct);
+   }
+
+   return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static inline __attribute__((always_inline)) uint16_t
+lpm_get_dst_port_with_ipv4(const struct lcore_conf *qconf, struct rte_mbuf 
*pkt,
+   uint32_t dst_ipv4, uint8_t portid)
+{
+   uint32_t next_hop;
+   struct ipv6_hdr *ipv6_hdr;
+   struct ether_hdr *eth_hdr;
+
+   if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+   return (uint16_t) ((rte_lpm_lookup(qconf->ipv4_lookup_struct,
+  dst_ipv4, &next_hop) == 0)
+  ? next_hop : portid);
+
+   } else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+   ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + 1);
+
+   return (uint16_t) ((rte_lpm6_lookup(qconf->ipv6_lookup_struct,
+   ipv6_hdr->dst_addr, &next_hop) == 0)
+   ? next_hop : portid);
+
+   }
+
+   return portid;
+}
+
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
 #else
diff --git a/examples/l3fwd/l3fwd_lpm.h b/examples/l3fwd/l3fwd_lpm.h
index 258a82f..4865d90 100644
--- a/examples/l3fwd/l3fwd_lpm.h
+++ b/examples/l3fwd/l3fwd_lpm.h
@@ -34,37 +34,13 @@
 #ifndef __L3FWD_LPM_H__
 #define __L3FWD_LPM_H__
 
-static inline uint8_t
-lpm_get_ipv4_dst_port(void *ipv4_hdr,  uint8_t portid, void *lookup_struct)
-{
-   uint32_t next_hop;
-   struct rte_lpm *ipv4_l3fwd_lookup_struct =
-   (struct rte_lpm *)lookup_struct;
-
-   return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
-   rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
-   &next_hop) == 0) ? next_hop : portid);
-}
-
-static inline uint8_t
-lpm_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
-{
-   uint32_t next_hop;
-   struct rte_lpm6 *ipv6_l3fwd_lookup_struct =
-   (struct rte_lpm6 *)lookup_struct;
-
-   return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
-   ((struct ipv6_hdr *)ipv6_hdr)->dst_addr,
-   &next_hop) == 0) ?  next_hop : portid);
-}
-
 static inline __attribute__((a

[dpdk-dev] [PATCH v3 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h

2017-05-11 Thread Jianbo Liu
The l3fwd_em_sse.h is enabled by NO_HASH_MULTI_LOOKUP.
Renaming it because it's only for sequential hash lookup,
and doesn't include any x86 SSE instructions.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c| 2 +-
 examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (100%)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 939a16d..ba844b2 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -330,7 +330,7 @@ struct ipv6_l3fwd_em_route {
 
 #if defined(__SSE4_1__)
 #if defined(NO_HASH_MULTI_LOOKUP)
-#include "l3fwd_em_sse.h"
+#include "l3fwd_em_sequential.h"
 #else
 #include "l3fwd_em_hlm.h"
 #endif
diff --git a/examples/l3fwd/l3fwd_em_sse.h 
b/examples/l3fwd/l3fwd_em_sequential.h
similarity index 100%
rename from examples/l3fwd/l3fwd_em_sse.h
rename to examples/l3fwd/l3fwd_em_sequential.h
-- 
1.8.3.1



[dpdk-dev] [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform

2017-05-11 Thread Jianbo Liu
v3:
  - remove unnecessary prefetch for rte_mbuf
  - fix typo in git log
  - Ashwin's suggestions for performance on ThunderX

v2:
  - change name of l3fwd_em_sse.h to l3fwd_em_sequential.h
  - add the times of hash multi-lookup for different Archs
  - performance tuning on ThunderX: prefetching, set NO_HASH_MULTI_LOOKUP ...

Jianbo Liu (7):
  examples/l3fwd: extract arch independent code from multi hash lookup
  examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h
  examples/l3fwd: extract common code from multi packet send
  examples/l3fwd: rearrange the code for lpm_l3fwd
  examples/l3fwd: add neon support for l3fwd
  examples/l3fwd: add the times of hash multi-lookup for different Archs
  examples/l3fwd: change the guard macro name for header file

 examples/l3fwd/l3fwd_common.h  | 293 +
 examples/l3fwd/l3fwd_em.c  |   8 +-
 examples/l3fwd/l3fwd_em_hlm.h  | 218 +++
 examples/l3fwd/l3fwd_em_hlm_neon.h |  74 ++
 examples/l3fwd/l3fwd_em_hlm_sse.h  | 280 +---
 .../{l3fwd_em_sse.h => l3fwd_em_sequential.h}  |  24 +-
 examples/l3fwd/l3fwd_lpm.c |  87 +-
 examples/l3fwd/l3fwd_lpm.h |  26 +-
 examples/l3fwd/l3fwd_lpm_neon.h| 193 ++
 examples/l3fwd/l3fwd_lpm_sse.h |  66 -
 examples/l3fwd/l3fwd_neon.h| 259 ++
 examples/l3fwd/l3fwd_sse.h | 255 +-
 12 files changed, 1157 insertions(+), 626 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_common.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm.h
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 rename examples/l3fwd/{l3fwd_em_sse.h => l3fwd_em_sequential.h} (88%)
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

-- 
1.8.3.1



[dpdk-dev] [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd

2017-05-11 Thread Jianbo Liu
Use ARM NEON intrinsics to accelerate l3 forwarding.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em.c|   4 +-
 examples/l3fwd/l3fwd_em_hlm.h|  17 ++-
 examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++
 examples/l3fwd/l3fwd_em_sequential.h |  18 ++-
 examples/l3fwd/l3fwd_lpm.c   |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h  | 193 ++
 examples/l3fwd/l3fwd_neon.h  | 259 +++
 7 files changed, 563 insertions(+), 6 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index ba844b2..da96cfd 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sequential.h"
 #else
@@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
if (nb_rx == 0)
continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
l3fwd_em_send_packets(nb_rx, pkts_burst,
portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..b9163e3 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
@@ -238,7 +243,7 @@ static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
uint8_t portid, struct lcore_conf *qconf)
 {
-   int32_t j;
+   int32_t i, j, pos;
uint16_t dst_port[MAX_PKT_BURST];
 
/*
@@ -247,6 +252,11 @@ static inline __attribute__((always_inline)) uint16_t
 */
int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
 
+   for (j = 0; j < 8 && j < nb_rx; j++) {
+   rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+  struct ether_hdr *) + 1);
+   }
+
for (j = 0; j < n; j += 8) {
 
uint32_t pkt_type =
@@ -263,6 +273,11 @@ static inline __attribute__((always_inline)) uint16_t
uint32_t tcp_or_udp = pkt_type &
(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
+   for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+   rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+  struct ether_hdr *) + 1);
+   }
+
if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h 
b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL

[dpdk-dev] [PATCH v3 7/7] examples/l3fwd: change the guard macro name for header file

2017-05-11 Thread Jianbo Liu
As l3fwd_em_sse.h is renamed to l3fwd_em_sequential.h, change the macro
to __L3FWD_EM_SEQUENTIAL_H__ to maintain consistency.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em_sequential.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_sequential.h 
b/examples/l3fwd/l3fwd_em_sequential.h
index 2b3ec16..c7d477d 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -31,8 +31,8 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __L3FWD_EM_SSE_H__
-#define __L3FWD_EM_SSE_H__
+#ifndef __L3FWD_EM_SEQUENTIAL_H__
+#define __L3FWD_EM_SEQUENTIAL_H__
 
 /**
  * @file
@@ -123,4 +123,4 @@ static inline __attribute__((always_inline)) uint16_t
 
send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
-#endif /* __L3FWD_EM_SSE_H__ */
+#endif /* __L3FWD_EM_SEQUENTIAL_H__ */
-- 
1.8.3.1



[dpdk-dev] [PATCH v3 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs

2017-05-11 Thread Jianbo Liu
Add a new macro to define how many hash lookups are done at one time; this
makes the code more concise.

Signed-off-by: Jianbo Liu 
---
 examples/l3fwd/l3fwd_em_hlm.h | 241 +-
 1 file changed, 71 insertions(+), 170 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index b9163e3..098b396 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -43,148 +43,65 @@
 #include "l3fwd_em_hlm_neon.h"
 #endif
 
+#ifdef RTE_ARCH_ARM64
+#define EM_HASH_LOOKUP_COUNT 16
+#else
+#define EM_HASH_LOOKUP_COUNT 8
+#endif
+
+
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-   uint8_t portid, uint16_t dst_port[8])
+em_get_dst_port_ipv4xN(struct lcore_conf *qconf, struct rte_mbuf *m[],
+   uint8_t portid, uint16_t dst_port[])
 {
-   int32_t ret[8];
-   union ipv4_5tuple_host key[8];
-
-   get_ipv4_5tuple(m[0], mask0.x, &key[0]);
-   get_ipv4_5tuple(m[1], mask0.x, &key[1]);
-   get_ipv4_5tuple(m[2], mask0.x, &key[2]);
-   get_ipv4_5tuple(m[3], mask0.x, &key[3]);
-   get_ipv4_5tuple(m[4], mask0.x, &key[4]);
-   get_ipv4_5tuple(m[5], mask0.x, &key[5]);
-   get_ipv4_5tuple(m[6], mask0.x, &key[6]);
-   get_ipv4_5tuple(m[7], mask0.x, &key[7]);
-
-   const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
-   &key[4], &key[5], &key[6], &key[7]};
-
-   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0], 8, ret);
-
-   dst_port[0] = (uint8_t) ((ret[0] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[0]]);
-   dst_port[1] = (uint8_t) ((ret[1] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[1]]);
-   dst_port[2] = (uint8_t) ((ret[2] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[2]]);
-   dst_port[3] = (uint8_t) ((ret[3] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[3]]);
-   dst_port[4] = (uint8_t) ((ret[4] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[4]]);
-   dst_port[5] = (uint8_t) ((ret[5] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[5]]);
-   dst_port[6] = (uint8_t) ((ret[6] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[6]]);
-   dst_port[7] = (uint8_t) ((ret[7] < 0) ?
-   portid : ipv4_l3fwd_out_if[ret[7]]);
-
-   if (dst_port[0] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[0]) == 0)
-   dst_port[0] = portid;
-
-   if (dst_port[1] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[1]) == 0)
-   dst_port[1] = portid;
-
-   if (dst_port[2] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[2]) == 0)
-   dst_port[2] = portid;
-
-   if (dst_port[3] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[3]) == 0)
-   dst_port[3] = portid;
-
-   if (dst_port[4] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[4]) == 0)
-   dst_port[4] = portid;
-
-   if (dst_port[5] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[5]) == 0)
-   dst_port[5] = portid;
-
-   if (dst_port[6] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[6]) == 0)
-   dst_port[6] = portid;
-
-   if (dst_port[7] >= RTE_MAX_ETHPORTS ||
-   (enabled_port_mask & 1 << dst_port[7]) == 0)
-   dst_port[7] = portid;
+   int i;
+   int32_t ret[EM_HASH_LOOKUP_COUNT];
+   union ipv4_5tuple_host key[EM_HASH_LOOKUP_COUNT];
+   const void *key_array[EM_HASH_LOOKUP_COUNT];
+
+   for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+   get_ipv4_5tuple(m[i], mask0.x, &key[i]);
+   key_array[i] = &key[i];
+   }
+
+   rte_hash_lookup_bulk(qconf->ipv4_lookup_struct, &key_array[0],
+EM_HASH_LOOKUP_COUNT, ret);
+
+   for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
+   dst_port[i] = (uint8_t) ((ret[i] < 0) ?
+   portid : ipv4_l3fwd_out_if[ret[i]]);
 
+   if (dst_port[i] >= RTE_MAX_ETHPORTS ||
+   (enabled_port_mask & 1 << dst_port[i]) == 0)
+   dst_port[i] = portid;
+   }
 }
 
 static inline __attribute__((always_inline)) void
-em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-   uint8_t portid, uint16_t dst_port[8])
+

  1   2   3   >