Re: [dpdk-dev] [PATCH v2] net/virtio-user: check negotiated features before set

2018-08-28 Thread Tiwei Bie
On Mon, Aug 27, 2018 at 02:37:24PM -0400, eric zhang wrote:
> This patch checks negotiated features to see if necessary to offload
> before set the tap device offload capabilities. It also checks if kernel
> support the TUNSETOFFLOAD operation.
> 
> Signed-off-by: eric zhang 
> 
> ---
> v2:
> * don't return failure when failed to set offload to tap
> * check if offloads available when handling VHOST_GET_FEATURES
> ---
>  drivers/net/virtio/virtio_user/vhost_kernel.c |  8 ++--
>  drivers/net/virtio/virtio_user/vhost_kernel_tap.c | 55 
> +--
>  drivers/net/virtio/virtio_user/vhost_kernel_tap.h |  2 +-
>  3 files changed, 47 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c 
> b/drivers/net/virtio/virtio_user/vhost_kernel.c
> index dd24b6b..5c39f26 100644
> --- a/drivers/net/virtio/virtio_user/vhost_kernel.c
> +++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
> @@ -278,9 +278,11 @@ struct vhost_memory_kernel {
>   if (!ret && req_kernel == VHOST_GET_FEATURES) {
>   /* with tap as the backend, all these features are supported
>* but not claimed by vhost-net, so we add them back when
> -  * reporting to upper layer.
> +  * reporting to upper layer. For guest offloads we check if
> +  * they are available in the negotiated features.
>*/
> - *((uint64_t *)arg) |= VHOST_KERNEL_GUEST_OFFLOADS_MASK;
> + *((uint64_t *)arg) |=
> + (dev->features & VHOST_KERNEL_GUEST_OFFLOADS_MASK);

VHOST_GET_FEATURES returns the features supported
by backend instead of the negotiated features. We
need to check whether tap support vnet_hdr etc to
know whether these features are available.

>   *((uint64_t *)arg) |= VHOST_KERNEL_HOST_OFFLOADS_MASK;
>  
>   /* vhost_kernel will not declare this feature, but it does
> @@ -381,7 +383,7 @@ struct vhost_memory_kernel {
>   hdr_size = sizeof(struct virtio_net_hdr);
>  
>   tapfd = vhost_kernel_open_tap(&dev->ifname, hdr_size, req_mq,
> -  (char *)dev->mac_addr);
> +  (char *)dev->mac_addr, dev->features);
>   if (tapfd < 0) {
>   PMD_DRV_LOG(ERR, "fail to open tap for vhost kernel");
>   return -1;
> diff --git a/drivers/net/virtio/virtio_user/vhost_kernel_tap.c 
> b/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
> index d036428..5e86404 100644
> --- a/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
> +++ b/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
> @@ -45,21 +45,54 @@
>  
>  #include "vhost_kernel_tap.h"
>  #include "../virtio_logs.h"
> +#include "../virtio_pci.h"
> +
> +static int
> +vhost_kernel_tap_set_offload(int fd, uint64_t feature)

s/feature/features/

> +{
> + unsigned int offload = 0;
> +
> + if (feature & (1ULL << VIRTIO_NET_F_GUEST_CSUM))
> + offload |= TUN_F_CSUM;
> + if (feature & (1ULL << VIRTIO_NET_F_GUEST_TSO4))
> + offload |= TUN_F_TSO4;
> + if (feature & (1ULL << VIRTIO_NET_F_GUEST_TSO6))
> + offload |= TUN_F_TSO6;
> + if (feature & ((1ULL << VIRTIO_NET_F_GUEST_TSO4) |
> + (1ULL << VIRTIO_NET_F_GUEST_TSO6)) &&
> + (feature & (1ULL << VIRTIO_NET_F_GUEST_ECN)))
> + offload |= TUN_F_TSO_ECN;
> + if (feature & (1ULL << VIRTIO_NET_F_GUEST_UFO))
> + offload |= TUN_F_UFO;

The features other than CSUM feature depends on the
CSUM feature. Maybe it's better to do what QEMU does,
i.e. announce these offloads in tap only when the
CSUM feature is available:

https://github.com/qemu/qemu/blob/6ad90805383e/net/tap-linux.c#L247-L257

Thanks

> +
> + if (offload != 0) {
> + /* Check if our kernel supports TUNSETOFFLOAD */
> + if (ioctl(fd, TUNSETOFFLOAD, 0) != 0 && errno == EINVAL) {
> + PMD_DRV_LOG(ERR, "Kernel does't support 
> TUNSETOFFLOAD\n");
> + return -ENOTSUP;
> + }
> +
> + if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
> + offload &= ~TUN_F_UFO;
> + if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
> + PMD_DRV_LOG(ERR, "TUNSETOFFLOAD ioctl() failed: 
> %s\n",
> + strerror(errno));
> + return -1;
> + }
> + }
> + }
> +
> + return 0;
> +}
>  
>  int
>  vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq,
> -  const char *mac)
> +  const char *mac, uint64_t features)
>  {
>   unsigned int tap_features;
>   int sndbuf = INT_MAX;
>   struct ifreq ifr;
>   int tapfd;
> - unsigned int offload =
> - TUN_F_CSUM |
> - TUN_F_TSO4 |
> - TUN_F_TSO6 |
> - TUN_F_TSO_ECN

[dpdk-dev] [RFC] net/virtio-user: avoid parsing process mappings

2018-08-28 Thread Tiwei Bie
Recently some memory APIs were introduced to allow users
to get the file descriptors for each memory segments. We
can leverage those APIs to get rid of the /proc magic on
memory table preparation in vhost-user backend.

Signed-off-by: Tiwei Bie 
---
This patch depends on below patch set:
https://patches.dpdk.org/project/dpdk/list/?series=1040

 drivers/net/virtio/virtio_user/vhost_user.c   | 214 --
 .../net/virtio/virtio_user/virtio_user_dev.c  |  19 ++
 2 files changed, 112 insertions(+), 121 deletions(-)

diff --git a/drivers/net/virtio/virtio_user/vhost_user.c 
b/drivers/net/virtio/virtio_user/vhost_user.c
index ef6e43df8..770cfdc9a 100644
--- a/drivers/net/virtio/virtio_user/vhost_user.c
+++ b/drivers/net/virtio/virtio_user/vhost_user.c
@@ -11,6 +11,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 #include "vhost.h"
 #include "virtio_user_dev.h"
 
@@ -121,133 +124,106 @@ vhost_user_read(int fd, struct vhost_user_msg *msg)
return -1;
 }
 
-struct hugepage_file_info {
-   uint64_t addr;/**< virtual addr */
-   size_t   size;/**< the file size */
-   char path[PATH_MAX];  /**< path to backing file */
+struct walk_arg {
+   struct vhost_memory *vm;
+   int *fds;
+   int region_nr;
 };
 
-/* Two possible options:
- * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file
- * array. This is simple but cannot be used in secondary process because
- * secondary process will close and munmap that file.
- * 2. Match HUGEFILE_FMT to find hugepage files directly.
- *
- * We choose option 2.
- */
 static int
-get_hugepage_file_info(struct hugepage_file_info huges[], int max)
+update_memory_region(const struct rte_memseg_list *msl __rte_unused,
+   const struct rte_memseg *ms, void *arg)
 {
-   int idx, k, exist;
-   FILE *f;
-   char buf[BUFSIZ], *tmp, *tail;
-   char *str_underline, *str_start;
-   int huge_index;
-   uint64_t v_start, v_end;
-   struct stat stats;
-
-   f = fopen("/proc/self/maps", "r");
-   if (!f) {
-   PMD_DRV_LOG(ERR, "cannot open /proc/self/maps");
-   return -1;
-   }
-
-   idx = 0;
-   while (fgets(buf, sizeof(buf), f) != NULL) {
-   if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
-   PMD_DRV_LOG(ERR, "Failed to parse address");
-   goto error;
-   }
-
-   tmp = strchr(buf, ' ') + 1; /** skip address */
-   tmp = strchr(tmp, ' ') + 1; /** skip perm */
-   tmp = strchr(tmp, ' ') + 1; /** skip offset */
-   tmp = strchr(tmp, ' ') + 1; /** skip dev */
-   tmp = strchr(tmp, ' ') + 1; /** skip inode */
-   while (*tmp == ' ') /** skip spaces */
-   tmp++;
-   tail = strrchr(tmp, '\n');  /** remove newline if exists */
-   if (tail)
-   *tail = '\0';
-
-   /* Match HUGEFILE_FMT, aka "%s/%smap_%d",
-* which is defined in eal_filesystem.h
-*/
-   str_underline = strrchr(tmp, '_');
-   if (!str_underline)
-   continue;
-
-   str_start = str_underline - strlen("map");
-   if (str_start < tmp)
-   continue;
-
-   if (sscanf(str_start, "map_%d", &huge_index) != 1)
-   continue;
-
-   /* skip duplicated file which is mapped to different regions */
-   for (k = 0, exist = -1; k < idx; ++k) {
-   if (!strcmp(huges[k].path, tmp)) {
-   exist = k;
-   break;
-   }
-   }
-   if (exist >= 0)
-   continue;
-
-   if (idx >= max) {
-   PMD_DRV_LOG(ERR, "Exceed maximum of %d", max);
-   goto error;
-   }
-
-   huges[idx].addr = v_start;
-   huges[idx].size = v_end - v_start; /* To be corrected later */
-   snprintf(huges[idx].path, PATH_MAX, "%s", tmp);
-   idx++;
-   }
-
-   /* correct the size for files who have many regions */
-   for (k = 0; k < idx; ++k) {
-   if (stat(huges[k].path, &stats) < 0) {
-   PMD_DRV_LOG(ERR, "Failed to stat %s, %s\n",
-   huges[k].path, strerror(errno));
-   continue;
-   }
-   huges[k].size = stats.st_size;
-   PMD_DRV_LOG(INFO, "file %s, size %zx\n",
-   huges[k].path, huges[k].size);
-   }
-
-   fclose(f);
-   return idx;
-
-error:
-   fclose(f);
-   return -1;
-}
-
-static int
-prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
-{
-   int i, num;
-   

Re: [dpdk-dev] [PATCH v2] hash table: add an iterator over conflicting entries

2018-08-28 Thread Michel Machado

On 08/26/2018 11:12 PM, Honnappa Nagarahalli wrote:

On 08/23/2018 08:33 PM, Wang, Yipeng1 wrote:

I think with Honnappa suggested "uint32_t* next", we may need a little
bit tricks to make it work with the extra linked list.
The performance may not be optimal though comparing to your original approach.
Is this important to your use case?


 It is. We are developing a DDoS protection system, and have chosen DPDK 
because it was the fastest framework in the evaluations we considered. We need 
to find the conflicting entries when a critical flow table of our system is 
overloaded due to an ongoing attack, so the more efficient we can evaluate the 
merits of an incoming flow against the conflicting flows already in the table, 
the higher the chances we find the flows that should be in the flow table.

 We've compromised with Honnappa under the understanding that once the 
underlying algorithm changes, there would be a review of the interface since 
even rte_hash_iterate() may be affected. I still think that the v2 we proposed 
is the best approach here because it isolates the interface from the underlying 
algorithm.

My only concern was to do with keeping the interfaces across APIs consistent. I 
am fine with changing 'uint32_t *next' as long as we change 'rte_hash_iterate' 
API as well.


   We'll patch rte_hash_iterate() as well in v3.

[ ]'s
Michel Machado


Re: [dpdk-dev] [RFC] net/virtio-user: avoid parsing process mappings

2018-08-28 Thread Burakov, Anatoly

On 28-Aug-18 8:53 AM, Tiwei Bie wrote:

Recently some memory APIs were introduced to allow users
to get the file descriptors for each memory segments. We
can leverage those APIs to get rid of the /proc magic on
memory table preparation in vhost-user backend.

Signed-off-by: Tiwei Bie 
---
This patch depends on below patch set:
https://patches.dpdk.org/project/dpdk/list/?series=1040


Great to see this done so quickly!



  drivers/net/virtio/virtio_user/vhost_user.c   | 214 --
  .../net/virtio/virtio_user/virtio_user_dev.c  |  19 ++
  2 files changed, 112 insertions(+), 121 deletions(-)

diff --git a/drivers/net/virtio/virtio_user/vhost_user.c 
b/drivers/net/virtio/virtio_user/vhost_user.c
index ef6e43df8..770cfdc9a 100644
--- a/drivers/net/virtio/virtio_user/vhost_user.c
+++ b/drivers/net/virtio/virtio_user/vhost_user.c
@@ -11,6 +11,9 @@
  #include 
  #include 
  
+#include 

+#include 
+
  #include "vhost.h"
  #include "virtio_user_dev.h"
  
@@ -121,133 +124,106 @@ vhost_user_read(int fd, struct vhost_user_msg *msg)

return -1;
  }
  
-struct hugepage_file_info {

-   uint64_t addr;/**< virtual addr */
-   size_t   size;/**< the file size */
-   char path[PATH_MAX];  /**< path to backing file */
+struct walk_arg {
+   struct vhost_memory *vm;
+   int *fds;
+   int region_nr;
  };
  
-/* Two possible options:

- * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file
- * array. This is simple but cannot be used in secondary process because
- * secondary process will close and munmap that file.
- * 2. Match HUGEFILE_FMT to find hugepage files directly.
- *
- * We choose option 2.
- */
  static int
-get_hugepage_file_info(struct hugepage_file_info huges[], int max)
+update_memory_region(const struct rte_memseg_list *msl __rte_unused,
+   const struct rte_memseg *ms, void *arg)
  {
-   int idx, k, exist;
-   FILE *f;
-   char buf[BUFSIZ], *tmp, *tail;
-   char *str_underline, *str_start;
-   int huge_index;
-   uint64_t v_start, v_end;
-   struct stat stats;
-
-   f = fopen("/proc/self/maps", "r");
-   if (!f) {
-   PMD_DRV_LOG(ERR, "cannot open /proc/self/maps");
-   return -1;
-   }
-
-   idx = 0;
-   while (fgets(buf, sizeof(buf), f) != NULL) {
-   if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
-   PMD_DRV_LOG(ERR, "Failed to parse address");
-   goto error;
-   }
-
-   tmp = strchr(buf, ' ') + 1; /** skip address */
-   tmp = strchr(tmp, ' ') + 1; /** skip perm */
-   tmp = strchr(tmp, ' ') + 1; /** skip offset */
-   tmp = strchr(tmp, ' ') + 1; /** skip dev */
-   tmp = strchr(tmp, ' ') + 1; /** skip inode */
-   while (*tmp == ' ') /** skip spaces */
-   tmp++;
-   tail = strrchr(tmp, '\n');  /** remove newline if exists */
-   if (tail)
-   *tail = '\0';
-
-   /* Match HUGEFILE_FMT, aka "%s/%smap_%d",
-* which is defined in eal_filesystem.h
-*/
-   str_underline = strrchr(tmp, '_');
-   if (!str_underline)
-   continue;
-
-   str_start = str_underline - strlen("map");
-   if (str_start < tmp)
-   continue;
-
-   if (sscanf(str_start, "map_%d", &huge_index) != 1)
-   continue;
-
-   /* skip duplicated file which is mapped to different regions */
-   for (k = 0, exist = -1; k < idx; ++k) {
-   if (!strcmp(huges[k].path, tmp)) {
-   exist = k;
-   break;
-   }
-   }
-   if (exist >= 0)
-   continue;
-
-   if (idx >= max) {
-   PMD_DRV_LOG(ERR, "Exceed maximum of %d", max);
-   goto error;
-   }
-
-   huges[idx].addr = v_start;
-   huges[idx].size = v_end - v_start; /* To be corrected later */
-   snprintf(huges[idx].path, PATH_MAX, "%s", tmp);
-   idx++;
-   }
-
-   /* correct the size for files who have many regions */
-   for (k = 0; k < idx; ++k) {
-   if (stat(huges[k].path, &stats) < 0) {
-   PMD_DRV_LOG(ERR, "Failed to stat %s, %s\n",
-   huges[k].path, strerror(errno));
-   continue;
-   }
-   huges[k].size = stats.st_size;
-   PMD_DRV_LOG(INFO, "file %s, size %zx\n",
-   huges[k].path, huges[k].size);
-   }
-
-   fclose(f);
-   return idx;
-
-error:
-   fclose(f);
-   return -1;
-}
-
-static in

Re: [dpdk-dev] [RFC 0/9] Modularize and enhance DPDK Python scripts

2018-08-28 Thread Burakov, Anatoly

On 14-Aug-18 11:11 AM, Burakov, Anatoly wrote:

On 25-Jun-18 4:59 PM, Anatoly Burakov wrote:

This patchset attempts to create a library out of Python scripts that
come with DPDK, with a goal of enabling external tools to get the same
information about the system DPDK has, and perhaps configure DPDK.

Potential applications include:

* Better setup.sh script (it's long overdue, and you know it!)
* Easier development of better tools for developers (see hugepage-info
   example)
* Easier gathering of DPDK-centric system information, has potential
   applications in troubleshooting tools
* Reduce code duplication for external tools seeking to use the same
   functionality (bind-unbind, cpu layout, etc)
* Add cross-platform support for our scripts (see cpu-layout example
   now working on FreeBSD)

There are a few things to mention. First of all, it's an RFC, so the
fact that it's unfinished and maybe awkward comes with the territory.
I am also aware of the fact that it's a Python library, that it's
outside the scope of DPDK and that it's somewhat a Not-Invented-Here
kind of proposition where there are a lot of externally available
(and arguably much better designed and implemented) tools that do the
same thing.

So the first question i would like to ask is, is the community at all
interested in something like this? Does it have to be part of DPDK
repository? Can it be maintained in a separate repository? How do we
handle updates and dependencies?

I should also mention that it is *not* intended to be a replacement
for udev or any other method of device binding - if anything, it's
the opposite, in that it takes the whole issue out of the question
and thus would make switching to udev or any other device binding
easier since both internal and external tools can utilize the same
Python API.



I would like to draw attention to this RFC again :)



Ping?

--
Thanks,
Anatoly


[dpdk-dev] [PATCH] pipeline: add symmetric crypto to table action

2018-08-28 Thread Zhang, Roy Fan
This patch adds the symmetric crypto action support to pipeline
library. The symmetric crypto action works as the shim layer
between pipeline and DPDK cryptodev and is able to interact with
cryptodev with the control path requests such as session
creation/deletion and data path work to assemble the crypto
operations for received packets.

Change-Id: Id0b547bb10f9e8814b08f5df2343337daca0ae92
Signed-off-by: Zhang, Roy Fan 
---
 lib/librte_pipeline/Makefile   |   2 +-
 lib/librte_pipeline/meson.build|   2 +-
 lib/librte_pipeline/rte_table_action.c | 466 +
 lib/librte_pipeline/rte_table_action.h |  70 +
 4 files changed, 538 insertions(+), 2 deletions(-)

diff --git a/lib/librte_pipeline/Makefile b/lib/librte_pipeline/Makefile
index 84afe98cb..cf265503f 100644
--- a/lib/librte_pipeline/Makefile
+++ b/lib/librte_pipeline/Makefile
@@ -12,7 +12,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_table
-LDLIBS += -lrte_port -lrte_meter -lrte_sched
+LDLIBS += -lrte_port -lrte_meter -lrte_sched -lrte_cryptodev
 
 EXPORT_MAP := rte_pipeline_version.map
 
diff --git a/lib/librte_pipeline/meson.build b/lib/librte_pipeline/meson.build
index dc16ab42f..04e5f5179 100644
--- a/lib/librte_pipeline/meson.build
+++ b/lib/librte_pipeline/meson.build
@@ -5,4 +5,4 @@ version = 3
 allow_experimental_apis = true
 sources = files('rte_pipeline.c', 'rte_port_in_action.c', 'rte_table_action.c')
 headers = files('rte_pipeline.h', 'rte_port_in_action.h', 'rte_table_action.h')
-deps += ['port', 'table', 'meter', 'sched']
+deps += ['port', 'table', 'meter', 'sched', 'cryptodev']
diff --git a/lib/librte_pipeline/rte_table_action.c 
b/lib/librte_pipeline/rte_table_action.c
index 83ffa5ded..a958aa82a 100644
--- a/lib/librte_pipeline/rte_table_action.c
+++ b/lib/librte_pipeline/rte_table_action.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "rte_table_action.h"
 
@@ -1219,6 +1220,428 @@ pkt_work_time(struct time_data *data,
data->time = time;
 }
 
+
+/**
+ * RTE_TABLE_ACTION_CRYPTO
+ */
+
+#define CRYPTO_OP_MASK_CIPHER  0x1
+#define CRYPTO_OP_MASK_AUTH0x2
+#define CRYPTO_OP_MASK_AEAD0x4
+#define CRYPTO_IV_OFFSET   \
+   sizeof(struct rte_crypto_op) + sizeof(struct rte_crypto_sym_op)
+
+struct crypto_op_sym_iv_aad {
+   struct rte_crypto_op op;
+   struct rte_crypto_sym_op sym_op;
+   union {
+   struct {
+   uint8_t cipher_iv[
+   RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX];
+   uint8_t auth_iv[
+   RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX];
+   } cipher_auth;
+
+   struct {
+   uint8_t iv[RTE_TABLE_ACTION_SYM_CRYPTO_IV_SIZE_MAX];
+   uint8_t aad[RTE_TABLE_ACTION_SYM_CRYPTO_AAD_SIZE_MAX];
+   } aead_iv_aad;
+
+   } iv_aad;
+};
+
+struct sym_crypto_data {
+
+   union {
+   struct {
+
+   /** Length of cipher iv. */
+   uint16_t cipher_iv_len;
+
+   /** Offset from start of IP header to the cipher iv. */
+   uint16_t cipher_iv_data_offset;
+
+   /** Length of cipher iv to be updated in the mbuf. */
+   uint16_t cipher_iv_update_len;
+
+   /** Offset from start of IP header to the auth iv. */
+   uint16_t auth_iv_data_offset;
+
+   /** Length of auth iv in the mbuf. */
+   uint16_t auth_iv_len;
+
+   /** Length of auth iv to be updated in the mbuf. */
+   uint16_t auth_iv_update_len;
+
+   } cipher_auth;
+   struct {
+
+   /** Length of iv. */
+   uint16_t iv_len;
+
+   /** Offset from start of IP header to the aead iv. */
+   uint16_t iv_data_offset;
+
+   /** Length of iv to be updated in the mbuf. */
+   uint16_t iv_update_len;
+
+   /** Length of aad */
+   uint16_t aad_len;
+
+   /** Offset from start of IP header to the aad. */
+   uint16_t aad_data_offset;
+
+   /** Length of aad to updated in the mbuf. */
+   uint16_t aad_update_len;
+
+   } aead;
+   };
+
+   /** Offset from start of IP header to the data. */
+   uint16_t data_offset;
+
+   /** Digest length. */
+   uint16_t digest_len;
+
+   /** block size */
+   uint16_t block_size;
+
+   /** Mask of crypto operation */
+   uint16_t op_mask;
+
+   /** Session pointer. */
+   struct rte_cryptodev_sym_sess

[dpdk-dev] [RFC v2 1/1] eventdev: add distributed software (DSW) event device

2018-08-28 Thread Mattias Rönnblom
The distributed software eventdev is a parallel implementation of the
eventdev API, which distributes the task of scheduling events among
all the eventdev ports and the lcore threads using them.

Signed-off-by: Mattias Rönnblom 
---
 config/common_base|5 +
 doc/guides/eventdevs/dsw.rst  |   97 ++
 doc/guides/eventdevs/index.rst|1 +
 drivers/event/Makefile|1 +
 drivers/event/dsw/Makefile|   28 +
 drivers/event/dsw/dsw_evdev.c |  432 ++
 drivers/event/dsw/dsw_evdev.h |  288 
 drivers/event/dsw/dsw_event.c | 1261 +
 drivers/event/dsw/dsw_sort.h  |   48 +
 drivers/event/dsw/dsw_xstats.c|  285 
 drivers/event/dsw/meson.build |8 +
 .../event/dsw/rte_pmd_dsw_event_version.map   |3 +
 drivers/event/meson.build |2 +-
 mk/rte.app.mk |1 +
 14 files changed, 2459 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/eventdevs/dsw.rst
 create mode 100644 drivers/event/dsw/Makefile
 create mode 100644 drivers/event/dsw/dsw_evdev.c
 create mode 100644 drivers/event/dsw/dsw_evdev.h
 create mode 100644 drivers/event/dsw/dsw_event.c
 create mode 100644 drivers/event/dsw/dsw_sort.h
 create mode 100644 drivers/event/dsw/dsw_xstats.c
 create mode 100644 drivers/event/dsw/meson.build
 create mode 100644 drivers/event/dsw/rte_pmd_dsw_event_version.map

diff --git a/config/common_base b/config/common_base
index 4bcbaf923..c43f5139d 100644
--- a/config/common_base
+++ b/config/common_base
@@ -614,6 +614,11 @@ CONFIG_RTE_LIBRTE_PMD_SKELETON_EVENTDEV_DEBUG=n
 #
 CONFIG_RTE_LIBRTE_PMD_SW_EVENTDEV=y
 
+#
+# Compile PMD for distributed software event device
+#
+CONFIG_RTE_LIBRTE_PMD_DSW_EVENTDEV=y
+
 #
 # Compile PMD for octeontx sso event device
 #
diff --git a/doc/guides/eventdevs/dsw.rst b/doc/guides/eventdevs/dsw.rst
new file mode 100644
index 0..de41ae9d3
--- /dev/null
+++ b/doc/guides/eventdevs/dsw.rst
@@ -0,0 +1,97 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+Copyright(c) 2017 Intel Corporation.
+Copyright(c) 2018 Ericsson AB
+
+Distributed Software Eventdev Poll Mode Driver
+==
+
+The distributed software eventdev is a parallel implementation of the
+eventdev API, which distributes the task of scheduling events among
+all the eventdev ports and the lcore threads using them.
+
+Features
+
+
+Queues
+ * Atomic
+ * Parallel
+ * Single-Link
+
+Ports
+ * Load balanced (for Atomic, Ordered, Parallel queues)
+ * Single Link (for single-link queues)
+
+Configuration and Options
+-
+
+The distributed software eventdev is a vdev device, and as such can be
+created from the application code, or from the EAL command line:
+
+* Call ``rte_vdev_init("event_dsw0")`` from the application
+
+* Use ``--vdev="event_dsw0"`` in the EAL options, which will call
+  rte_vdev_init() internally
+
+Example:
+
+.. code-block:: console
+
+./your_eventdev_application --vdev="event_dsw0"
+
+Limitations
+---
+
+Unattended Ports
+
+
+The distributed software eventdev uses an internal signaling schema
+between the ports to achieve load balancing. In order for this to
+work, the application must perform enqueue and/or dequeue operations
+on all ports.
+
+Producer-only ports which currently have no events to enqueue should
+periodically call rte_event_enqueue_burst() with a zero-sized burst.
+
+Ports left unattended for longer periods of time will prevent load
+balancing, and also cause traffic interruptions on the flows which
+are in the process of being migrated.
+
+Output Buffering
+
+
+For efficiency reasons, the distributed software eventdev might not
+send enqueued events immediately to the destination port, but instead
+store them in an internal buffer in the source port.
+
+In case no more events are enqueued on a port with buffered events,
+these events will be sent after the application has performed a number
+of enqueue and/or dequeue operations.
+
+For explicit flushing, an application may call
+rte_event_enqueue_burst() with a zero-sized burst.
+
+
+Priorities
+~~
+
+The distributed software eventdev does not support event priorities.
+
+Ordered Queues
+~~
+
+The distributed software eventdev does not support the ordered queue type.
+
+
+"All Types" Queues
+~~
+
+The distributed software eventdev does not support queues of type
+RTE_EVENT_QUEUE_CFG_ALL_TYPES, which allow both atomic, ordered, and
+parallel events on the same queue.
+
+Dynamic Link/Unlink
+~~~
+
+The distributed software eventdev does not support calls to
+rte_event_port_link() or rte_event_port_unlink() after
+rte_event_dev_start() has been called.
diff --git a/doc/guides/eventdevs/index.r

[dpdk-dev] [RFC v2 0/1] A Distributed Software Event Device

2018-08-28 Thread Mattias Rönnblom
v2:
* Added support for Meson builds.
* Eventdev 'xstats' support is now mandatory.
* Added check in dsw_probe() to allow secondary processes.
* rte_event_dev_stop() now runs the flush callback.
* Added documentation.
* Fixed uninitialized-use warning in ‘dsw_port_consider_migration'.
* Removed some dead (#if 0) debugging code.
* Version .map file is bumped to 18.11.
* Header files sorted in alphabetic order, newline after declarations
  and various other coding style-related improvements.

This is the Distributed Software (DSW) event device, which distributes
the task of scheduling events among all the eventdev ports and their
lcore threads.

DSW is primarily designed for atomic-only queues, but also supports
single-link and parallel queues.

(DSW would be more accurately described as 'parallel', but since that
term is used to describe an eventdev queue type, it's referred to as
'distributed', to avoid suggesting it's somehow biased toward parallel
queues.)

Event Scheduling


Internally, DSW hashes an eventdev flow id to a 15-bit "flow
hash". For each eventdev queue, there's a table mapping a flow hash to
an eventdev port. That port is considered the owner of the
flow. Owners are randomly picked at initialization time, among the
ports serving (i.e. are linked to) that queue.

The scheduling of an event to a port is done (by the sender port) at
time of the enqueue operation, and in most cases simply consists of
hashing the flow id and performing a lookup in the destination queue's
table. Each port has an MP/SC event ring to which the events are
enqueued. This means events go directly port-to-port, typically
meaning core-to-core.

Port Load Measurement
=

DSW includes a concept of port load. The event device keeps track of
transitions between "idle" and "busy" (or vice versa) on a per-port
basis, compares this to the wall time passed, and computes to what
extent the port was busy (for a certain interval). A port transitions
to "busy" on a non-zero dequeue, and again back to "idle" at the point
it performs a dequeue operation returning zero events.

Flow Migration
==

Periodically, hidden to the API user and as a part of a normal
enqueue/dequeue operations, a port updates its load estimate, and in
case the load has reached a certain threshold, considers moving one of
its flow to a different, more lightly loaded, port. This process is
called migration.

Migration Strategy
~~

The DSW migration strategy is to move a small, but yet active flow. To
quickly find which are the active flows (w/o resorting to scanning
through the tables and/or keeping per-event counters), each port
maintains a list of the last 128 events it has dequeued. If there are
lightly-loaded enough target ports, it will attempt to migrate one of
those flows, starting with the smallest. The size is estimated by the
number of events seen on that flow, in that small sample of events.

A good migration strategy, based on reasonably good estimates of port
and current flow event rates, is key for proper load balancing in a
DSW-style event device.

Migration Process
~

If the prerequisites are met, and a migration target flow and port is
found, the owning (source) port will initiate the migration
process. For parallel queues it's a very straightforward operation -
simply a table update. For atomic queues, in order to maintain their
semantics, it's a fair bit more elaborate a procedure.

A high-level view the migration process is available[1] in the form a
sequence diagram.

Much simplified, it consist of the source port sending messages to all
ports configured, asking them to "pause" the to-be-migrated flow. Such
ports will flush their output buffers and provide a confirmation back
to the source port.

Each port holds a list of which flows are paused. Upon the enqueue of
an event belonging to a paused flow, it will be accepted into the
machinery, but kept in a paused-events buffer located on the sending
port.

After receiving confirmations from all ports, the source port will
make sure its application-level user has finished processing of all
events related to the migrating flow, update the relevant queue's
table, and forward all unprocessed events (in its input event ring) to
the new target port.

The source port will then send out a request to "unpause" the flow to
all ports. Upon receiving such a request, the port will flush any
buffered (paused) events related to the paused flow, and provide a
confirmation.

All the signaling are done on regular DPDK rings (separate from the
event-carrying rings), and are pulled as a part of normal
enqueue/dequeue operation.

The migrations can be made fairly rapidly (in the range of a couple
hundred us, or even faster), but the algorithm, load measurement and
migration interval parameters must be carefully chosen not to cause
the system to oscillate or otherwise misbehave.

The migration rate is primarily limited by eventdev enqueu

Re: [dpdk-dev] [RFC] net/virtio-user: avoid parsing process mappings

2018-08-28 Thread Tiwei Bie
On Tue, Aug 28, 2018 at 09:12:38AM +0100, Burakov, Anatoly wrote:
> On 28-Aug-18 8:53 AM, Tiwei Bie wrote:
[...]
> >   
> > -   for (i = 0; i < num; ++i) {
> > -   mr = &msg->payload.memory.regions[i];
> > -   mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
> > -   mr->userspace_addr = huges[i].addr;
> > -   mr->memory_size = huges[i].size;
> > -   mr->mmap_offset = 0;
> > -   fds[i] = open(huges[i].path, O_RDWR);
> > +   start_addr = (uint64_t)(uintptr_t)ms->addr;
> > +   end_addr = start_addr + ms->len;
> > +
> > +   /*
> > +* XXX workaround!
> > +*
> > +* For --single-file-segments case, offset should be:
> > +* offset = rte_fbarray_find_idx(&msl->memseg_arr, ms) * msl->page_sz;
> > +*
> > +* But it's not true for non-single-file-segments case.
> > +*
> > +* This is a temporary workaround which assumes the file will
> > +* also be mapped from the beginning in --single-file-segments
> > +* case. It should be replaced when we have a memory API to
> > +* get the offset.
> 
> Yes, this is an unfortunate consequence of having split personalities in 
> memory subsystem. A good solution would be to just deprecate 
> non-single-file segments mode and always use it, but we cannot do that 
> because we support kernel versions that do not support fallocate() on 
> hugetlbfs (and it still leaves legacy mem mode, which effectively 
> behaves like non-single-file segments as far as virtio is concerned).
> 
> How about we add this API in v2 for seg fd patchset?

Sure, thanks!

Besides, I saw another minor issue. When --legacy-mem
and --single-file-segments are both specified, the EAL
init doesn't fail, but fd can't be got successfully.

> 
> > +*/
> > +   offset = 0;
> > +
[...]
> > @@ -109,9 +111,24 @@ is_vhost_user_by_type(const char *path)
> >   int
> >   virtio_user_start_device(struct virtio_user_dev *dev)
> >   {
> > +   struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
> > uint64_t features;
> > int ret;
> >   
> > +   /*
> > +* XXX workaround!
> > +*
> > +* We need to make sure that the locks will be
> > +* taken in the correct order to avoid deadlocks.
> > +*
> > +* Before releasing this lock, this thread should
> > +* not trigger any memory hotplug events.
> > +*
> > +* This is a temporary workaround, and should be
> > +* replaced when we get proper supports from the
> > +* memory subsystem in the future.
> > +*/
> > +   rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
> 
> This feels like it should be a separate patch, because it fixes the 
> issue that exists independently of whether we have segment fd API or not.

Yeah, I'll put it in a separate patch.

> 
> > pthread_mutex_lock(&dev->mutex);
> >   
> > if (is_vhost_user_by_type(dev->path) && dev->vhostfd < 0)
> > @@ -152,10 +169,12 @@ virtio_user_start_device(struct virtio_user_dev *dev)
> >   
> > dev->started = true;
> > pthread_mutex_unlock(&dev->mutex);
> > +   rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> >   
> > return 0;
> >   error:
> > pthread_mutex_unlock(&dev->mutex);
> > +   rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
> > /* TODO: free resource here or caller to check */
> > return -1;
> >   }
> > 
> 
> 
> -- 
> Thanks,
> Anatoly


Re: [dpdk-dev] [dpdk-stable] [PATCH v4] net/bonding: per-slave intermediate rx ring

2018-08-28 Thread Matan Azrad


From: Chas Williams
>On Mon, Aug 27, 2018 at 11:30 AM Matan Azrad  wrote:

>>>Because rings are generally quite efficient.
>>
>>But you are using a ring in addition to regular array management, it must 
>>hurt performance of the bonding PMD
>>(means the bonding itself - not the slaves PMDs which are called from the 
>>bonding)
>
>It adds latency.

And by that hurts the application performance because it takes more CPU time in 
the bonding PMD.

>It increases performance because we spend less CPU time reading from the PMDs.

So, it's hack in the bonding PMD to improve some slaves code performance but 
hurt the bonding code performance,
Over all the performance we gain for those slaves improves the application 
performance only when working with those slaves. 
But may hurt the application performance when working with other slaves.

>  This means we have more CPU to use for
>post processing (i.e. routing).



>>>Bonding is in a middle ground between application and PMD.
>>Yes.
>>>What bonding is doing, may not improve all applications.
>>Yes, but it can be solved using some bonding modes.
>>> If using a ring to buffer the vectorized receive routines, improves your 
>>> particular application,
>>>that's great. 
>>It may be not great and even bad for some other PMDs which are not 
>>vectororized.
>>
>>> However, I don't think I can say that it would help all
>>>applications.  As you point out, there is overhead associated with
>>>a ring.
>>Yes.
>>>Bonding's receive burst isn't especially efficient (in mode 4).
>>
>>Why?
>>
>>It makes a copy of the slaves, has a fair bit of stack usage, 
>>needs to check the slave status, and needs to examine each
>>packet to see if it is a slow protocol packet.  So each
>>packet is essentially read twice.  The fast queue code for mode 4
>>avoids some of this (and probably ignores checking collecting
>>incorrectly).  If you find a slow protocol packet, you need to
>>chop it out of the array with memmove due to overlap.
>
>Agree.
>So to improve the bonding performance you need to optimize the aboves problems.
>There is no connection to the ring.
>
>And as I have described numerous times, these problems
>can't be easily fixed and preserve the existing API.

Sometimes we need to work harder to see a gain for all.
We should not apply a patch because it is easy and show a gain for specific 
scenarios.

>>> Bonding benefits from being able to read as much as possible (within limits 
>>> of
>>>course, large reads would blow out caches) from each slave.
>>
>>The slaves PMDs can benefits in the same way.
>>
>>>It can't return all that data though because applications tend to use the 
>>>burst size that would be efficient for a typical PMD.
>>
>>What is the preferred burst size of the bonding? Maybe the application should 
>>use it when they are using bonding.
>>
>>The preferred burst size for bonding would be the sum of all the
>>slaves ideal read size.  However, that's not likely to be simple
>>since most applications decide early the size for the read/write
>>burst operations.
>> 
>>>An alternative might be to ask bonding applications to simply issue larger 
>>>reads for
>>>certain modes.  That's probably not as easy as it sounds given the
>>>way that the burst length effects multiplexing.
>>
>>Can you explain it more?
>>
>>A longer burst size on one PMD will tend to favor that PMD
>>over others.  It will fill your internal queues with more 
>>of its packets.
>
>Agree, it's about fairness.
> 
>>
>>>Another solution might be just alternatively poll the individual
>>>slaves on each rx burst.  But that means you need to poll at a
>>>faster rate.  Depending on your application, you might not be
>>>able to do that.
>
>>Again, can you be more precise in the above explanation?
>>
>>If the application knows that there are two slaves backing
>>a bonding interface, the application could just read twice
>>from the bonding interface, knowing that the bonding 
>>interface is going to alternate between the slaves.  But
>>this requires the application to know things about the bonding
>>PMD, like the number of slaves.
>
>Why should the application poll twice?
>Poll slave 0, than process it's packets, poll slave 1 than process its 
>packets...
>What is the problem?
>
>Because let's say that each slave is 10G and you are using
>
>link aggregation with two slaves.  If you have tuned your
>
>application on the assumption that a PMD is approximately
>10G, then you are going to be under polling the bonding PMD.
>For the above to work, you need to ensure that the application
>is polling sufficiently to keep up.

But each poll will be shorter, no slaves loops, only one slave call.

>>>  We can avoid this scheduling overhead by just
>>>doing the extra reads in bonding and buffering in a ring.
>>>
>>>Since bonding is going to be buffering everything in a ring,
>>
>>? I don't familiar with it. For now I don't think we need a ring.
>>
>>We have some empirical testing that shows perfo

[dpdk-dev] [PATCH v6 1/2] bus/pci: harmonize and document rte_pci_read_config return value

2018-08-28 Thread Luca Boccassi
On Linux, rte_pci_read_config on success returns the number of read
bytes, but on BSD it returns 0.
Document the return values, and have BSD behave as Linux does.

At least one case (bnx2x PMD) treats 0 as an error, so the change
makes sense also for that.

Signed-off-by: Luca Boccassi 
---
 drivers/bus/pci/bsd/pci.c | 4 +++-
 drivers/bus/pci/rte_bus_pci.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c
index 655b34b7e4..175d83cf1b 100644
--- a/drivers/bus/pci/bsd/pci.c
+++ b/drivers/bus/pci/bsd/pci.c
@@ -439,6 +439,8 @@ int rte_pci_read_config(const struct rte_pci_device *dev,
 {
int fd = -1;
int size;
+   /* Copy Linux implementation's behaviour */
+   const int return_len = len;
struct pci_io pi = {
.pi_sel = {
.pc_domain = dev->addr.domain,
@@ -469,7 +471,7 @@ int rte_pci_read_config(const struct rte_pci_device *dev,
}
close(fd);
 
-   return 0;
+   return return_len;
 
  error:
if (fd >= 0)
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index 0d1955ffe0..df8f64798d 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -219,6 +219,8 @@ void rte_pci_unregister(struct rte_pci_driver *driver);
  *   The length of the data buffer.
  * @param offset
  *   The offset into PCI config space
+ * @return
+ *  Number of bytes read on success, negative on error.
  */
 int rte_pci_read_config(const struct rte_pci_device *device,
void *buf, size_t len, off_t offset);
-- 
2.18.0



[dpdk-dev] [PATCH v6 2/2] net/virtio: fix PCI config err handling

2018-08-28 Thread Luca Boccassi
From: Brian Russell 

In virtio_read_caps and vtpci_msix_detect, rte_pci_read_config returns
the number of bytes read from PCI config or < 0 on error.
If less than the expected number of bytes are read then log the
failure and return rather than carrying on with garbage.

Fixes: 6ba1f63b5ab0 ("virtio: support specification 1.0")

Signed-off-by: Brian Russell 
Signed-off-by: Luca Boccassi 
---
v2: handle additional rte_pci_read_config incomplete reads
v3: do not handle rte_pci_read_config of virtio cap, added in v2,
as it's less clear what the right thing to do there is
v4: do a more robust check - first check what the vendor is, and
skip the cap entirely if it's not what we are looking for.
v5: fetch only 2 flags bytes if the vndr is PCI_CAP_ID_MSIX
v6: fix 32bit build by changing the printf format specifier, fix patch title

 drivers/net/virtio/virtio_pci.c | 65 -
 1 file changed, 48 insertions(+), 17 deletions(-)

diff --git a/drivers/net/virtio/virtio_pci.c b/drivers/net/virtio/virtio_pci.c
index 6bd22e54a6..b6a3c80b4d 100644
--- a/drivers/net/virtio/virtio_pci.c
+++ b/drivers/net/virtio/virtio_pci.c
@@ -567,16 +567,18 @@ virtio_read_caps(struct rte_pci_device *dev, struct 
virtio_hw *hw)
}
 
ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
-   if (ret < 0) {
-   PMD_INIT_LOG(DEBUG, "failed to read pci capability list");
+   if (ret != 1) {
+   PMD_INIT_LOG(DEBUG,
+"failed to read pci capability list, ret %d", ret);
return -1;
}
 
while (pos) {
-   ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
-   if (ret < 0) {
-   PMD_INIT_LOG(ERR,
-   "failed to read pci cap at pos: %x", pos);
+   ret = rte_pci_read_config(dev, &cap, 2, pos);
+   if (ret != 2) {
+   PMD_INIT_LOG(DEBUG,
+"failed to read pci cap at pos: %x ret %d",
+pos, ret);
break;
}
 
@@ -586,7 +588,16 @@ virtio_read_caps(struct rte_pci_device *dev, struct 
virtio_hw *hw)
 * 1st byte is cap ID; 2nd byte is the position of next
 * cap; next two bytes are the flags.
 */
-   uint16_t flags = ((uint16_t *)&cap)[1];
+   uint16_t flags;
+
+   ret = rte_pci_read_config(dev, &flags, sizeof(flags),
+   pos + 2);
+   if (ret != sizeof(flags)) {
+   PMD_INIT_LOG(DEBUG,
+"failed to read pci cap at pos:"
+" %x ret %d", pos + 2, ret);
+   break;
+   }
 
if (flags & PCI_MSIX_ENABLE)
hw->use_msix = VIRTIO_MSIX_ENABLED;
@@ -601,6 +612,14 @@ virtio_read_caps(struct rte_pci_device *dev, struct 
virtio_hw *hw)
goto next;
}
 
+   ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+   if (ret != sizeof(cap)) {
+   PMD_INIT_LOG(DEBUG,
+"failed to read pci cap at pos: %x ret %d",
+pos, ret);
+   break;
+   }
+
PMD_INIT_LOG(DEBUG,
"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u",
pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
@@ -689,25 +708,37 @@ enum virtio_msix_status
 vtpci_msix_detect(struct rte_pci_device *dev)
 {
uint8_t pos;
-   struct virtio_pci_cap cap;
int ret;
 
ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
-   if (ret < 0) {
-   PMD_INIT_LOG(DEBUG, "failed to read pci capability list");
+   if (ret != 1) {
+   PMD_INIT_LOG(DEBUG,
+"failed to read pci capability list, ret %d", ret);
return VIRTIO_MSIX_NONE;
}
 
while (pos) {
-   ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
-   if (ret < 0) {
-   PMD_INIT_LOG(ERR,
-   "failed to read pci cap at pos: %x", pos);
+   uint8_t cap[2];
+
+   ret = rte_pci_read_config(dev, cap, sizeof(cap), pos);
+   if (ret != sizeof(cap)) {
+   PMD_INIT_LOG(DEBUG,
+"failed to read pci cap at pos: %x ret %d",
+pos, ret);
break;
}
 
-   if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
-

Re: [dpdk-dev] [PATCH v5 2/2] virtio: fix PCI config err handling

2018-08-28 Thread Luca Boccassi
On Tue, 2018-08-28 at 14:43 +0800, Tiwei Bie wrote:
> I just noticed the title. It should be "net/virtio: xxx",
> instead of "virtio: xxx".

Fixed

> On Mon, Aug 27, 2018 at 05:52:40PM +0100, Luca Boccassi wrote:
> [...]
> > +   ret = rte_pci_read_config(dev, &flags,
> > sizeof(flags),
> > +   pos + sizeof(cap));
> > +   if (ret != sizeof(flags)) {
> > +   PMD_INIT_LOG(DEBUG,
> > +    "failed to read pci
> > cap at pos:"
> > +    " %lx ret %d", pos +
> > sizeof(cap),
> > +    ret);
> 
> In file included from drivers/net/virtio/virtio_pci.c:15:0:
> drivers/net/virtio/virtio_pci.c: In function ‘vtpci_msix_detect’:
> drivers/net/virtio/virtio_logs.h:13:3: error: format ‘%lx’ expects
> argument of type ‘long unsigned int’, but argument 5 has type
> ‘unsigned int’ [-Werror=format=]
>    "%s(): " fmt "\n", __func__, ##args)
>    ^
> drivers/net/virtio/virtio_pci.c:737:5: note: in expansion of macro
> ‘PMD_INIT_LOG’
>  PMD_INIT_LOG(DEBUG,
>  ^
> cc1: all warnings being treated as errors
> 
> I got above build issues in 32bit build.
> 
> 
> Apart from that,
> 
> Reviewed-by: Tiwei Bie 
> 
> Thanks!

So the 32 and 64 bit builds want opposite things, due to the sizeof
type. I changed it to avoid sizeof and just calculate it like in every
other prints of these functions to avoid the issue.

-- 
Kind regards,
Luca Boccassi


Re: [dpdk-dev] [RFC] ethdev: add generic MAC address rewrite actions

2018-08-28 Thread Andrew Rybchenko

On 08/07/2018 05:20 PM, Jack Min wrote:

There is a need to offload rewrite MAC address for both destination and source
from the matched flow

The proposed actions could make above easily achieved


Signed-off-by: Xiaoyu Min mailto:jack...@mellanox.com>>


The only question I have is which Ethernet header is edited, if there 
are many
Ethernet headers in the case of tunnels. I guess the outermost, but it 
should

be stated in the description.



Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

2018-08-28 Thread Kokkilagadda, Kiran
In this instance there won't be any problem, as until the value of fifo->write 
changes, this loop won't get executed. As of now we didn't see any issue with 
it and for performance reasons, we don't want to keep read barrier.




From: Gavin Hu 
Sent: Monday, August 27, 2018 9:10 PM
To: Ferruh Yigit; Kokkilagadda, Kiran; Jacob, Jerin
Cc: dev@dpdk.org; Honnappa Nagarahalli
Subject: RE: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

External Email

This fix is not complete, kni_fifo_get requires a read fence also, otherwise it 
probably gets stale data on a weak ordering platform.

> -Original Message-
> From: dev  On Behalf Of Ferruh Yigit
> Sent: Monday, August 27, 2018 10:08 PM
> To: Kiran Kumar ;
> jerin.ja...@caviumnetworks.com
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer
> synchronization
>
> On 8/16/2018 10:55 AM, Kiran Kumar wrote:
> > With existing code in kni_fifo_put, rx_q values are not being updated
> > before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> > This is causing the sync issue on other core. So adding a write
> > barrier to make sure the values being synced before updating fifo_write.
> >
> > Fixes: 3fc5ca2f6352 ("kni: initial import")
> >
> > Signed-off-by: Kiran Kumar 
> > Acked-by: Jerin Jacob 
>
> Acked-by: Ferruh Yigit 
IMPORTANT NOTICE: The contents of this email and any attachments are 
confidential and may also be privileged. If you are not the intended recipient, 
please notify the sender immediately and do not disclose the contents to any 
other person, use it for any purpose, or store or copy the information in any 
medium. Thank you.


Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

2018-08-28 Thread Kokkilagadda, Kiran
I need to add the same write barrier change in kernel side kni_fifo_put. I will 
add it and will send v3.



From: Kokkilagadda, Kiran
Sent: Tuesday, August 28, 2018 4:13:59 PM
To: Gavin Hu; Ferruh Yigit; Jacob, Jerin
Cc: dev@dpdk.org; Honnappa Nagarahalli
Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization


In this instance there won't be any problem, as until the value of fifo->write 
changes, this loop won't get executed. As of now we didn't see any issue with 
it and for performance reasons, we don't want to keep read barrier.




From: Gavin Hu 
Sent: Monday, August 27, 2018 9:10 PM
To: Ferruh Yigit; Kokkilagadda, Kiran; Jacob, Jerin
Cc: dev@dpdk.org; Honnappa Nagarahalli
Subject: RE: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

External Email

This fix is not complete, kni_fifo_get requires a read fence also, otherwise it 
probably gets stale data on a weak ordering platform.

> -Original Message-
> From: dev  On Behalf Of Ferruh Yigit
> Sent: Monday, August 27, 2018 10:08 PM
> To: Kiran Kumar ;
> jerin.ja...@caviumnetworks.com
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer
> synchronization
>
> On 8/16/2018 10:55 AM, Kiran Kumar wrote:
> > With existing code in kni_fifo_put, rx_q values are not being updated
> > before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> > This is causing the sync issue on other core. So adding a write
> > barrier to make sure the values being synced before updating fifo_write.
> >
> > Fixes: 3fc5ca2f6352 ("kni: initial import")
> >
> > Signed-off-by: Kiran Kumar 
> > Acked-by: Jerin Jacob 
>
> Acked-by: Ferruh Yigit 
IMPORTANT NOTICE: The contents of this email and any attachments are 
confidential and may also be privileged. If you are not the intended recipient, 
please notify the sender immediately and do not disclose the contents to any 
other person, use it for any purpose, or store or copy the information in any 
medium. Thank you.


Re: [dpdk-dev] [RFC] ethdev: add action to swap source and destination MAC to flow API

2018-08-28 Thread Andrew Rybchenko

On 08/27/2018 03:54 PM, Rahul Lakkireddy wrote:

From: Shagun Agrawal 

This action is useful for offloading loopback mode, where the hardware
will swap source and destination MAC address before looping back the
packet. This action can be used in conjunction with other rewrite
actions to achieve MAC layer transparent NAT where the MAC addresses
are swapped before either the source or destination MAC address
is rewritten and NAT is performed.

Signed-off-by: Shagun Agrawal 
Signed-off-by: Rahul Lakkireddy 
---
  app/test-pmd/cmdline_flow.c |  9 +
  app/test-pmd/config.c   |  1 +
  doc/guides/prog_guide/rte_flow.rst  | 15 +++
  doc/guides/testpmd_app_ug/testpmd_funcs.rst |  2 ++
  lib/librte_ethdev/rte_flow.c|  1 +
  lib/librte_ethdev/rte_flow.h|  7 +++
  6 files changed, 35 insertions(+)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index f9260600e..4b83b55c4 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -243,6 +243,7 @@ enum index {
ACTION_VXLAN_DECAP,
ACTION_NVGRE_ENCAP,
ACTION_NVGRE_DECAP,
+   ACTION_MAC_SWAP,
  };
  
  /** Maximum size for pattern in struct rte_flow_item_raw. */

@@ -816,6 +817,7 @@ static const enum index next_action[] = {
ACTION_VXLAN_DECAP,
ACTION_NVGRE_ENCAP,
ACTION_NVGRE_DECAP,
+   ACTION_MAC_SWAP,
ZERO,
  };
  
@@ -2470,6 +2472,13 @@ static const struct token token_list[] = {

.next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
.call = parse_vc,
},
+   [ACTION_MAC_SWAP] = {
+   .name = "mac_swap",
+   .help = "swap source and destination mac address",
+   .priv = PRIV_ACTION(MAC_SWAP, 0),
+   .next = NEXT(NEXT_ENTRY(ACTION_NEXT)),
+   .call = parse_vc,
+   },
  };
  
  /** Remove and return last entry from argument stack. */

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 14ccd6864..b7393967a 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1153,6 +1153,7 @@ static const struct {
   sizeof(struct rte_flow_action_of_pop_mpls)),
MK_FLOW_ACTION(OF_PUSH_MPLS,
   sizeof(struct rte_flow_action_of_push_mpls)),
+   MK_FLOW_ACTION(MAC_SWAP, 0),
  };
  
  /** Compute storage space needed by action configuration and copy it. */

diff --git a/doc/guides/prog_guide/rte_flow.rst 
b/doc/guides/prog_guide/rte_flow.rst
index b305a72a5..530dbc504 100644
--- a/doc/guides/prog_guide/rte_flow.rst
+++ b/doc/guides/prog_guide/rte_flow.rst
@@ -2076,6 +2076,21 @@ RTE_FLOW_ERROR_TYPE_ACTION error should be returned.
  
  This action modifies the payload of matched flows.
  
+Action: ``MAC_SWAP``

+^
+
+Swap source and destination mac address.
+
+.. _table_rte_flow_action_mac_swap:
+
+.. table:: MAC_SWAP
+
+   +---+
+   | Field |
+   +===+
+   | no properties |
+   +---+
+
  Negative types
  ~~
  
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst

index dde205a2b..4f0da4fb6 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -3697,6 +3697,8 @@ This section lists supported actions and their 
attributes, if any.
  - ``nvgre_decap``: Performs a decapsulation action by stripping all headers of
the NVGRE tunnel network overlay from the matched flow.
  
+- ``mac_swap``: Swap source and destination mac address.

+
  Destroying flow rules
  ~
  
diff --git a/lib/librte_ethdev/rte_flow.c b/lib/librte_ethdev/rte_flow.c

index cff4b5209..04b0b40ea 100644
--- a/lib/librte_ethdev/rte_flow.c
+++ b/lib/librte_ethdev/rte_flow.c
@@ -109,6 +109,7 @@ static const struct rte_flow_desc_data 
rte_flow_desc_action[] = {
   sizeof(struct rte_flow_action_of_pop_mpls)),
MK_FLOW_ACTION(OF_PUSH_MPLS,
   sizeof(struct rte_flow_action_of_push_mpls)),
+   MK_FLOW_ACTION(MAC_SWAP, 0),
  };
  
  static int

diff --git a/lib/librte_ethdev/rte_flow.h b/lib/librte_ethdev/rte_flow.h
index f8ba71cdb..e1fa17b7e 100644
--- a/lib/librte_ethdev/rte_flow.h
+++ b/lib/librte_ethdev/rte_flow.h
@@ -1505,6 +1505,13 @@ enum rte_flow_action_type {
 * error.
 */
RTE_FLOW_ACTION_TYPE_NVGRE_DECAP,
+
+   /**
+* swap the source and destination mac address in ethernet header


Swap the source and destination MAC address in Ethernet header.

May be it is useful to highlight that outermost Ethernet header is edited.

MAC address rewrite actions require Ethernet pattern item. Is it 
required here?



+*
+* No associated configuration structure.
+*/
+   RTE_FLOW_ACTION_TYPE_MAC_SWAP,
  };
  
  /**




Re: [dpdk-dev] 18.08 build error on ppc64el - bool as vector type

2018-08-28 Thread Christian Ehrhardt
On Mon, Aug 27, 2018 at 2:22 PM Adrien Mazarguil 
wrote:

> Hi Christian,
>
> On Wed, Aug 22, 2018 at 05:11:41PM +0200, Christian Ehrhardt wrote:
> > Just FYI the simple change hits similar issues later on.
> >
> > The (not really) proposed patch would have to be extended to be as
> > following.
> > We really need a better solution (or somebody has to convince me that my
> > change is better than a band aid).
>
> Thanks for reporting. I've made a quick investigation on my own and believe
> it's a toolchain issue which may affect more than this PMD; potentially all
> users of stdbool.h (C11) on this platform.
>

Yeah I assumed as much, which is why I was hoping that some of the arch
experts would jump in and say "yeah this is a common thing and correctly
handled like "
I'll continue trying to reach out to people that should know better still
...


> C11's stdbool.h defines a bool macro as _Bool (big B) along with
> true/false. On PPC targets, another file (altivec.h) defines bool as _bool
> (small b) but not true/false:
>
>  #if !defined(__APPLE_ALTIVEC__)
>  /* You are allowed to undef these for C++ compatibility.  */
>  #define vector __vector
>  #define pixel __pixel
>  #define bool __bool
>  #endif
>
> mlx5_nl.c explicitly includes stdbool.h to get the above definitions then
> includes mlx5.h -> rte_ether.h -> ppc_64/rte_memcpy.h -> altivec.h.
>
> For some reason the conflicting bool redefinition doesn't seem to raise any
> warnings, but results in mismatching bool and true/false definitions; an
> integer value cannot be assigned to a bool variable anymore, hence the
> build
> failure.
>
> The inability to assign integer values to bool is, in my opinion, a
> fundamental issue caused by altivec.h. If there is no way to fix this on
> the
> system, there are a couple of workarounds for DPDK, by order of preference:
>
> 1. Always #undef bool after including altivec.h in
>lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h. I do not think
>anyone expects this type to be unusable with true/false or integer
> values
>anyway. The version of altivec.h I have doesn't rely on this macro at
>all so it's probably not a big loss.
>

The undef of a definition in header A by hedaer B can lead to most
interesting, still broken effects.
If e.g. one does
#include 
#include "mlx5.h"

or similar then it would undefine that of stdbool as well right?
In any case, the undefine not only would be suspicious it also fails right
away:

In file included from
/home/ubuntu/deb_dpdk/lib/librte_eal/common/malloc_heap.c:27:
/home/ubuntu/deb_dpdk/lib/librte_eal/common/eal_memalloc.h:30:15:
error: unknown
type name ‘bool’; did you mean ‘_Bool’?
  int socket, bool exact);
  ^~~~
  _Bool
[...]



>Ditto for "pixel" and "vector" keywords. Alternatively you could #define
>__APPLE_ALTIVEC__ before including altivec.h to prevent them from
> getting
>defined in the first place.
>

Interesting I got plenty of these:
In file included from
/home/ubuntu/deb_dpdk/lib/librte_eal/common/eal_common_options.c:25:
/home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39:
warning:
"__APPLE_ALTIVEC__" redefined
#define __APPLE_ALTIVEC__

With a few of it being even errors, but the position of the original define
is interesting.
/home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39: error:
"__APPLE_ALTIVEC__" redefined [-Werror]
#define __APPLE_ALTIVEC__
: note: this is the location of the previous definition

So if being a built-in, shouldn't it ALWAYS be defined and never
over-declare the bool type?

Checking GCC on the platform:
$ gcc -dM -E - < /dev/null | grep ALTI
#define __ALTIVEC__ 1
#define __APPLE_ALTIVEC__ 1


I added an #error in the header and dropped all dpdk changes.
if !defined(__APPLE_ALTIVEC__)
/* You are allowed to undef these for C++ compatibility.  */
#error WOULD REDECLARE BOOL
#define vector __vector

And I get:
gcc -Wp,-MD,./.mlx4.o.d.tmp -Wdate-time -D_FORTIFY_SOURCE=2 -m64 -pthread
  -DRTE_MACHINE_CPUFLAG_PPC64 -DRTE_MACHINE_CPUFLAG_ALTIVEC
-DRTE_MACHINE_CPUFLAG_VSX  -I/home/ubuntu/deb_dpdk/debia
n/build/static-root/include -include
/home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_config.h -O3
-std=c11 -Wall -Wextra -g -I. -D_BSD_SOURCE -D_DEFAULT_SOURCE
-D_XOPEN_SOURCE=600
-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations
-Wold-style-definition -Wpointer-arith -Wcast-align -Wnested-externs
-Wcast-qual -Wformat-nonliteral -Wformat-securi
ty -Wundef -Wwrite-strings -Wdeprecated -Wimplicit-fallthrough=2
-Wno-format-truncation -DALLOW_EXPERIMENTAL_API -Wno-error=cast-qual
-DNDEBUG -UPEDANTIC   -g -g -o mlx4.o -c /home/ubuntu/de
b_dpdk/drivers/net/mlx4/mlx4.c
In file included from
/home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39,
from
/home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_ether.h:21,
from
/home/ubuntu/deb_dpdk/debian/build/static-root/include/rte

Re: [dpdk-dev] 18.08 build error on ppc64el - bool as vector type

2018-08-28 Thread Adrien Mazarguil
On Tue, Aug 28, 2018 at 01:30:12PM +0200, Christian Ehrhardt wrote:
> On Mon, Aug 27, 2018 at 2:22 PM Adrien Mazarguil 
> wrote:
> 
> > Hi Christian,
> >
> > On Wed, Aug 22, 2018 at 05:11:41PM +0200, Christian Ehrhardt wrote:
> > > Just FYI the simple change hits similar issues later on.
> > >
> > > The (not really) proposed patch would have to be extended to be as
> > > following.
> > > We really need a better solution (or somebody has to convince me that my
> > > change is better than a band aid).
> >
> > Thanks for reporting. I've made a quick investigation on my own and believe
> > it's a toolchain issue which may affect more than this PMD; potentially all
> > users of stdbool.h (C11) on this platform.
> >
> 
> Yeah I assumed as much, which is why I was hoping that some of the arch
> experts would jump in and say "yeah this is a common thing and correctly
> handled like "
> I'll continue trying to reach out to people that should know better still
> ...
> 
> 
> > C11's stdbool.h defines a bool macro as _Bool (big B) along with
> > true/false. On PPC targets, another file (altivec.h) defines bool as _bool
> > (small b) but not true/false:
> >
> >  #if !defined(__APPLE_ALTIVEC__)
> >  /* You are allowed to undef these for C++ compatibility.  */
> >  #define vector __vector
> >  #define pixel __pixel
> >  #define bool __bool
> >  #endif
> >
> > mlx5_nl.c explicitly includes stdbool.h to get the above definitions then
> > includes mlx5.h -> rte_ether.h -> ppc_64/rte_memcpy.h -> altivec.h.
> >
> > For some reason the conflicting bool redefinition doesn't seem to raise any
> > warnings, but results in mismatching bool and true/false definitions; an
> > integer value cannot be assigned to a bool variable anymore, hence the
> > build
> > failure.
> >
> > The inability to assign integer values to bool is, in my opinion, a
> > fundamental issue caused by altivec.h. If there is no way to fix this on
> > the
> > system, there are a couple of workarounds for DPDK, by order of preference:
> >
> > 1. Always #undef bool after including altivec.h in
> >lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h. I do not think
> >anyone expects this type to be unusable with true/false or integer
> > values
> >anyway. The version of altivec.h I have doesn't rely on this macro at
> >all so it's probably not a big loss.
> >
> 
> The undef of a definition in header A by hedaer B can lead to most
> interesting, still broken effects.
> If e.g. one does
> #include 
> #include "mlx5.h"
> 
> or similar then it would undefine that of stdbool as well right?
> In any case, the undefine not only would be suspicious it also fails right
> away:
> 
> In file included from
> /home/ubuntu/deb_dpdk/lib/librte_eal/common/malloc_heap.c:27:
> /home/ubuntu/deb_dpdk/lib/librte_eal/common/eal_memalloc.h:30:15:
> error: unknown
> type name ‘bool’; did you mean ‘_Bool’?
>   int socket, bool exact);
>   ^~~~
>   _Bool
> [...]
> 
> 
> 
> >Ditto for "pixel" and "vector" keywords. Alternatively you could #define
> >__APPLE_ALTIVEC__ before including altivec.h to prevent them from
> > getting
> >defined in the first place.
> >
> 
> Interesting I got plenty of these:
> In file included from
> /home/ubuntu/deb_dpdk/lib/librte_eal/common/eal_common_options.c:25:
> /home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39:
> warning:
> "__APPLE_ALTIVEC__" redefined
> #define __APPLE_ALTIVEC__
> 
> With a few of it being even errors, but the position of the original define
> is interesting.
> /home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39: error:
> "__APPLE_ALTIVEC__" redefined [-Werror]
> #define __APPLE_ALTIVEC__
> : note: this is the location of the previous definition
> 
> So if being a built-in, shouldn't it ALWAYS be defined and never
> over-declare the bool type?
> 
> Checking GCC on the platform:
> $ gcc -dM -E - < /dev/null | grep ALTI
> #define __ALTIVEC__ 1
> #define __APPLE_ALTIVEC__ 1
> 
> 
> I added an #error in the header and dropped all dpdk changes.
> if !defined(__APPLE_ALTIVEC__)
> /* You are allowed to undef these for C++ compatibility.  */
> #error WOULD REDECLARE BOOL
> #define vector __vector
> 
> And I get:
> gcc -Wp,-MD,./.mlx4.o.d.tmp -Wdate-time -D_FORTIFY_SOURCE=2 -m64 -pthread
>   -DRTE_MACHINE_CPUFLAG_PPC64 -DRTE_MACHINE_CPUFLAG_ALTIVEC
> -DRTE_MACHINE_CPUFLAG_VSX  -I/home/ubuntu/deb_dpdk/debia
> n/build/static-root/include -include
> /home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_config.h -O3
> -std=c11 -Wall -Wextra -g -I. -D_BSD_SOURCE -D_DEFAULT_SOURCE
> -D_XOPEN_SOURCE=600
> -W -Wall -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations
> -Wold-style-definition -Wpointer-arith -Wcast-align -Wnested-externs
> -Wcast-qual -Wformat-nonliteral -Wformat-securi
> ty -Wundef -Wwrite-strings -Wdeprecated -Wimplicit-fallthrough=2
> -Wno-format-truncation -DALLOW_EXPERIMENTAL_API -Wno-error=cast-qual
> -DNDEBUG -UPEDANTIC   -g 

Re: [dpdk-dev] [PATCH 01/11] telemetry: initial telemetry infrastructure

2018-08-28 Thread Gaëtan Rivet
Hi Ciara,

On Thu, Aug 23, 2018 at 01:08:03PM +0100, Ciara Power wrote:
> This patch adds the infrastructure and initial code for the
> telemetry library.
> 
> A virtual device is used for telemetry, which is included in
> this patch. To use telemetry, a command-line argument must be
> used - "--vdev=telemetry".
> 

Why use a virtual device?

It seems that you are only using the device semantic as a way to carry
around a tag telling the DPDK framework to init your library once it has
finished its initialization.

I guess you wanted to avoid having to add the call to rte_telemetry_init
to all applications. In the absence of a proper EAL option framework,
you can workaround by adding a --telemetry EAL parameter, setting a flag
on, and checking this flag from librte_telemetry, within a routine
declared with RTE_INIT_PRIO.

I only see afterward the selftest being triggered via kvargs. I haven't
yet looked at the testing done, but if it is only unit test, the "test"
app would be better suited. If it is integration testing to verify the
behavior of the library with other PMDs, you probably need specific
context, thus selftest being insufficient on its own and useless for
other users.

> Control threads are used to get CPU cycles for telemetry, which
> are configured in this patch also.
> 
> Signed-off-by: Ciara Power 
> Signed-off-by: Brian Archbold 

Regards,
-- 
Gaëtan Rivet
6WIND


Re: [dpdk-dev] 18.08 build error on ppc64el - bool as vector type

2018-08-28 Thread Christian Ehrhardt
On Tue, Aug 28, 2018 at 1:44 PM Adrien Mazarguil 
wrote:

> On Tue, Aug 28, 2018 at 01:30:12PM +0200, Christian Ehrhardt wrote:
> > On Mon, Aug 27, 2018 at 2:22 PM Adrien Mazarguil <
> adrien.mazarg...@6wind.com>
> > wrote:
> >
> > > Hi Christian,
> > >
> > > On Wed, Aug 22, 2018 at 05:11:41PM +0200, Christian Ehrhardt wrote:
> > > > Just FYI the simple change hits similar issues later on.
> > > >
> > > > The (not really) proposed patch would have to be extended to be as
> > > > following.
> > > > We really need a better solution (or somebody has to convince me
> that my
> > > > change is better than a band aid).
> > >
> > > Thanks for reporting. I've made a quick investigation on my own and
> believe
> > > it's a toolchain issue which may affect more than this PMD;
> potentially all
> > > users of stdbool.h (C11) on this platform.
> > >
> >
> > Yeah I assumed as much, which is why I was hoping that some of the arch
> > experts would jump in and say "yeah this is a common thing and correctly
> > handled like "
> > I'll continue trying to reach out to people that should know better still
> > ...
> >
> >
> > > C11's stdbool.h defines a bool macro as _Bool (big B) along with
> > > true/false. On PPC targets, another file (altivec.h) defines bool as
> _bool
> > > (small b) but not true/false:
> > >
> > >  #if !defined(__APPLE_ALTIVEC__)
> > >  /* You are allowed to undef these for C++ compatibility.  */
> > >  #define vector __vector
> > >  #define pixel __pixel
> > >  #define bool __bool
> > >  #endif
> > >
> > > mlx5_nl.c explicitly includes stdbool.h to get the above definitions
> then
> > > includes mlx5.h -> rte_ether.h -> ppc_64/rte_memcpy.h -> altivec.h.
> > >
> > > For some reason the conflicting bool redefinition doesn't seem to
> raise any
> > > warnings, but results in mismatching bool and true/false definitions;
> an
> > > integer value cannot be assigned to a bool variable anymore, hence the
> > > build
> > > failure.
> > >
> > > The inability to assign integer values to bool is, in my opinion, a
> > > fundamental issue caused by altivec.h. If there is no way to fix this
> on
> > > the
> > > system, there are a couple of workarounds for DPDK, by order of
> preference:
> > >
> > > 1. Always #undef bool after including altivec.h in
> > >lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h. I do not
> think
> > >anyone expects this type to be unusable with true/false or integer
> > > values
> > >anyway. The version of altivec.h I have doesn't rely on this macro
> at
> > >all so it's probably not a big loss.
> > >
> >
> > The undef of a definition in header A by hedaer B can lead to most
> > interesting, still broken effects.
> > If e.g. one does
> > #include 
> > #include "mlx5.h"
> >
> > or similar then it would undefine that of stdbool as well right?
> > In any case, the undefine not only would be suspicious it also fails
> right
> > away:
> >
> > In file included from
> > /home/ubuntu/deb_dpdk/lib/librte_eal/common/malloc_heap.c:27:
> > /home/ubuntu/deb_dpdk/lib/librte_eal/common/eal_memalloc.h:30:15:
> > error: unknown
> > type name ‘bool’; did you mean ‘_Bool’?
> >   int socket, bool exact);
> >   ^~~~
> >   _Bool
> > [...]
> >
> >
> >
> > >Ditto for "pixel" and "vector" keywords. Alternatively you could
> #define
> > >__APPLE_ALTIVEC__ before including altivec.h to prevent them from
> > > getting
> > >defined in the first place.
> > >
> >
> > Interesting I got plenty of these:
> > In file included from
> > /home/ubuntu/deb_dpdk/lib/librte_eal/common/eal_common_options.c:25:
> > /home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39:
> > warning:
> > "__APPLE_ALTIVEC__" redefined
> > #define __APPLE_ALTIVEC__
> >
> > With a few of it being even errors, but the position of the original
> define
> > is interesting.
> > /home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39:
> error:
> > "__APPLE_ALTIVEC__" redefined [-Werror]
> > #define __APPLE_ALTIVEC__
> > : note: this is the location of the previous definition
> >
> > So if being a built-in, shouldn't it ALWAYS be defined and never
> > over-declare the bool type?
> >
> > Checking GCC on the platform:
> > $ gcc -dM -E - < /dev/null | grep ALTI
> > #define __ALTIVEC__ 1
> > #define __APPLE_ALTIVEC__ 1
> >
> >
> > I added an #error in the header and dropped all dpdk changes.
> > if !defined(__APPLE_ALTIVEC__)
> > /* You are allowed to undef these for C++ compatibility.  */
> > #error WOULD REDECLARE BOOL
> > #define vector __vector
> >
> > And I get:
> > gcc -Wp,-MD,./.mlx4.o.d.tmp -Wdate-time -D_FORTIFY_SOURCE=2 -m64 -pthread
> >   -DRTE_MACHINE_CPUFLAG_PPC64 -DRTE_MACHINE_CPUFLAG_ALTIVEC
> > -DRTE_MACHINE_CPUFLAG_VSX  -I/home/ubuntu/deb_dpdk/debia
> > n/build/static-root/include -include
> > /home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_config.h -O3
> > -std=c11 -Wall -Wextra -g -I. -D_BSD_SOURCE -D_DEFAULT_SOURCE
> > -D_XOPEN_SOURCE=600
> > -W -W

[dpdk-dev] [dpdk-announce] DPDK Summit North America

2018-08-28 Thread O'Driscoll, Tim
This year's DPDK Summit North America has been confirmed for December 3rd and 
4th in Club Auto Sport in San Jose (the same venue as last year). Like last 
year it will be immediately followed by this year's OVS Conference 
(http://www.openvswitch.org/support/ovscon2018/), which is on December 5th and 
6th.

Registration for the event is now available at: 
https://events.linuxfoundation.org/events/dpdknorthamerica2018/dpdkattend/register/.
 The Early Bird registration fee is $99 and is available until September 30th. 
After that, the fee increases first to $150 and then to $200. There's a reduced 
rate for students and academics of $37.50.

The CFP is also open: 
https://events.linuxfoundation.org/events/dpdknorthamerica2018/dpdk-na-program/cfp/.
 Proposals are due by Monday October 22nd.

Further details on the event are available at: 
https://events.linuxfoundation.org/events/dpdknorthamerica2018/.



Re: [dpdk-dev] [RFC] net/virtio-user: avoid parsing process mappings

2018-08-28 Thread Burakov, Anatoly

On 28-Aug-18 9:42 AM, Tiwei Bie wrote:

On Tue, Aug 28, 2018 at 09:12:38AM +0100, Burakov, Anatoly wrote:

On 28-Aug-18 8:53 AM, Tiwei Bie wrote:

[...]
   
-	for (i = 0; i < num; ++i) {

-   mr = &msg->payload.memory.regions[i];
-   mr->guest_phys_addr = huges[i].addr; /* use vaddr! */
-   mr->userspace_addr = huges[i].addr;
-   mr->memory_size = huges[i].size;
-   mr->mmap_offset = 0;
-   fds[i] = open(huges[i].path, O_RDWR);
+   start_addr = (uint64_t)(uintptr_t)ms->addr;
+   end_addr = start_addr + ms->len;
+
+   /*
+* XXX workaround!
+*
+* For --single-file-segments case, offset should be:
+* offset = rte_fbarray_find_idx(&msl->memseg_arr, ms) * msl->page_sz;
+*
+* But it's not true for non-single-file-segments case.
+*
+* This is a temporary workaround which assumes the file will
+* also be mapped from the beginning in --single-file-segments
+* case. It should be replaced when we have a memory API to
+* get the offset.


Yes, this is an unfortunate consequence of having split personalities in
memory subsystem. A good solution would be to just deprecate
non-single-file segments mode and always use it, but we cannot do that
because we support kernel versions that do not support fallocate() on
hugetlbfs (and it still leaves legacy mem mode, which effectively
behaves like non-single-file segments as far as virtio is concerned).

How about we add this API in v2 for seg fd patchset?


Sure, thanks!

Besides, I saw another minor issue. When --legacy-mem
and --single-file-segments are both specified, the EAL
init doesn't fail, but fd can't be got successfully.



I'll have to look into this, because as far as i recall, i've submitted 
a patch specifically prohibiting legacy mem combined with single file 
segments. I'll double check though. Thanks for testing!


--
Thanks,
Anatoly


[dpdk-dev] [PATCH 0/3] security: support for pdcp

2018-08-28 Thread akhil . goyal
From: Akhil Goyal 

Security library currently only has support for IPSec protocol.
This patchset defines structures for pdcp protocol in rte_security
and provide a sample driver implementation for lookaside protocol
offload to support PDCP.

Akhil Goyal (3):
  security: support pdcp protocol
  crypto/dpaa2_sec: add sample pdcp descriptor apis
  crypto/dpaa2_sec: support pdcp offload

 doc/guides/prog_guide/rte_security.rst  |   90 +-
 drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c |  294 +++
 drivers/crypto/dpaa2_sec/dpaa2_sec_priv.h   |  172 ++
 drivers/crypto/dpaa2_sec/hw/desc.h  |2 +-
 drivers/crypto/dpaa2_sec/hw/desc/pdcp.h | 2642 +++
 lib/librte_security/rte_security.c  |4 +
 lib/librte_security/rte_security.h  |   62 +
 7 files changed, 3258 insertions(+), 8 deletions(-)
 create mode 100644 drivers/crypto/dpaa2_sec/hw/desc/pdcp.h

-- 
2.17.1



[dpdk-dev] [PATCH 1/3] security: support pdcp protocol

2018-08-28 Thread akhil . goyal
From: Akhil Goyal 

Signed-off-by: Hemant Agrawal 
Signed-off-by: Akhil Goyal 
---
 doc/guides/prog_guide/rte_security.rst | 90 --
 lib/librte_security/rte_security.c |  4 ++
 lib/librte_security/rte_security.h | 62 ++
 3 files changed, 149 insertions(+), 7 deletions(-)

diff --git a/doc/guides/prog_guide/rte_security.rst 
b/doc/guides/prog_guide/rte_security.rst
index 0812abe77..412fff016 100644
--- a/doc/guides/prog_guide/rte_security.rst
+++ b/doc/guides/prog_guide/rte_security.rst
@@ -10,8 +10,8 @@ The security library provides a framework for management and 
provisioning
 of security protocol operations offloaded to hardware based devices. The
 library defines generic APIs to create and free security sessions which can
 support full protocol offload as well as inline crypto operation with
-NIC or crypto devices. The framework currently only supports the IPSec protocol
-and associated operations, other protocols will be added in future.
+NIC or crypto devices. The framework currently only supports the IPSec and PDCP
+protocol and associated operations, other protocols will be added in future.
 
 Design Principles
 -
@@ -253,6 +253,46 @@ for any protocol header addition.
 +|+
  V
 
+PDCP Flow Diagram
+~
+
+.. code-block:: c
+
+Transmitting PDCP Entity  Receiving PDCP Entity
+  |   ^
+  |   +---|---+
+  V   | In order delivery and |
++-|--+| Duplicate detection   |
+| Sequence Numbering ||  (Data Plane only)|
++-|--++---|---+
+  |   |
++-|--++---|--+
+| Header Compression*|| Header Decompression*|
+| (Data-Plane only)  ||   (Data Plane only)  |
++-|--++---|--+
+  |   |
++-|---+   +---|--+
+| Integrity Protection|   |Integrity Verification|
+| (Control Plane only)|   | (Control Plane only) |
++-|---+   +---|--+
++-|---++--|--+
+| Ciphering   || Deciphering |
++-|---++--|--+
++-|---++--|--+
+|   Add PDCP header   || Remove PDCP Header  |
++-|---++--|--+
+  |   |
+  +->>+
+
+
+.. note::
+
+* Header Compression and decompression are not supported currently.
+
+Just like IPSec, in case of PDCP also header addition/deletion, cipher/
+de-cipher, integrity protection/verification is done based on the action
+type chosen.
+
 Device Features and Capabilities
 -
 
@@ -271,7 +311,7 @@ structure in the *DPDK API Reference*.
 
 Each driver (crypto or ethernet) defines its own private array of capabilities
 for the operations it supports. Below is an example of the capabilities for a
-PMD which supports the IPSec protocol.
+PMD which supports the IPSec and PDCP protocol.
 
 .. code-block:: c
 
@@ -298,6 +338,22 @@ PMD which supports the IPSec protocol.
 },
 .crypto_capabilities = pmd_capabilities
 },
+{ /* PDCP Lookaside Protocol offload Data Plane */
+.action = RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL,
+.protocol = RTE_SECURITY_PROTOCOL_PDCP,
+.pdcp = {
+.domain = RTE_SECURITY_PDCP_MODE_DATA,
+},
+.crypto_capabilities = pmd_capabilities
+},
+{ /* PDCP Lookaside Protocol offload Control */
+.action = RTE_SECURITY_ACTION_TYPE_LOOKASIDE_PROTOCOL,
+.protocol = RTE_SECURITY_PROTOCOL_PDCP,
+.pdcp = {
+.domain = RTE_SECURITY_PDCP_MODE_CONTROL,
+},
+.crypto_capabilities = pmd_capabilities
+},
 {
 .action = RTE_SECURITY_ACTION_TYPE_NONE
 }
@@ -429,6 +485,7 @@ Security Session configuration structure is defined as 
``rte_security_session_co
 union {
 struct rte_security_ipsec_xform ipsec;
 struct rte_security_macsec_xform macsec;
+struct rte_security_pdcp_xform pdcp;
 };
 /**< Configuration parameters for security session 

[dpdk-dev] [PATCH 3/3] crypto/dpaa2_sec: support pdcp offload

2018-08-28 Thread akhil . goyal
From: Akhil Goyal 

Signed-off-by: Hemant Agrawal 
Signed-off-by: Akhil Goyal 
---
 drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c | 294 
 drivers/crypto/dpaa2_sec/dpaa2_sec_priv.h   | 172 
 2 files changed, 466 insertions(+)

diff --git a/drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c 
b/drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c
index 2a3c61c66..ef4e1ab37 100644
--- a/drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c
+++ b/drivers/crypto/dpaa2_sec/dpaa2_sec_dpseci.c
@@ -35,6 +35,7 @@ typedef uint64_t  dma_addr_t;
 
 /* RTA header files */
 #include 
+#include 
 #include 
 
 /* Minimum job descriptor consists of a oneword job descriptor HEADER and
@@ -61,6 +62,70 @@ static uint8_t cryptodev_driver_id;
 
 int dpaa2_logtype_sec;
 
+static inline int
+build_proto_compound_fd(dpaa2_sec_session *sess,
+  struct rte_crypto_op *op,
+  struct qbman_fd *fd, uint16_t bpid)
+{
+   struct rte_crypto_sym_op *sym_op = op->sym;
+   struct ctxt_priv *priv = sess->ctxt;
+   struct qbman_fle *fle, *ip_fle, *op_fle;
+   struct sec_flow_context *flc;
+   struct rte_mbuf *src_mbuf = sym_op->m_src;
+   struct rte_mbuf *dst_mbuf = sym_op->m_dst;
+   int retval;
+
+   if (!dst_mbuf)
+   dst_mbuf = src_mbuf;
+
+   /* Save the shared descriptor */
+   flc = &priv->flc_desc[0].flc;
+
+   /* we are using the first FLE entry to store Mbuf */
+   retval = rte_mempool_get(priv->fle_pool, (void **)(&fle));
+   if (retval) {
+   DPAA2_SEC_ERR("Memory alloc failed");
+   return -1;
+   }
+   memset(fle, 0, FLE_POOL_BUF_SIZE);
+   DPAA2_SET_FLE_ADDR(fle, (size_t)op);
+   DPAA2_FLE_SAVE_CTXT(fle, (ptrdiff_t)priv);
+
+   op_fle = fle + 1;
+   ip_fle = fle + 2;
+
+   if (likely(bpid < MAX_BPID)) {
+   DPAA2_SET_FD_BPID(fd, bpid);
+   DPAA2_SET_FLE_BPID(op_fle, bpid);
+   DPAA2_SET_FLE_BPID(ip_fle, bpid);
+   } else {
+   DPAA2_SET_FD_IVP(fd);
+   DPAA2_SET_FLE_IVP(op_fle);
+   DPAA2_SET_FLE_IVP(ip_fle);
+   }
+
+   /* Configure FD as a FRAME LIST */
+   DPAA2_SET_FD_ADDR(fd, DPAA2_VADDR_TO_IOVA(op_fle));
+   DPAA2_SET_FD_COMPOUND_FMT(fd);
+   DPAA2_SET_FD_FLC(fd, (ptrdiff_t)flc);
+
+   /* Configure Output FLE with dst mbuf data  */
+   DPAA2_SET_FLE_ADDR(op_fle, DPAA2_MBUF_VADDR_TO_IOVA(dst_mbuf));
+   DPAA2_SET_FLE_OFFSET(op_fle, dst_mbuf->data_off);
+   DPAA2_SET_FLE_LEN(op_fle, dst_mbuf->buf_len);
+
+   /* Configure Input FLE with src mbuf data */
+   DPAA2_SET_FLE_ADDR(ip_fle, DPAA2_MBUF_VADDR_TO_IOVA(src_mbuf));
+   DPAA2_SET_FLE_OFFSET(ip_fle, src_mbuf->data_off);
+   DPAA2_SET_FLE_LEN(ip_fle, src_mbuf->pkt_len);
+
+   DPAA2_SET_FD_LEN(fd, ip_fle->length);
+   DPAA2_SET_FLE_FIN(ip_fle);
+
+   return 0;
+
+}
+
 static inline int
 build_proto_fd(dpaa2_sec_session *sess,
   struct rte_crypto_op *op,
@@ -1124,6 +1189,9 @@ build_sec_fd(struct rte_crypto_op *op,
case DPAA2_SEC_IPSEC:
ret = build_proto_fd(sess, op, fd, bpid);
break;
+   case DPAA2_SEC_PDCP:
+   ret = build_proto_compound_fd(sess, op, fd, bpid);
+   break;
case DPAA2_SEC_HASH_CIPHER:
default:
DPAA2_SEC_ERR("error: Unsupported session");
@@ -2375,6 +2443,228 @@ dpaa2_sec_set_ipsec_session(struct rte_cryptodev *dev,
return -1;
 }
 
+static int
+dpaa2_sec_set_pdcp_session(struct rte_cryptodev *dev,
+  struct rte_security_session_conf *conf,
+  void *sess)
+{
+   struct rte_security_pdcp_xform *pdcp_xform = &conf->pdcp;
+   struct rte_crypto_sym_xform *xform = conf->crypto_xform;
+   struct rte_crypto_auth_xform *auth_xform = NULL;
+   struct rte_crypto_cipher_xform *cipher_xform;
+   dpaa2_sec_session *session = (dpaa2_sec_session *)sess;
+   struct ctxt_priv *priv;
+   struct dpaa2_sec_dev_private *dev_priv = dev->data->dev_private;
+   struct alginfo authdata, cipherdata;
+   int bufsize = -1;
+   struct sec_flow_context *flc;
+
+   PMD_INIT_FUNC_TRACE();
+
+   memset(session, 0, sizeof(dpaa2_sec_session));
+
+   priv = (struct ctxt_priv *)rte_zmalloc(NULL,
+   sizeof(struct ctxt_priv) +
+   sizeof(struct sec_flc_desc),
+   RTE_CACHE_LINE_SIZE);
+
+   if (priv == NULL) {
+   DPAA2_SEC_ERR("No memory for priv CTXT");
+   return -ENOMEM;
+   }
+
+   priv->fle_pool = dev_priv->fle_pool;
+   flc = &priv->flc_desc[0].flc;
+
+   /* find xfrm types */
+   if (xform->type == RTE_CRYPTO_SYM_XFORM_CIPHER && xform->next == NULL) {
+   cipher_xform = &xform->ciphe

[dpdk-dev] [PATCH 2/3] crypto/dpaa2_sec: add sample pdcp descriptor apis

2018-08-28 Thread akhil . goyal
From: Akhil Goyal 

Signed-off-by: Hemant Agrawal 
Signed-off-by: Horia Geanta Neag 
Signed-off-by: Alex Porosanu 
Signed-off-by: Akhil Goyal 
---
 drivers/crypto/dpaa2_sec/hw/desc.h  |2 +-
 drivers/crypto/dpaa2_sec/hw/desc/pdcp.h | 2642 +++
 2 files changed, 2643 insertions(+), 1 deletion(-)
 create mode 100644 drivers/crypto/dpaa2_sec/hw/desc/pdcp.h

diff --git a/drivers/crypto/dpaa2_sec/hw/desc.h 
b/drivers/crypto/dpaa2_sec/hw/desc.h
index e92558329..f0e575922 100644
--- a/drivers/crypto/dpaa2_sec/hw/desc.h
+++ b/drivers/crypto/dpaa2_sec/hw/desc.h
@@ -1332,7 +1332,7 @@
 #define OP_PCL_LTE_MIXED_AUTH_SHIFT0
 #define OP_PCL_LTE_MIXED_AUTH_MASK (3 << OP_PCL_LTE_MIXED_AUTH_SHIFT)
 #define OP_PCL_LTE_MIXED_ENC_SHIFT 8
-#define OP_PCL_LTE_MIXED_ENC_MASK  (3 < OP_PCL_LTE_MIXED_ENC_SHIFT)
+#define OP_PCL_LTE_MIXED_ENC_MASK  (3 << OP_PCL_LTE_MIXED_ENC_SHIFT)
 #define OP_PCL_LTE_MIXED_AUTH_NULL (OP_PCL_LTE_NULL << \
 OP_PCL_LTE_MIXED_AUTH_SHIFT)
 #define OP_PCL_LTE_MIXED_AUTH_SNOW (OP_PCL_LTE_SNOW << \
diff --git a/drivers/crypto/dpaa2_sec/hw/desc/pdcp.h 
b/drivers/crypto/dpaa2_sec/hw/desc/pdcp.h
new file mode 100644
index 0..4e218f513
--- /dev/null
+++ b/drivers/crypto/dpaa2_sec/hw/desc/pdcp.h
@@ -0,0 +1,2642 @@
+/*
+ * Copyright 2008-2013 Freescale Semiconductor, Inc.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause or GPL-2.0+
+ */
+
+#ifndef __DESC_PDCP_H__
+#define __DESC_PDCP_H__
+
+#include "hw/rta.h"
+#include "common.h"
+
+/**
+ * DOC: PDCP Shared Descriptor Constructors
+ *
+ * Shared descriptors for PDCP protocol.
+ */
+
+/**
+ * PDCP_NULL_MAX_FRAME_LEN - The maximum frame frame length that is supported 
by
+ *   PDCP NULL protocol.
+ */
+#define PDCP_NULL_MAX_FRAME_LEN0x2FFF
+
+/**
+ * PDCP_MAC_I_LEN - The length of the MAC-I for PDCP protocol operation
+ */
+#define PDCP_MAC_I_LEN 0x0004
+
+/**
+ * PDCP_MAX_FRAME_LEN_STATUS - The status returned in FD status/command field 
in
+ * case the input frame is larger than
+ * PDCP_NULL_MAX_FRAME_LEN.
+ */
+#define PDCP_MAX_FRAME_LEN_STATUS  0xF1
+
+/**
+ * PDCP_C_PLANE_SN_MASK - This mask is used in the PDCP descriptors for
+ *extracting the sequence number (SN) from the PDCP
+ *Control Plane header. For PDCP Control Plane, the SN
+ *is constant (5 bits) as opposed to PDCP Data Plane
+ *(7/12/15 bits).
+ */
+#define PDCP_C_PLANE_SN_MASK   0x001F
+
+/**
+ * PDCP_U_PLANE_15BIT_SN_MASK - This mask is used in the PDCP descriptors for
+ *  extracting the sequence number (SN) from the
+ *  PDCP User Plane header. For PDCP Control Plane,
+ *  the SN is constant (5 bits) as opposed to PDCP
+ *  Data Plane (7/12/15 bits).
+ */
+#define PDCP_U_PLANE_15BIT_SN_MASK 0x7FFF
+
+/**
+ * PDCP_BEARER_MASK - This mask is used masking out the bearer for PDCP
+ *processing with SNOW f9 in LTE.
+ *
+ * The value on which this mask is applied is formatted as below:
+ * Count-C (32 bit) | Bearer (5 bit) | Direction (1 bit) | 0 (26 bits)
+ *
+ * Applying this mask is done for creating the upper 64 bits of the IV needed
+ * for SNOW f9.
+ *
+ * The lower 32 bits of the mask are used for masking the direction for AES
+ * CMAC IV.
+ */
+#define PDCP_BEARER_MASK   0x0400ull
+
+/**
+ * PDCP_DIR_MASK - This mask is used masking out the direction for PDCP
+ * processing with SNOW f9 in LTE.
+ *
+ * The value on which this mask is applied is formatted as below:
+ * Bearer (5 bit) | Direction (1 bit) | 0 (26 bits)
+ *
+ * Applying this mask is done for creating the lower 32 bits of the IV needed
+ * for SNOW f9.
+ *
+ * The upper 32 bits of the mask are used for masking the direction for AES
+ * CMAC IV.
+ */
+#define PDCP_DIR_MASK  0xF800ull
+
+/**
+ * PDCP_NULL_INT_MAC_I_VAL - The value of the PDCP PDU MAC-I in case NULL
+ *   integrity is used.
+ */
+
+#define PDCP_NULL_INT_MAC_I_VAL0x
+
+/**
+ * PDCP_NULL_INT_ICV_CHECK_FAILED_STATUS - The status used to report ICV check
+ * failed in case of NULL integrity
+ * Control Plane processing.
+ */
+#define PDCP_NULL_INT_ICV_CHECK_FAILED_STATUS  0x0A
+/**
+ * PDCP_DPOVRD_HFN_OV_EN - Value to be used in the FD status/cmd field to
+ * indicate the HFN override mechanism is active for 
the
+ * frame.
+ */
+#define PDCP_DPOVRD_HFN_OV_EN  0x8000
+
+/**
+ * PDCP_P4080REV2_HFN_OV_BUFLEN - The length in bytes of the supplementary

[dpdk-dev] IXGBE throughput loss with 4+ cores

2018-08-28 Thread Saber Rezvani

Hi,


I have run multi_process/symmetric_mp example in DPDK example directory.
For a one process its throughput is line rate but as I increase the
number of cores I see decrease in throughput. For example, If the number
of queues set to 4 and each queue assigns to a single core, then the
throughput will be something about 9.4. if 8 queues, then throughput
will be 8.5.

I have read the following, but it was not convincing.

http://mails.dpdk.org/archives/dev/2015-October/024960.html


I am eagerly looking forward to hearing from you, all.


Best wishes,

Saber




Re: [dpdk-dev] [PATCH] net/virtio-user: fix memory hotplug support

2018-08-28 Thread Sean Harte
On Mon, 27 Aug 2018 at 10:15, Burakov, Anatoly
 wrote:
>
> On 24-Aug-18 4:51 PM, Sean Harte wrote:
> > On Fri, 24 Aug 2018 at 16:19, Burakov, Anatoly
> >  wrote:
> >>
> >> On 24-Aug-18 11:41 AM, Burakov, Anatoly wrote:
> >>> On 24-Aug-18 10:35 AM, Tiwei Bie wrote:
>  On Fri, Aug 24, 2018 at 09:59:42AM +0100, Burakov, Anatoly wrote:
> > On 24-Aug-18 5:49 AM, Tiwei Bie wrote:
> >> On Thu, Aug 23, 2018 at 03:01:30PM +0100, Burakov, Anatoly wrote:
> >>> On 23-Aug-18 12:19 PM, Sean Harte wrote:
>  On Thu, 23 Aug 2018 at 10:05, Burakov, Anatoly
>   wrote:
> >
> > On 23-Aug-18 3:57 AM, Tiwei Bie wrote:
> >> Deadlock can occur when allocating memory if a vhost-kernel
> >> based virtio-user device is in use. Besides, it's possible
> >> to have much more than 64 non-contiguous hugepage backed
> >> memory regions due to the memory hotplug, which may cause
> >> problems when handling VHOST_SET_MEM_TABLE request. A better
> >> solution is to have the virtio-user pass all the VA ranges
> >> reserved by DPDK to vhost-kernel.
> >>
> >> Bugzilla ID: 81
> >> Fixes: 12ecb2f63b12 ("net/virtio-user: support memory hotplug")
> >> Cc: sta...@dpdk.org
> >>
> >> Reported-by: Seán Harte 
> >> Signed-off-by: Tiwei Bie 
> >> ---
> >>   drivers/net/virtio/virtio_user/vhost_kernel.c | 64
> >> ---
> >>   1 file changed, 27 insertions(+), 37 deletions(-)
> >>
> >> diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c
> >> b/drivers/net/virtio/virtio_user/vhost_kernel.c
> >> index b2444096c..49bd1b821 100644
> >> --- a/drivers/net/virtio/virtio_user/vhost_kernel.c
> >> +++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
> >> @@ -70,41 +70,12 @@ static uint64_t vhost_req_user_to_kernel[] = {
> >>   [VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
> >>   };
> >>
> >> -struct walk_arg {
> >> - struct vhost_memory_kernel *vm;
> >> - uint32_t region_nr;
> >> -};
> >> -static int
> >> -add_memory_region(const struct rte_memseg_list *msl __rte_unused,
> >> - const struct rte_memseg *ms, size_t len, void *arg)
> >> -{
> >> - struct walk_arg *wa = arg;
> >> - struct vhost_memory_region *mr;
> >> - void *start_addr;
> >> -
> >> - if (wa->region_nr >= max_regions)
> >> - return -1;
> >> -
> >> - mr = &wa->vm->regions[wa->region_nr++];
> >> - start_addr = ms->addr;
> >> -
> >> - mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
> >> - mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
> >> - mr->memory_size = len;
> >> - mr->mmap_offset = 0;
> >> -
> >> - return 0;
> >> -}
> >> -
> >> -/* By default, vhost kernel module allows 64 regions, but DPDK
> >> allows
> >> - * 256 segments. As a relief, below function merges those
> >> virtually
> >> - * adjacent memsegs into one region.
> >> - */
> >>   static struct vhost_memory_kernel *
> >>   prepare_vhost_memory_kernel(void)
> >>   {
> >> + struct rte_mem_config *mcfg =
> >> rte_eal_get_configuration()->mem_config;
> >>   struct vhost_memory_kernel *vm;
> >> - struct walk_arg wa;
> >> + uint32_t region_nr = 0, i;
> >>
> >>   vm = malloc(sizeof(struct vhost_memory_kernel) +
> >>   max_regions *
> >> @@ -112,15 +83,34 @@ prepare_vhost_memory_kernel(void)
> >>   if (!vm)
> >>   return NULL;
> >>
> >> - wa.region_nr = 0;
> >> - wa.vm = vm;
> >> + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
> >> + struct rte_memseg_list *msl = &mcfg->memsegs[i];
> >> + struct vhost_memory_region *mr;
> >> + void *start_addr;
> >> + uint64_t len;
> >
> > There is a rte_memseg_list_walk() - please do not iterate over
> > memseg
> > lists manually.
> >
> 
>  rte_memseg_list_walk() can't be used here because
>  prepare_vhost_memory_kernel() is sometimes called from a memory
>  callback. It will then hang trying to get a read lock on
>  memory_hotplug_lock.
> >>>
> >>> OK, so use rte_memseg_list_walk_thread_unsafe().
> >>>
>  I don't think the rte_memseg_list_walk_thread_unsafe() function is
>  appropriate because prepare_vhost_memory_kernel() may not always be
> >

Re: [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP

2018-08-28 Thread Zhang, Qi Z
Hi William:

> -Original Message-
> From: William Tu [mailto:u9012...@gmail.com]
> Sent: Friday, August 24, 2018 12:25 AM
> To: Zhang, Qi Z 
> Cc: dev@dpdk.org; Karlsson, Magnus ; Topel,
> Bjorn ; Wu, Jingjing ; Li,
> Xiaoyun ; Yigit, Ferruh 
> Subject: Re: [dpdk-dev] [PATCH v3 0/6] PMD driver for AF_XDP
> 
> Hi Zhang Qi,
> 
> I'm not familiar with DPDK code, but I'm curious about the benefits of using
> AF_XDP pmd, specifically I have a couple questions:
> 
> 1) With zero-copy driver support, is AF_XDP pmd expects to have similar
> performance than other pmd? 

Zero-copy will improve performance a lot, but it still have gap with native 
DPDK PMD.
basically it's kind of less performance but more flexible solution.

BTW, Patches to enable zero copy for i40e just be published by Bjorn, there is 
some performance data for your reference.
http://lists.openwall.net/netdev/2018/08/28/62


> Since AF_XDP is still using native device driver,
> isn't the interrupt still there and not "poll-mode"
> anymore?

Yes, it's still napi->poll triggered by interrupt.

> 
> 2) does the patch expect user to customize the ebpf/xdp code so that this
> becomes another way to extend dpdk datapath?

Yes, this provides another option to use kernel's eBPF eco-system for packet 
filtering,.
And it will be easy for us to develop some tool to load/link/expose ebpf as 
part of DPDK I think.

According to AF_XDP PMD, my view is since DPDK is very popular, it is becoming 
some standard way to develop network applications.
So a DPDK PMD is going to be a bridge for developers to take advantage of the 
AF_XDP technology if compared to deal with the XDP Socket and libc directly.

Regards
Qi

> 
> Thank you
> William
> 
> On Thu, Aug 16, 2018 at 7:42 AM Qi Zhang  wrote:
> >
> > Overview
> > 
> >
> > The patch set add a new PMD driver for AF_XDP which is a proposed
> > faster version of AF_PACKET interface in Linux, see below link for
> > detail AF_XDP introduction:
> > https://lwn.net/Articles/750845/
> > https://fosdem.org/2018/schedule/event/af_xdp/
> >
> > AF_XDP roadmap
> > ==
> > - The kernel 4.18 is out and af_xdp is included.
> >   https://kernelnewbies.org/Linux_4.18
> > - So far there is no zero copy supported driver be merged, but some are
> >   on the way.
> >
> > Change logs
> > ===
> >
> > v3:
> > - Re-work base on AF_XDP's interface changes.
> > - Support multi-queues, each dpdk queue has its own xdp socket.
> >   An xdp socket is always bound to a netdev queue.
> >   We assume all xdp socket from the same ethdev are bound to the
> >   same netdev queue, though a netdev queue still can be bound by
> >   xdp sockets from different ethdev instances.
> >   Below is an example of the mapping.
> >   --
> >   | dpdk q0 | dpdk q1 | dpdk q0| dpdk q0 | dpdk q1 |
> >   --
> >   | xsk A   | xsk B   | xsk C  | xsk D   | xsk E   |<---|
> >   --|
> >   |  ETHDEV 0 | ETHDEV 1   |  ETHDEV 2 ||
> DPDK
> >   --
> >   |  netdev queue 0|   netdev queue 1  ||
> KERNEL
> >   --|
> >   |  NETDEV eth0   ||
> >   --|
> >   |key   xsk   ||
> >   |  --   --   ||
> >   |  | |  | 0  | xsk A |   ||
> >   |  | |  --   ||
> >   |  | |  | 2  | xsk B |   ||
> >   |  | ebpf|  ---
> >   |  | |  | 3  | xsk C |   |
> >   |  | redirect ->|--  |
> >   |  | |  | 4  | xsk D |   |
> >   |  | |  --   |
> >   |  |-|  | 5  | xsk E |   |
> >   |   --   |
> >   |-
> >
> > - It is an open question that how to load ebpf to kernel and link to
> >   specific netdev in DPDK, should it be part of PMD, or it should be
> handled by
> >   an independent tool? In this patchset, it takes the second option, there
> will
> >   be a "bind" stage before we start AF_XDP PMD, this includes below
> steps:
> >   a) load ebpf program to the kernel, (the ebpf program must contain the
> >  logic to redirect packet to a xdp socket base on a redirect map).
> >   b) link ebpf program to specific network interface.
> >   c) expose the xdp socket redirect map id and entries number to user,
> >  so this will be parsed to PMD, and PMD will create xdp socket
> >

Re: [dpdk-dev] [PATCH] net/virtio-user: fix memory hotplug support

2018-08-28 Thread Burakov, Anatoly

On 28-Aug-18 2:27 PM, Sean Harte wrote:

On Mon, 27 Aug 2018 at 10:15, Burakov, Anatoly
 wrote:


On 24-Aug-18 4:51 PM, Sean Harte wrote:

On Fri, 24 Aug 2018 at 16:19, Burakov, Anatoly
 wrote:


On 24-Aug-18 11:41 AM, Burakov, Anatoly wrote:

On 24-Aug-18 10:35 AM, Tiwei Bie wrote:

On Fri, Aug 24, 2018 at 09:59:42AM +0100, Burakov, Anatoly wrote:

On 24-Aug-18 5:49 AM, Tiwei Bie wrote:

On Thu, Aug 23, 2018 at 03:01:30PM +0100, Burakov, Anatoly wrote:

On 23-Aug-18 12:19 PM, Sean Harte wrote:

On Thu, 23 Aug 2018 at 10:05, Burakov, Anatoly
 wrote:


On 23-Aug-18 3:57 AM, Tiwei Bie wrote:

Deadlock can occur when allocating memory if a vhost-kernel
based virtio-user device is in use. Besides, it's possible
to have much more than 64 non-contiguous hugepage backed
memory regions due to the memory hotplug, which may cause
problems when handling VHOST_SET_MEM_TABLE request. A better
solution is to have the virtio-user pass all the VA ranges
reserved by DPDK to vhost-kernel.

Bugzilla ID: 81
Fixes: 12ecb2f63b12 ("net/virtio-user: support memory hotplug")
Cc: sta...@dpdk.org

Reported-by: Seán Harte 
Signed-off-by: Tiwei Bie 
---
   drivers/net/virtio/virtio_user/vhost_kernel.c | 64
---
   1 file changed, 27 insertions(+), 37 deletions(-)

diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c
b/drivers/net/virtio/virtio_user/vhost_kernel.c
index b2444096c..49bd1b821 100644
--- a/drivers/net/virtio/virtio_user/vhost_kernel.c
+++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
@@ -70,41 +70,12 @@ static uint64_t vhost_req_user_to_kernel[] = {
   [VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
   };

-struct walk_arg {
- struct vhost_memory_kernel *vm;
- uint32_t region_nr;
-};
-static int
-add_memory_region(const struct rte_memseg_list *msl __rte_unused,
- const struct rte_memseg *ms, size_t len, void *arg)
-{
- struct walk_arg *wa = arg;
- struct vhost_memory_region *mr;
- void *start_addr;
-
- if (wa->region_nr >= max_regions)
- return -1;
-
- mr = &wa->vm->regions[wa->region_nr++];
- start_addr = ms->addr;
-
- mr->guest_phys_addr = (uint64_t)(uintptr_t)start_addr;
- mr->userspace_addr = (uint64_t)(uintptr_t)start_addr;
- mr->memory_size = len;
- mr->mmap_offset = 0;
-
- return 0;
-}
-
-/* By default, vhost kernel module allows 64 regions, but DPDK
allows
- * 256 segments. As a relief, below function merges those
virtually
- * adjacent memsegs into one region.
- */
   static struct vhost_memory_kernel *
   prepare_vhost_memory_kernel(void)
   {
+ struct rte_mem_config *mcfg =
rte_eal_get_configuration()->mem_config;
   struct vhost_memory_kernel *vm;
- struct walk_arg wa;
+ uint32_t region_nr = 0, i;

   vm = malloc(sizeof(struct vhost_memory_kernel) +
   max_regions *
@@ -112,15 +83,34 @@ prepare_vhost_memory_kernel(void)
   if (!vm)
   return NULL;

- wa.region_nr = 0;
- wa.vm = vm;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ struct vhost_memory_region *mr;
+ void *start_addr;
+ uint64_t len;


There is a rte_memseg_list_walk() - please do not iterate over
memseg
lists manually.



rte_memseg_list_walk() can't be used here because
prepare_vhost_memory_kernel() is sometimes called from a memory
callback. It will then hang trying to get a read lock on
memory_hotplug_lock.


OK, so use rte_memseg_list_walk_thread_unsafe().


I don't think the rte_memseg_list_walk_thread_unsafe() function is
appropriate because prepare_vhost_memory_kernel() may not always be
called from a memory callback.


And how is this different? What you're doing here is identical to
calling
rte_memseg_list_walk_thread_unsafe() (that's precisely what it does
internally - check the code!), except that you're doing it manually
and not
using DPDK API, which makes your code dependent on internals of DPDK's
memory implementation.

So, this function may or may not be called from a callback, but
you're using
thread-unsafe walk anyway. I think you should call either
thread-safe or
thread-unsafe version depending on whether you're being called from a
callback or not.


Hmm, the real case is a bit more tricky. Even if this
function isn't called from memory event callbacks, the
"thread-safe" version list_walk() still can't be used.
Because deadlock may happen.

In virtio-user device start, it needs to do SET_MEM_TABLE
for the vhost-backend. And to make sure that preparing and
setting the memory table is atomic (and also to protect the
device state), it needs a lock. So if it calls "thread-safe"
version list_walk(), there will be two locks taken in
below order:

- the virtio-user device lock (taken by virtio_user_start_device());
- the memory hotplug lock (taken by rte_memseg_list_walk());

And above locks will be released in below ord

[dpdk-dev] [PATCH v1] testpmd: add new command for show port info

2018-08-28 Thread Emma Finn
existing testpmd command "show port info" is too verbose.
Added a new summary command to print brief information on ports.

Signed-off-by: Emma Finn 
---
 app/test-pmd/cmdline.c | 15 ++-
 app/test-pmd/config.c  | 41 +
 app/test-pmd/testpmd.h |  2 ++
 3 files changed, 53 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 app/test-pmd/cmdline.c
 mode change 100644 => 100755 app/test-pmd/config.c
 mode change 100644 => 100755 app/test-pmd/testpmd.h

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
old mode 100644
new mode 100755
index 589121d..21e0b7a
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -167,7 +167,7 @@ static void cmd_help_long_parsed(void *parsed_result,
"Display:\n"
"\n\n"
 
-   "show port 
(info|stats|xstats|fdir|stat_qmap|dcb_tc|cap) (port_id|all)\n"
+   "show port 
(info|stats|summary|xstats|fdir|stat_qmap|dcb_tc|cap) (port_id|all)\n"
"Display information for port_id, or all.\n\n"
 
"show port X rss reta (size) (mask0,mask1,...)\n"
@@ -7073,6 +7073,9 @@ static void cmd_showportall_parsed(void *parsed_result,
} else if (!strcmp(res->what, "info"))
RTE_ETH_FOREACH_DEV(i)
port_infos_display(i);
+   else if (!strcmp(res->what, "summary"))
+   RTE_ETH_FOREACH_DEV(i)
+   port_summary_display(i,1);
else if (!strcmp(res->what, "stats"))
RTE_ETH_FOREACH_DEV(i)
nic_stats_display(i);
@@ -7100,14 +7103,14 @@ cmdline_parse_token_string_t cmd_showportall_port =
TOKEN_STRING_INITIALIZER(struct cmd_showportall_result, port, "port");
 cmdline_parse_token_string_t cmd_showportall_what =
TOKEN_STRING_INITIALIZER(struct cmd_showportall_result, what,
-"info#stats#xstats#fdir#stat_qmap#dcb_tc#cap");
+
"info#summary#stats#xstats#fdir#stat_qmap#dcb_tc#cap");
 cmdline_parse_token_string_t cmd_showportall_all =
TOKEN_STRING_INITIALIZER(struct cmd_showportall_result, all, "all");
 cmdline_parse_inst_t cmd_showportall = {
.f = cmd_showportall_parsed,
.data = NULL,
.help_str = "show|clear port "
-   "info|stats|xstats|fdir|stat_qmap|dcb_tc|cap all",
+   "info|summary|stats|xstats|fdir|stat_qmap|dcb_tc|cap all",
.tokens = {
(void *)&cmd_showportall_show,
(void *)&cmd_showportall_port,
@@ -7137,6 +7140,8 @@ static void cmd_showport_parsed(void *parsed_result,
nic_xstats_clear(res->portnum);
} else if (!strcmp(res->what, "info"))
port_infos_display(res->portnum);
+   else if (!strcmp(res->what, "summary"))
+   port_summary_display(res->portnum,0);
else if (!strcmp(res->what, "stats"))
nic_stats_display(res->portnum);
else if (!strcmp(res->what, "xstats"))
@@ -7158,7 +7163,7 @@ cmdline_parse_token_string_t cmd_showport_port =
TOKEN_STRING_INITIALIZER(struct cmd_showport_result, port, "port");
 cmdline_parse_token_string_t cmd_showport_what =
TOKEN_STRING_INITIALIZER(struct cmd_showport_result, what,
-"info#stats#xstats#fdir#stat_qmap#dcb_tc#cap");
+
"info#summary#stats#xstats#fdir#stat_qmap#dcb_tc#cap");
 cmdline_parse_token_num_t cmd_showport_portnum =
TOKEN_NUM_INITIALIZER(struct cmd_showport_result, portnum, UINT16);
 
@@ -7166,7 +7171,7 @@ cmdline_parse_inst_t cmd_showport = {
.f = cmd_showport_parsed,
.data = NULL,
.help_str = "show|clear port "
-   "info|stats|xstats|fdir|stat_qmap|dcb_tc|cap "
+   "info|summary|stats|xstats|fdir|stat_qmap|dcb_tc|cap "
"",
.tokens = {
(void *)&cmd_showport_show,
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
old mode 100644
new mode 100755
index 14ccd68..3e049ca
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -518,6 +518,35 @@ port_infos_display(portid_t port_id)
 }
 
 void
+port_summary_display(portid_t port_id, int all)
+{
+   struct ether_addr mac_addr;
+   struct rte_eth_link link;
+   struct rte_eth_dev_info dev_info;
+   int port_number;
+   char name[RTE_ETH_NAME_MAX_LEN];
+
+   if (port_id_is_invalid(port_id, ENABLED_WARN)) {
+   print_valid_ports();
+   return;
+   }
+
+   rte_eth_link_get_nowait(port_id, &link);
+   memset(&dev_info, 0, sizeof(dev_info));
+   rte_eth_dev_info_get(port_id, &dev_info);
+   rte_eth_dev_get_name_by_port(port_id, name);
+   rte_eth_macaddr_get(port_id, &mac_addr);
+   port_number = get_valid_ports();
+   if(all==1 && port_id ==0)
+   p

Re: [dpdk-dev] 18.08 build error on ppc64el - bool as vector type

2018-08-28 Thread Adrien Mazarguil
On Tue, Aug 28, 2018 at 02:38:35PM +0200, Christian Ehrhardt wrote:
> On Tue, Aug 28, 2018 at 1:44 PM Adrien Mazarguil 
> wrote:
> 
> > On Tue, Aug 28, 2018 at 01:30:12PM +0200, Christian Ehrhardt wrote:
> > > On Mon, Aug 27, 2018 at 2:22 PM Adrien Mazarguil <
> > adrien.mazarg...@6wind.com>
> > > wrote:
> > >
> > > > Hi Christian,
> > > >
> > > > On Wed, Aug 22, 2018 at 05:11:41PM +0200, Christian Ehrhardt wrote:
> > > > > Just FYI the simple change hits similar issues later on.
> > > > >
> > > > > The (not really) proposed patch would have to be extended to be as
> > > > > following.
> > > > > We really need a better solution (or somebody has to convince me
> > that my
> > > > > change is better than a band aid).
> > > >
> > > > Thanks for reporting. I've made a quick investigation on my own and
> > believe
> > > > it's a toolchain issue which may affect more than this PMD;
> > potentially all
> > > > users of stdbool.h (C11) on this platform.
> > > >
> > >
> > > Yeah I assumed as much, which is why I was hoping that some of the arch
> > > experts would jump in and say "yeah this is a common thing and correctly
> > > handled like "
> > > I'll continue trying to reach out to people that should know better still
> > > ...
> > >
> > >
> > > > C11's stdbool.h defines a bool macro as _Bool (big B) along with
> > > > true/false. On PPC targets, another file (altivec.h) defines bool as
> > _bool
> > > > (small b) but not true/false:
> > > >
> > > >  #if !defined(__APPLE_ALTIVEC__)
> > > >  /* You are allowed to undef these for C++ compatibility.  */
> > > >  #define vector __vector
> > > >  #define pixel __pixel
> > > >  #define bool __bool
> > > >  #endif
> > > >
> > > > mlx5_nl.c explicitly includes stdbool.h to get the above definitions
> > then
> > > > includes mlx5.h -> rte_ether.h -> ppc_64/rte_memcpy.h -> altivec.h.
> > > >
> > > > For some reason the conflicting bool redefinition doesn't seem to
> > raise any
> > > > warnings, but results in mismatching bool and true/false definitions;
> > an
> > > > integer value cannot be assigned to a bool variable anymore, hence the
> > > > build
> > > > failure.
> > > >
> > > > The inability to assign integer values to bool is, in my opinion, a
> > > > fundamental issue caused by altivec.h. If there is no way to fix this
> > on
> > > > the
> > > > system, there are a couple of workarounds for DPDK, by order of
> > preference:
> > > >
> > > > 1. Always #undef bool after including altivec.h in
> > > >lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h. I do not
> > think
> > > >anyone expects this type to be unusable with true/false or integer
> > > > values
> > > >anyway. The version of altivec.h I have doesn't rely on this macro
> > at
> > > >all so it's probably not a big loss.
> > > >
> > >
> > > The undef of a definition in header A by hedaer B can lead to most
> > > interesting, still broken effects.
> > > If e.g. one does
> > > #include 
> > > #include "mlx5.h"
> > >
> > > or similar then it would undefine that of stdbool as well right?
> > > In any case, the undefine not only would be suspicious it also fails
> > right
> > > away:
> > >
> > > In file included from
> > > /home/ubuntu/deb_dpdk/lib/librte_eal/common/malloc_heap.c:27:
> > > /home/ubuntu/deb_dpdk/lib/librte_eal/common/eal_memalloc.h:30:15:
> > > error: unknown
> > > type name ‘bool’; did you mean ‘_Bool’?
> > >   int socket, bool exact);
> > >   ^~~~
> > >   _Bool
> > > [...]
> > >
> > >
> > >
> > > >Ditto for "pixel" and "vector" keywords. Alternatively you could
> > #define
> > > >__APPLE_ALTIVEC__ before including altivec.h to prevent them from
> > > > getting
> > > >defined in the first place.
> > > >
> > >
> > > Interesting I got plenty of these:
> > > In file included from
> > > /home/ubuntu/deb_dpdk/lib/librte_eal/common/eal_common_options.c:25:
> > > /home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39:
> > > warning:
> > > "__APPLE_ALTIVEC__" redefined
> > > #define __APPLE_ALTIVEC__
> > >
> > > With a few of it being even errors, but the position of the original
> > define
> > > is interesting.
> > > /home/ubuntu/deb_dpdk/debian/build/static-root/include/rte_memcpy.h:39:
> > error:
> > > "__APPLE_ALTIVEC__" redefined [-Werror]
> > > #define __APPLE_ALTIVEC__
> > > : note: this is the location of the previous definition
> > >
> > > So if being a built-in, shouldn't it ALWAYS be defined and never
> > > over-declare the bool type?
> > >
> > > Checking GCC on the platform:
> > > $ gcc -dM -E - < /dev/null | grep ALTI
> > > #define __ALTIVEC__ 1
> > > #define __APPLE_ALTIVEC__ 1
> > >
> > >
> > > I added an #error in the header and dropped all dpdk changes.
> > > if !defined(__APPLE_ALTIVEC__)
> > > /* You are allowed to undef these for C++ compatibility.  */
> > > #error WOULD REDECLARE BOOL
> > > #define vector __vector
> > >
> > > And I get:
> > > gcc -Wp,-MD,./.mlx4.o.d.tmp -Wdate-time -D_FORTIFY_SOURCE=2 -m64 -pthre

[dpdk-dev] [PATCH 0/8] Add Marvell NETA PMD

2018-08-28 Thread Andrzej Ostruszka
This patch series introduces new PMD for Marvell NETA adapters (MVNETA).
See the documentation for more info.

It is split for easier reviewing.

Natalie Samsonov (4):
  net/mvneta: add neta PMD skeleton
  net/mvneta: support for setting of MTU
  net/mvneta: add link update
  net/mvneta: add reset statistics callback

Yelena Krivosheev (4):
  net/mvneta: add Rx/Tx support
  net/mvneta: support for promiscuous
  net/mvneta: add MAC filtering
  net/mvneta: add support for basic stats

 MAINTAINERS   |8 +
 config/common_base|5 +
 devtools/test-build.sh|2 +
 doc/guides/nics/features/mvneta.ini   |   18 +
 doc/guides/nics/mvneta.rst|  182 +++
 doc/guides/rel_notes/release_18_11.rst|4 +
 drivers/common/Makefile   |4 +-
 drivers/common/mvep/rte_mvep_common.h |1 +
 drivers/net/Makefile  |1 +
 drivers/net/meson.build   |1 +
 drivers/net/mvneta/Makefile   |   42 +
 drivers/net/mvneta/meson.build|   27 +
 drivers/net/mvneta/mvneta_ethdev.c| 2023 +
 drivers/net/mvneta/mvneta_ethdev.h|   90 ++
 drivers/net/mvneta/rte_pmd_mvneta_version.map |3 +
 mk/rte.app.mk |7 +-
 16 files changed, 2415 insertions(+), 3 deletions(-)
 create mode 100644 doc/guides/nics/features/mvneta.ini
 create mode 100644 doc/guides/nics/mvneta.rst
 create mode 100644 drivers/net/mvneta/Makefile
 create mode 100644 drivers/net/mvneta/meson.build
 create mode 100644 drivers/net/mvneta/mvneta_ethdev.c
 create mode 100644 drivers/net/mvneta/mvneta_ethdev.h
 create mode 100644 drivers/net/mvneta/rte_pmd_mvneta_version.map

-- 
2.7.4



[dpdk-dev] [PATCH 1/8] net/mvneta: add neta PMD skeleton

2018-08-28 Thread Andrzej Ostruszka
From: Natalie Samsonov 

Add neta pmd driver skeleton providing base for the further
development.

Signed-off-by: Natalie Samsonov 
Signed-off-by: Yelena Krivosheev 
Signed-off-by: Dmitri Epshtein 
Signed-off-by: Zyta Szpak 
Signed-off-by: Andrzej Ostruszka 
---
 MAINTAINERS   |   8 +
 config/common_base|   5 +
 devtools/test-build.sh|   2 +
 doc/guides/nics/features/mvneta.ini   |  18 +
 doc/guides/nics/mvneta.rst| 182 ++
 doc/guides/rel_notes/release_18_11.rst|   4 +
 drivers/common/Makefile   |   4 +-
 drivers/common/mvep/rte_mvep_common.h |   1 +
 drivers/net/Makefile  |   1 +
 drivers/net/meson.build   |   1 +
 drivers/net/mvneta/Makefile   |  42 ++
 drivers/net/mvneta/meson.build|  27 +
 drivers/net/mvneta/mvneta_ethdev.c| 883 ++
 drivers/net/mvneta/mvneta_ethdev.h|  78 +++
 drivers/net/mvneta/rte_pmd_mvneta_version.map |   3 +
 mk/rte.app.mk |   7 +-
 16 files changed, 1263 insertions(+), 3 deletions(-)
 create mode 100644 doc/guides/nics/features/mvneta.ini
 create mode 100644 doc/guides/nics/mvneta.rst
 create mode 100644 drivers/net/mvneta/Makefile
 create mode 100644 drivers/net/mvneta/meson.build
 create mode 100644 drivers/net/mvneta/mvneta_ethdev.c
 create mode 100644 drivers/net/mvneta/mvneta_ethdev.h
 create mode 100644 drivers/net/mvneta/rte_pmd_mvneta_version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index 9fd258f..18858b9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -585,6 +585,14 @@ F: drivers/net/mvpp2/
 F: doc/guides/nics/mvpp2.rst
 F: doc/guides/nics/features/mvpp2.ini
 
+Marvell mvneta
+M: Zyta Szpak 
+M: Dmitri Epshtein 
+M: Natalie Samsonov 
+F: drivers/net/mvneta/
+F: doc/guides/nics/mvneta.rst
+F: doc/guides/nics/features/mvneta.ini
+
 Mellanox mlx4
 M: Matan Azrad 
 M: Shahaf Shuler 
diff --git a/config/common_base b/config/common_base
index 155c7d4..1f8410b 100644
--- a/config/common_base
+++ b/config/common_base
@@ -400,6 +400,11 @@ CONFIG_RTE_LIBRTE_PMD_FAILSAFE=y
 CONFIG_RTE_LIBRTE_MVPP2_PMD=n
 
 #
+# Compile Marvell MVNETA PMD driver
+#
+CONFIG_RTE_LIBRTE_MVNETA_PMD=n
+
+#
 # Compile support for VMBus library
 #
 CONFIG_RTE_LIBRTE_VMBUS=n
diff --git a/devtools/test-build.sh b/devtools/test-build.sh
index 1eee241..2990978 100755
--- a/devtools/test-build.sh
+++ b/devtools/test-build.sh
@@ -182,6 +182,8 @@ config () #   
sed -ri's,(PMD_MVSAM_CRYPTO=)n,\1y,' $1/.config
test -z "$LIBMUSDK_PATH" || \
sed -ri  's,(MVPP2_PMD=)n,\1y,' $1/.config
+   test -z "$LIBMUSDK_PATH" || \
+   sed -ri  's,(MVNETA_PMD=)n,\1y,' $1/.config
build_config_hook $1 $2 $3
 
# Explicit enabler/disabler (uppercase)
diff --git a/doc/guides/nics/features/mvneta.ini 
b/doc/guides/nics/features/mvneta.ini
new file mode 100644
index 000..3f02f84
--- /dev/null
+++ b/doc/guides/nics/features/mvneta.ini
@@ -0,0 +1,18 @@
+;
+; Supported features of the 'mvneta' network poll mode driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+Speed capabilities   = Y
+Link status  = Y
+MTU update   = Y
+Jumbo frame  = Y
+Promiscuous mode = Y
+CRC offload  = Y
+L3 checksum offload  = Y
+L4 checksum offload  = Y
+Packet type parsing  = Y
+Basic stats  = Y
+ARMv8= Y
+Usage doc= Y
diff --git a/doc/guides/nics/mvneta.rst b/doc/guides/nics/mvneta.rst
new file mode 100644
index 000..6e9f511
--- /dev/null
+++ b/doc/guides/nics/mvneta.rst
@@ -0,0 +1,182 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+Copyright(c) 2018 Marvell International Ltd.
+Copyright(c) 2018 Semihalf.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+  * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+  * Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived
+from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR 

[dpdk-dev] [PATCH 2/8] net/mvneta: add Rx/Tx support

2018-08-28 Thread Andrzej Ostruszka
From: Yelena Krivosheev 

Add part of PMD for actual reception/transmission.

Signed-off-by: Yelena Krivosheev 
Signed-off-by: Dmitri Epshtein 
Signed-off-by: Zyta Szpak 
---
 drivers/net/mvneta/mvneta_ethdev.c | 795 +
 drivers/net/mvneta/mvneta_ethdev.h |  11 +
 2 files changed, 806 insertions(+)

diff --git a/drivers/net/mvneta/mvneta_ethdev.c 
b/drivers/net/mvneta/mvneta_ethdev.c
index db43f39..a480b14 100644
--- a/drivers/net/mvneta/mvneta_ethdev.c
+++ b/drivers/net/mvneta/mvneta_ethdev.c
@@ -27,6 +27,10 @@
 
 #define MVNETA_IFACE_NAME_ARG "iface"
 
+#define MVNETA_COOKIE_ADDR_INVALID ~0ULL
+
+#define MVNETA_COOKIE_HIGH_ADDR_SHIFT  (sizeof(neta_cookie_t) * 8)
+#define MVNETA_COOKIE_HIGH_ADDR_MASK   (~0ULL << MVNETA_COOKIE_HIGH_ADDR_SHIFT)
 
 #define MVNETA_MUSDK_DMA_MEMSIZE 41943040 /* (40 * 1024 * 1024) */
 
@@ -49,6 +53,19 @@
 
 #define MVNETA_PKT_EFFEC_OFFS (MRVL_NETA_PKT_OFFS + MV_MH_SIZE)
 
+static uint64_t cookie_addr_high = MVNETA_COOKIE_ADDR_INVALID;
+static uint16_t rx_desc_free_thresh = MRVL_NETA_BUF_RELEASE_BURST_SIZE_MIN;
+
+#define MVNETA_SET_COOKIE_HIGH_ADDR(addr) {\
+   if (unlikely(cookie_addr_high == MVNETA_COOKIE_ADDR_INVALID))   \
+   cookie_addr_high =  \
+   (uint64_t)addr & MVNETA_COOKIE_HIGH_ADDR_MASK;  \
+}
+
+#define MVNETA_CHECK_COOKIE_HIGH_ADDR(addr)\
+   ((likely(cookie_addr_high ==\
+   ((uint64_t)addr & MVNETA_COOKIE_HIGH_ADDR_MASK))) ? 1 : 0)
+
 int mvneta_logtype;
 
 static const char * const valid_args[] = {
@@ -61,6 +78,24 @@ struct mvneta_ifnames {
int idx;
 };
 
+/*
+ * To use buffer harvesting based on loopback port shadow queue structure
+ * was introduced for buffers information bookkeeping.
+ *
+ * Before sending the packet, related buffer information is
+ * stored in shadow queue. After packet is transmitted no longer used
+ * packet buffer is released back to it's original hardware pool,
+ * on condition it originated from interface.
+ * In case it  was generated by application itself i.e: mbuf->port field is
+ * 0xff then its released to software mempool.
+ */
+struct mvneta_shadow_txq {
+   int head;   /* write index - used when sending buffers */
+   int tail;   /* read index - used when releasing buffers */
+   u16 size;   /* queue occupied size */
+   struct neta_buff_inf ent[MRVL_NETA_TX_SHADOWQ_SIZE]; /* q entries */
+};
+
 
 struct mvneta_rxq {
struct mvneta_priv *priv;
@@ -80,6 +115,7 @@ struct mvneta_txq {
int queue_id;
int port_id;
uint64_t bytes_sent;
+   struct mvneta_shadow_txq shadow_txq;
int tx_deferred_start;
 };
 
@@ -87,6 +123,248 @@ static int mvneta_dev_num;
 static int mvneta_lcore_first;
 static int mvneta_lcore_last;
 
+static inline void
+mvneta_fill_shadowq(struct mvneta_shadow_txq *sq, struct rte_mbuf *buf)
+{
+   sq->ent[sq->head].cookie = (uint64_t)buf;
+   sq->ent[sq->head].addr = buf ?
+   rte_mbuf_data_iova_default(buf) : 0;
+
+   sq->head = (sq->head + 1) & MRVL_NETA_TX_SHADOWQ_MASK;
+   sq->size++;
+}
+
+static inline void
+mvneta_fill_desc(struct neta_ppio_desc *desc, struct rte_mbuf *buf)
+{
+   neta_ppio_outq_desc_reset(desc);
+   neta_ppio_outq_desc_set_phys_addr(desc, rte_pktmbuf_iova(buf));
+   neta_ppio_outq_desc_set_pkt_offset(desc, 0);
+   neta_ppio_outq_desc_set_pkt_len(desc, rte_pktmbuf_data_len(buf));
+}
+
+static inline int
+mvneta_buffs_refill(struct mvneta_priv *priv, struct mvneta_rxq *rxq, u16 *num)
+{
+   struct rte_mbuf *mbufs[MRVL_NETA_BUF_RELEASE_BURST_SIZE_MAX];
+   struct neta_buff_inf entries[MRVL_NETA_BUF_RELEASE_BURST_SIZE_MAX];
+   int i, ret;
+   uint16_t nb_desc = *num;
+
+   ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, nb_desc);
+   if (ret) {
+   MVNETA_LOG(ERR, "Failed to allocate %u mbufs.\n", nb_desc);
+   *num = 0;
+   return -1;
+   }
+
+   MVNETA_SET_COOKIE_HIGH_ADDR(mbufs[0]);
+
+   for (i = 0; i < nb_desc; i++) {
+   if (unlikely(!MVNETA_CHECK_COOKIE_HIGH_ADDR(mbufs[i]))) {
+   MVNETA_LOG(ERR,
+   "mbuf virt high addr 0x%lx out of range 
0x%lx\n",
+   (uint64_t)mbufs[i] >> 32,
+   cookie_addr_high >> 32);
+   *num = 0;
+   goto out;
+   }
+   entries[i].addr = rte_mbuf_data_iova_default(mbufs[i]);
+   entries[i].cookie = (neta_cookie_t)(uint64_t)mbufs[i];
+   }
+   neta_ppio_inq_put_buffs(priv->ppio, rxq->queue_id, entries, num);
+
+out:
+   for (i = *num; i < nb_desc; i++)
+   rte_pktmbuf_free(mbufs[i]);
+
+   return 0;
+}
+
+
+/**
+ * Allocate buffers from mempool
+ * and store a

[dpdk-dev] [PATCH 5/8] net/mvneta: support for promiscuous

2018-08-28 Thread Andrzej Ostruszka
From: Yelena Krivosheev 

Add callbacks for enabling/disabling of promiscuous mode.

Signed-off-by: Yelena Krivosheev 
---
 drivers/net/mvneta/mvneta_ethdev.c | 54 ++
 1 file changed, 54 insertions(+)

diff --git a/drivers/net/mvneta/mvneta_ethdev.c 
b/drivers/net/mvneta/mvneta_ethdev.c
index d1c180c..3236e93 100644
--- a/drivers/net/mvneta/mvneta_ethdev.c
+++ b/drivers/net/mvneta/mvneta_ethdev.c
@@ -1534,6 +1534,58 @@ mvneta_link_update(struct rte_eth_dev *dev, int 
wait_to_complete __rte_unused)
 }
 
 /**
+ * DPDK callback to enable promiscuous mode.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+static void
+mvneta_promiscuous_enable(struct rte_eth_dev *dev)
+{
+   struct mvneta_priv *priv = dev->data->dev_private;
+   int ret, en;
+
+   if (!priv->ppio)
+   return;
+
+   neta_ppio_get_promisc(priv->ppio, &en);
+   if (en) {
+   MVNETA_LOG(INFO, "Promiscuous already enabled\n");
+   return;
+   }
+
+   ret = neta_ppio_set_promisc(priv->ppio, 1);
+   if (ret)
+   MVNETA_LOG(ERR, "Failed to enable promiscuous mode\n");
+}
+
+/**
+ * DPDK callback to disable allmulticast mode.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+static void
+mvneta_promiscuous_disable(struct rte_eth_dev *dev)
+{
+   struct mvneta_priv *priv = dev->data->dev_private;
+   int ret, en;
+
+   if (!priv->ppio)
+   return;
+
+   neta_ppio_get_promisc(priv->ppio, &en);
+   if (!en) {
+   MVNETA_LOG(INFO, "Promiscuous already disabled\n");
+   return;
+   }
+
+   ret = neta_ppio_set_promisc(priv->ppio, 0);
+   if (ret)
+   MVNETA_LOG(ERR, "Failed to disable promiscuous mode\n");
+}
+
+/**
  * DPDK callback to set the primary MAC address.
  *
  * @param dev
@@ -1567,6 +1619,8 @@ static const struct eth_dev_ops mvneta_ops = {
.dev_set_link_down = mvneta_dev_set_link_down,
.dev_close = mvneta_dev_close,
.link_update = mvneta_link_update,
+   .promiscuous_enable = mvneta_promiscuous_enable,
+   .promiscuous_disable = mvneta_promiscuous_disable,
.mac_addr_set = mvneta_mac_addr_set,
.mtu_set = mvneta_mtu_set,
.dev_infos_get = mvneta_dev_infos_get,
-- 
2.7.4



[dpdk-dev] [PATCH 7/8] net/mvneta: add support for basic stats

2018-08-28 Thread Andrzej Ostruszka
From: Yelena Krivosheev 

Add support for getting of basic statistics for the driver.

Signed-off-by: Yelena Krivosheev 
Signed-off-by: Natalie Samsonov 
---
 drivers/net/mvneta/mvneta_ethdev.c | 47 ++
 1 file changed, 47 insertions(+)

diff --git a/drivers/net/mvneta/mvneta_ethdev.c 
b/drivers/net/mvneta/mvneta_ethdev.c
index f19d2ea..9bff6c3 100644
--- a/drivers/net/mvneta/mvneta_ethdev.c
+++ b/drivers/net/mvneta/mvneta_ethdev.c
@@ -1678,6 +1678,52 @@ mvneta_mac_addr_set(struct rte_eth_dev *dev, struct 
ether_addr *mac_addr)
return 0;
 }
 
+/**
+ * DPDK callback to get device statistics.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param stats
+ *   Stats structure output buffer.
+ *
+ * @return
+ *   0 on success, negative error value otherwise.
+ */
+static int
+mvneta_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+   struct mvneta_priv *priv = dev->data->dev_private;
+   struct neta_ppio_statistics ppio_stats;
+   unsigned int ret;
+
+   if (!priv->ppio)
+   return -EPERM;
+
+   ret = neta_ppio_get_statistics(priv->ppio, &ppio_stats);
+   if (unlikely(ret)) {
+   MVNETA_LOG(ERR, "Failed to update port statistics\n");
+   return ret;
+   }
+
+   stats->ipackets += ppio_stats.rx_packets +
+   ppio_stats.rx_broadcast_packets +
+   ppio_stats.rx_multicast_packets;
+   stats->opackets += ppio_stats.tx_packets +
+   ppio_stats.tx_broadcast_packets +
+   ppio_stats.tx_multicast_packets;
+   stats->ibytes += ppio_stats.rx_bytes;
+   stats->obytes += ppio_stats.tx_bytes;
+   stats->imissed += ppio_stats.rx_discard +
+ ppio_stats.rx_overrun;
+
+   stats->ierrors = ppio_stats.rx_packets_err +
+   ppio_stats.rx_errors +
+   ppio_stats.rx_crc_error;
+   stats->oerrors = ppio_stats.tx_errors;
+
+   return 0;
+}
+
 static const struct eth_dev_ops mvneta_ops = {
.dev_configure = mvneta_dev_configure,
.dev_start = mvneta_dev_start,
@@ -1692,6 +1738,7 @@ static const struct eth_dev_ops mvneta_ops = {
.mac_addr_add = mvneta_mac_addr_add,
.mac_addr_set = mvneta_mac_addr_set,
.mtu_set = mvneta_mtu_set,
+   .stats_get = mvneta_stats_get,
.dev_infos_get = mvneta_dev_infos_get,
.dev_supported_ptypes_get = mvneta_dev_supported_ptypes_get,
.rxq_info_get = mvneta_rxq_info_get,
-- 
2.7.4



[dpdk-dev] [PATCH 6/8] net/mvneta: add MAC filtering

2018-08-28 Thread Andrzej Ostruszka
From: Yelena Krivosheev 

Add callbacks for adding/removing MAC addresses.

Signed-off-by: Yelena Krivosheev 
Signed-off-by: Natalie Samsonov 
---
 drivers/net/mvneta/mvneta_ethdev.c | 69 ++
 1 file changed, 69 insertions(+)

diff --git a/drivers/net/mvneta/mvneta_ethdev.c 
b/drivers/net/mvneta/mvneta_ethdev.c
index 3236e93..f19d2ea 100644
--- a/drivers/net/mvneta/mvneta_ethdev.c
+++ b/drivers/net/mvneta/mvneta_ethdev.c
@@ -1586,6 +1586,73 @@ mvneta_promiscuous_disable(struct rte_eth_dev *dev)
 }
 
 /**
+ * DPDK callback to remove a MAC address.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param index
+ *   MAC address index.
+ */
+static void
+mvneta_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
+{
+   struct mvneta_priv *priv = dev->data->dev_private;
+   char buf[ETHER_ADDR_FMT_SIZE];
+   int ret;
+
+   if (!priv->ppio)
+   return;
+
+   ret = neta_ppio_remove_mac_addr(priv->ppio,
+  dev->data->mac_addrs[index].addr_bytes);
+   if (ret) {
+   ether_format_addr(buf, sizeof(buf),
+ &dev->data->mac_addrs[index]);
+   MVNETA_LOG(ERR, "Failed to remove mac %s\n", buf);
+   }
+}
+
+/**
+ * DPDK callback to add a MAC address.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mac_addr
+ *   MAC address to register.
+ * @param index
+ *   MAC address index.
+ * @param vmdq
+ *   VMDq pool index to associate address with (unused).
+ *
+ * @return
+ *   0 on success, negative error value otherwise.
+ */
+static int
+mvneta_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
+ uint32_t index, uint32_t vmdq __rte_unused)
+{
+   struct mvneta_priv *priv = dev->data->dev_private;
+   char buf[ETHER_ADDR_FMT_SIZE];
+   int ret;
+
+   if (index == 0)
+   /* For setting index 0, mrvl_mac_addr_set() should be used.*/
+   return -1;
+
+   if (!priv->ppio)
+   return 0;
+
+   ret = neta_ppio_add_mac_addr(priv->ppio, mac_addr->addr_bytes);
+   if (ret) {
+   ether_format_addr(buf, sizeof(buf), mac_addr);
+   MVNETA_LOG(ERR, "Failed to add mac %s\n", buf);
+   return -1;
+   }
+
+   return 0;
+}
+
+/**
  * DPDK callback to set the primary MAC address.
  *
  * @param dev
@@ -1621,6 +1688,8 @@ static const struct eth_dev_ops mvneta_ops = {
.link_update = mvneta_link_update,
.promiscuous_enable = mvneta_promiscuous_enable,
.promiscuous_disable = mvneta_promiscuous_disable,
+   .mac_addr_remove = mvneta_mac_addr_remove,
+   .mac_addr_add = mvneta_mac_addr_add,
.mac_addr_set = mvneta_mac_addr_set,
.mtu_set = mvneta_mtu_set,
.dev_infos_get = mvneta_dev_infos_get,
-- 
2.7.4



[dpdk-dev] [PATCH 4/8] net/mvneta: add link update

2018-08-28 Thread Andrzej Ostruszka
From: Natalie Samsonov 

Add callback for updating information about link status/info.

Signed-off-by: Natalie Samsonov 
---
 drivers/net/mvneta/mvneta_ethdev.c | 71 ++
 1 file changed, 71 insertions(+)

diff --git a/drivers/net/mvneta/mvneta_ethdev.c 
b/drivers/net/mvneta/mvneta_ethdev.c
index eded4eb..d1c180c 100644
--- a/drivers/net/mvneta/mvneta_ethdev.c
+++ b/drivers/net/mvneta/mvneta_ethdev.c
@@ -1464,6 +1464,76 @@ mvneta_dev_close(struct rte_eth_dev *dev)
 }
 
 /**
+ * DPDK callback to retrieve physical link information.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param wait_to_complete
+ *   Wait for request completion (ignored).
+ *
+ * @return
+ *   0 on success, negative error value otherwise.
+ */
+static int
+mvneta_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
+{
+   /*
+* TODO
+* once MUSDK provides necessary API use it here
+*/
+   struct mvneta_priv *priv = dev->data->dev_private;
+   struct ethtool_cmd edata;
+   struct ifreq req;
+   int ret, fd, link_up;
+
+   if (!priv->ppio)
+   return -EPERM;
+
+   edata.cmd = ETHTOOL_GSET;
+
+   strcpy(req.ifr_name, dev->data->name);
+   req.ifr_data = (void *)&edata;
+
+   fd = socket(AF_INET, SOCK_DGRAM, 0);
+   if (fd == -1)
+   return -EFAULT;
+   ret = ioctl(fd, SIOCETHTOOL, &req);
+   if (ret == -1) {
+   close(fd);
+   return -EFAULT;
+   }
+
+   close(fd);
+
+   switch (ethtool_cmd_speed(&edata)) {
+   case SPEED_10:
+   dev->data->dev_link.link_speed = ETH_SPEED_NUM_10M;
+   break;
+   case SPEED_100:
+   dev->data->dev_link.link_speed = ETH_SPEED_NUM_100M;
+   break;
+   case SPEED_1000:
+   dev->data->dev_link.link_speed = ETH_SPEED_NUM_1G;
+   break;
+   case SPEED_2500:
+   dev->data->dev_link.link_speed = ETH_SPEED_NUM_2_5G;
+   break;
+   default:
+   dev->data->dev_link.link_speed = ETH_SPEED_NUM_NONE;
+   }
+
+   dev->data->dev_link.link_duplex = edata.duplex ? ETH_LINK_FULL_DUPLEX :
+ETH_LINK_HALF_DUPLEX;
+   dev->data->dev_link.link_autoneg = edata.autoneg ? ETH_LINK_AUTONEG :
+  ETH_LINK_FIXED;
+
+   neta_ppio_get_link_state(priv->ppio, &link_up);
+   dev->data->dev_link.link_status = link_up ? ETH_LINK_UP : ETH_LINK_DOWN;
+
+   return 0;
+}
+
+/**
  * DPDK callback to set the primary MAC address.
  *
  * @param dev
@@ -1496,6 +1566,7 @@ static const struct eth_dev_ops mvneta_ops = {
.dev_set_link_up = mvneta_dev_set_link_up,
.dev_set_link_down = mvneta_dev_set_link_down,
.dev_close = mvneta_dev_close,
+   .link_update = mvneta_link_update,
.mac_addr_set = mvneta_mac_addr_set,
.mtu_set = mvneta_mtu_set,
.dev_infos_get = mvneta_dev_infos_get,
-- 
2.7.4



[dpdk-dev] [PATCH 3/8] net/mvneta: support for setting of MTU

2018-08-28 Thread Andrzej Ostruszka
From: Natalie Samsonov 

Add callback for setting of MTU.

Signed-off-by: Natalie Samsonov 
Signed-off-by: Zyta Szpak 
---
 drivers/net/mvneta/mvneta_ethdev.c | 78 ++
 1 file changed, 78 insertions(+)

diff --git a/drivers/net/mvneta/mvneta_ethdev.c 
b/drivers/net/mvneta/mvneta_ethdev.c
index a480b14..eded4eb 100644
--- a/drivers/net/mvneta/mvneta_ethdev.c
+++ b/drivers/net/mvneta/mvneta_ethdev.c
@@ -1032,6 +1032,77 @@ static void mvneta_txq_info_get(struct rte_eth_dev *dev, 
uint16_t tx_queue_id,
 }
 
 /**
+ * DPDK callback to change the MTU.
+ *
+ * Setting the MTU affects hardware MRU (packets larger than the MRU
+ * will be dropped).
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mtu
+ *   New MTU.
+ *
+ * @return
+ *   0 on success, negative error value otherwise.
+ */
+static int
+mvneta_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+   struct mvneta_priv *priv = dev->data->dev_private;
+   uint16_t mbuf_data_size = 0; /* SW buffer size */
+   uint16_t mru;
+   int ret;
+
+   mru = MRVL_NETA_MTU_TO_MRU(mtu);
+   /*
+* min_rx_buf_size is equal to mbuf data size
+* if pmd didn't set it differently
+*/
+   mbuf_data_size = dev->data->min_rx_buf_size - RTE_PKTMBUF_HEADROOM;
+   /* Prevent PMD from:
+* - setting mru greater than the mbuf size resulting in
+* hw and sw buffer size mismatch
+* - setting mtu that requires the support of scattered packets
+* when this feature has not been enabled/supported so far.
+*/
+   if (!dev->data->scattered_rx &&
+   (mru + MRVL_NETA_PKT_OFFS > mbuf_data_size)) {
+   mru = mbuf_data_size - MRVL_NETA_PKT_OFFS;
+   mtu = MRVL_NETA_MRU_TO_MTU(mru);
+   MVNETA_LOG(WARNING, "MTU too big, max MTU possible limitted by"
+   " current mbuf size: %u. Set MTU to %u, MRU to %u\n",
+   mbuf_data_size, mtu, mru);
+   }
+
+   if (mtu < ETHER_MIN_MTU || mru > MVNETA_PKT_SIZE_MAX) {
+   MVNETA_LOG(ERR, "Invalid MTU [%u] or MRU [%u]\n", mtu, mru);
+   return -EINVAL;
+   }
+
+   dev->data->mtu = mtu;
+   dev->data->dev_conf.rxmode.max_rx_pkt_len = mru - MV_MH_SIZE;
+
+   if (!priv->ppio)
+   /* It is OK. New MTU will be set later on mvneta_dev_start */
+   return 0;
+
+   ret = neta_ppio_set_mru(priv->ppio, mru);
+   if (ret) {
+   MVNETA_LOG(ERR, "Failed to change MRU\n");
+   return ret;
+   }
+
+   ret = neta_ppio_set_mtu(priv->ppio, mtu);
+   if (ret) {
+   MVNETA_LOG(ERR, "Failed to change MTU\n");
+   return ret;
+   }
+   MVNETA_LOG(INFO, "MTU changed to %u, MRU = %u\n", mtu, mru);
+
+   return 0;
+}
+
+/**
  * DPDK callback to bring the link up.
  *
  * @param dev
@@ -1305,6 +1376,12 @@ mvneta_dev_start(struct rte_eth_dev *dev)
}
}
 
+   ret = mvneta_mtu_set(dev, dev->data->mtu);
+   if (ret) {
+   MVNETA_LOG(ERR, "Failed to set MTU %d\n", dev->data->mtu);
+   goto out;
+   }
+
ret = mvneta_dev_set_link_up(dev);
if (ret) {
MVNETA_LOG(ERR, "Failed to set link up\n");
@@ -1420,6 +1497,7 @@ static const struct eth_dev_ops mvneta_ops = {
.dev_set_link_down = mvneta_dev_set_link_down,
.dev_close = mvneta_dev_close,
.mac_addr_set = mvneta_mac_addr_set,
+   .mtu_set = mvneta_mtu_set,
.dev_infos_get = mvneta_dev_infos_get,
.dev_supported_ptypes_get = mvneta_dev_supported_ptypes_get,
.rxq_info_get = mvneta_rxq_info_get,
-- 
2.7.4



[dpdk-dev] [PATCH 8/8] net/mvneta: add reset statistics callback

2018-08-28 Thread Andrzej Ostruszka
From: Natalie Samsonov 

Add support for reseting of driver statistics.

Singed-off-by: Natalie Samsonov 
---
 drivers/net/mvneta/mvneta_ethdev.c | 40 +++---
 drivers/net/mvneta/mvneta_ethdev.h |  1 +
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/drivers/net/mvneta/mvneta_ethdev.c 
b/drivers/net/mvneta/mvneta_ethdev.c
index 9bff6c3..9bad2e3 100644
--- a/drivers/net/mvneta/mvneta_ethdev.c
+++ b/drivers/net/mvneta/mvneta_ethdev.c
@@ -1707,23 +1707,48 @@ mvneta_stats_get(struct rte_eth_dev *dev, struct 
rte_eth_stats *stats)
 
stats->ipackets += ppio_stats.rx_packets +
ppio_stats.rx_broadcast_packets +
-   ppio_stats.rx_multicast_packets;
+   ppio_stats.rx_multicast_packets -
+   priv->prev_stats.ipackets;
stats->opackets += ppio_stats.tx_packets +
ppio_stats.tx_broadcast_packets +
-   ppio_stats.tx_multicast_packets;
-   stats->ibytes += ppio_stats.rx_bytes;
-   stats->obytes += ppio_stats.tx_bytes;
+   ppio_stats.tx_multicast_packets -
+   priv->prev_stats.opackets;
+   stats->ibytes += ppio_stats.rx_bytes - priv->prev_stats.ibytes;
+   stats->obytes += ppio_stats.tx_bytes - priv->prev_stats.obytes;
stats->imissed += ppio_stats.rx_discard +
- ppio_stats.rx_overrun;
+ ppio_stats.rx_overrun -
+ priv->prev_stats.imissed;
 
stats->ierrors = ppio_stats.rx_packets_err +
ppio_stats.rx_errors +
-   ppio_stats.rx_crc_error;
-   stats->oerrors = ppio_stats.tx_errors;
+   ppio_stats.rx_crc_error -
+   priv->prev_stats.ierrors;
+   stats->oerrors = ppio_stats.tx_errors - priv->prev_stats.oerrors;
 
return 0;
 }
 
+/**
+ * DPDK callback to clear device statistics.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+static void
+mvneta_stats_reset(struct rte_eth_dev *dev)
+{
+   struct mvneta_priv *priv = dev->data->dev_private;
+   unsigned int ret;
+
+   if (!priv->ppio)
+   return;
+
+   ret = mvneta_stats_get(dev, &priv->prev_stats);
+   if (unlikely(ret))
+   RTE_LOG(ERR, PMD, "Failed to reset port statistics\n");
+}
+
+
 static const struct eth_dev_ops mvneta_ops = {
.dev_configure = mvneta_dev_configure,
.dev_start = mvneta_dev_start,
@@ -1739,6 +1764,7 @@ static const struct eth_dev_ops mvneta_ops = {
.mac_addr_set = mvneta_mac_addr_set,
.mtu_set = mvneta_mtu_set,
.stats_get = mvneta_stats_get,
+   .stats_reset = mvneta_stats_reset,
.dev_infos_get = mvneta_dev_infos_get,
.dev_supported_ptypes_get = mvneta_dev_supported_ptypes_get,
.rxq_info_get = mvneta_rxq_info_get,
diff --git a/drivers/net/mvneta/mvneta_ethdev.h 
b/drivers/net/mvneta/mvneta_ethdev.h
index a05566d..6aed170 100644
--- a/drivers/net/mvneta/mvneta_ethdev.h
+++ b/drivers/net/mvneta/mvneta_ethdev.h
@@ -77,6 +77,7 @@ struct mvneta_priv {
uint16_t nb_rx_queues;
 
uint64_t rate_max;
+   struct rte_eth_stats prev_stats;
 };
 
 /** Current log type. */
-- 
2.7.4



Re: [dpdk-dev] [PATCH 03/11] telemetry: add client feature and sockets

2018-08-28 Thread Hunt, David



On 24/8/2018 12:27 AM, Stephen Hemminger wrote:

On Thu, 23 Aug 2018 13:08:05 +0100
Ciara Power  wrote:


This patch introduces clients to the telemetry API.

When a client makes a connection through the initial telemetry
socket, they can send a message through the socket to be
parsed. Register messages are expected through this socket, to
enable clients to register and have a client socket setup for
future communications.

A TAILQ is used to store all clients information. Using this, the
client sockets are polled for messages, which will later be parsed
and dealt with accordingly.

Functionality that make use of the client sockets were introduced
in this patch also, such as writing to client sockets, and sending
error responses.

Signed-off-by: Ciara Power 
Signed-off-by: Brian Archbold 

Rather than using the rather heavyweight jansson library and creating
an additional dependency on an external library; may I recommend reusing
the json_writer library (I wrote) that is part of iproute2 and much
simpler.


https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/tree/lib/json_writer.c


Hi Stephen, Ciara,

I'm about to push another patchset to the mailing list in the next few 
days which
also makes use of Jansson. I'm parsing an incoming JSON string 
containing power
management info. The Jansson package comes pre-installed in many 
operating systems,

although you do indeed need to install libjansson-dev to build against it.
I would certainly like to see the community accept its use.

Regards,
Dave.


Re: [dpdk-dev] [PATCH v2] net/mlx: add meson build support

2018-08-28 Thread Bruce Richardson
Thanks for this, comments inline below.

/Bruce

On Mon, Aug 27, 2018 at 02:42:25PM +0200, Nelio Laranjeiro wrote:
> Mellanox drivers remains un-compiled by default due to third party
> libraries dependencies.  They can be enabled through:
> - enable_driver_mlx{4,5}=true or
> - enable_driver_mlx{4,5}_glue=true
> depending on the needs.

The big reason why we wanted a new build system was to move away from this
sort of static configuration. Instead, detect if the requirements as
present and build the driver if you can.

> 
> To avoid modifying the whole sources and keep the compatibility with
> current build systems (e.g. make), the mlx{4,5}_autoconf.h is still
> generated by invoking DPDK scripts though meson's run_command() instead
> of using has_types, has_members, ... commands.
> 
> Meson will try to find the required external libraries.  When they are
> not installed system wide, they can be provided though CFLAGS, LDFLAGS
> and LD_LIBRARY_PATH environment variables, example (considering
> RDMA-Core is installed in /tmp/rdma-core):
> 
>  # CLFAGS=-I/tmp/rdma-core/build/include \
>LDFLAGS=-L/tmp/rdma-core/build/lib \
>LD_LIBRARY_PATH=/tmp/rdma-core/build/lib \
>meson -Denable_driver_mlx4=true output
> 
>  # CLFAGS=-I/tmp/rdma-core/build/include \
>LDFLAGS=-L/tmp/rdma-core/build/lib \
>LD_LIBRARY_PATH=/tmp/rdma-core/build/lib \
>ninja -C output install

Once the CFLAGS/LDFLAGS are passed to meson, they should not be needed for
ninja. The LD_LIBRARY_PATH might be - I'm not sure about that one! :-)

> 
> Signed-off-by: Nelio Laranjeiro 
> 
> ---
> 
> Changes in v2:
> 
> - dropped patch https://patches.dpdk.org/patch/43897/
> - remove extra_{cflags,ldflags} as already honored by meson through
> environment variables.
> ---
>  drivers/net/meson.build  |   2 +
>  drivers/net/mlx4/meson.build |  94 ++
>  drivers/net/mlx5/meson.build | 545 +++
>  meson_options.txt|   8 +
>  4 files changed, 649 insertions(+)
>  create mode 100644 drivers/net/mlx4/meson.build
>  create mode 100644 drivers/net/mlx5/meson.build
> 
> diff --git a/drivers/net/meson.build b/drivers/net/meson.build
> index 9c28ed4da..c7a2d0e7d 100644
> --- a/drivers/net/meson.build
> +++ b/drivers/net/meson.build
> @@ -18,6 +18,8 @@ drivers = ['af_packet',
>   'ixgbe',
>   'kni',
>   'liquidio',
> + 'mlx4',
> + 'mlx5',
>   'mvpp2',
>   'netvsc',
>   'nfp',
> diff --git a/drivers/net/mlx4/meson.build b/drivers/net/mlx4/meson.build
> new file mode 100644
> index 0..debaca5b6
> --- /dev/null
> +++ b/drivers/net/mlx4/meson.build
> @@ -0,0 +1,94 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright 2018 6WIND S.A.
> +# Copyright 2018 Mellanox Technologies, Ltd
> +
> +# As there is no more configuration file to activate/configure the PMD it 
> will
> +# use some variables here to configure it.
> +pmd_dlopen = get_option('enable_driver_mlx4_glue')
> +build = get_option('enable_driver_mlx4') or pmd_dlopen

As stated above, I believe this should be based upon whether you find the
"mnl", "mlx4" and "ibverbs" libraries. If we start adding back in static
options for every driver, then we'll be back to having a mass of config
options like we had before.

> +# dpdk_conf.set('RTE_LIBRTE_MLX4_DEBUG', 1)
> +# Glue configuratin
> +LIB_GLUE_BASE = 'librte_pmd_mlx4_glue.so'
> +LIB_GLUE_VERSION = '18.02.0'
> +LIB_GLUE = LIB_GLUE_BASE + '.' + LIB_GLUE_VERSION
> +if pmd_dlopen
> +dpdk_conf.set('RTE_LIBRTE_MLX4_DLOPEN_DEPS', 1)
> +cflags += [
> +'-DMLX4_GLUE="@0@"'.format(LIB_GLUE),
> +'-DMLX4_GLUE_VERSION="@0@"'.format(LIB_GLUE_VERSION),
> +'-ldl',
> +]

Is the '-ldl' flag necessary, given that for Linux it is added as a
standard linker flag in config/meson.build? If it is needed, does it get
passed through to the pkgconfig file etc. appropriately.

> +endif
> +# Compile PMD
> +if build

If you make the build value depend on the results of the find_library
calls, you can remove this conditional and de-dent the rest of the code,
since the assigned values will be ignored at the higher level.

> +allow_experimental_apis = true
> +ext_deps += [
> +cc.find_library('mnl'),
> +cc.find_library('mlx4'),
> +cc.find_library('ibverbs'),
> +]
> +sources = files(
> +   'mlx4.c',
> +   'mlx4_ethdev.c',
> +   'mlx4_flow.c',
> +   'mlx4_intr.c',
> +   'mlx4_mr.c',
> +   'mlx4_rxq.c',
> +   'mlx4_rxtx.c',
> +   'mlx4_txq.c',
> +   'mlx4_utils.c',
> +)
> +if not pmd_dlopen
> +sources += files('mlx4_glue.c')
> +endif
> +cflags += [
> +   '-O3',
> +   '-Wall',
> +   '-Wextra',
> +   '-g',

Please don't add these flags into y

Re: [dpdk-dev] IXGBE throughput loss with 4+ cores

2018-08-28 Thread Stephen Hemminger
On Tue, 28 Aug 2018 17:34:27 +0430
Saber Rezvani  wrote:

> Hi,
> 
> 
> I have run multi_process/symmetric_mp example in DPDK example directory.
> For a one process its throughput is line rate but as I increase the
> number of cores I see decrease in throughput. For example, If the number
> of queues set to 4 and each queue assigns to a single core, then the
> throughput will be something about 9.4. if 8 queues, then throughput
> will be 8.5.
> 
> I have read the following, but it was not convincing.
> 
> http://mails.dpdk.org/archives/dev/2015-October/024960.html
> 
> 
> I am eagerly looking forward to hearing from you, all.
> 
> 
> Best wishes,
> 
> Saber
> 
> 

Not completely surprising. If you have more cores than packet line rate
then the number of packets returned for each call to rx_burst will be less.
With large number of cores, most of the time will be spent doing reads of
PCI registers for no packets!


Re: [dpdk-dev] [PATCH 02/11] telemetry: add initial connection socket

2018-08-28 Thread Gaëtan Rivet
Hi Ciara,

On Thu, Aug 23, 2018 at 01:08:04PM +0100, Ciara Power wrote:
> This patch adds the telemetry UNIX socket. It is used to
> allow connections from external clients.
> 
> On the initial connection from a client, ethdev stats are
> registered in the metrics library, to allow for their retrieval
> at a later stage.
> 
> Signed-off-by: Ciara Power 
> Signed-off-by: Brian Archbold 
> ---
>  lib/librte_telemetry/rte_telemetry.c  | 205 
> ++
>  lib/librte_telemetry/rte_telemetry_internal.h |   4 +
>  2 files changed, 209 insertions(+)
> 
> diff --git a/lib/librte_telemetry/rte_telemetry.c 
> b/lib/librte_telemetry/rte_telemetry.c
> index 8d7b0e3..f984929 100644
> --- a/lib/librte_telemetry/rte_telemetry.c
> +++ b/lib/librte_telemetry/rte_telemetry.c
> @@ -3,21 +3,159 @@
>   */
>  
>  #include 
> +#include 
> +#include 
> +#include 
>  
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "rte_telemetry.h"
>  #include "rte_telemetry_internal.h"
>  
>  #define SLEEP_TIME 10
>  
> +#define DEFAULT_DPDK_PATH "/var/run/.rte_telemetry"
> +
> +const char *socket_path = DEFAULT_DPDK_PATH;
>  static telemetry_impl *static_telemetry;
>  
> +int32_t
> +rte_telemetry_check_port_activity(int port_id)
> +{
> + int pid;
> +
> + RTE_ETH_FOREACH_DEV(pid) {
> + if (pid == port_id)
> + return 1;
> + }
> + TELEMETRY_LOG_ERR("Error - port_id: %d is invalid, not active\n",
> + port_id);
> + return 0;
> +}
> +

This function seems more about ethdev than telemetry.
Maybe add it as part of librte_ethdev.

The "active" semantic is blurry however. With this implementation, this
is checking that the device is currently attached and owned by the
default user. Should telemetry be limited to devices owned by default
user?

Also, this function does not seem used in this patch, can it be added
when used?

> +static int32_t
> +rte_telemetry_reg_ethdev_to_metrics(uint16_t port_id)

"_to" might not be necessary.

> +{
> + int ret, num_xstats, start_index, i;
> + struct rte_eth_xstat *eth_xstats;
> +
> + if (!rte_eth_dev_is_valid_port(port_id)) {
> + TELEMETRY_LOG_ERR("Error - port_id: %d is invalid\n", port_id);
> + return -EINVAL;
> + }
> +
> + num_xstats = rte_eth_xstats_get(port_id, NULL, 0);
> + if (num_xstats < 0) {
> + TELEMETRY_LOG_ERR("Error - rte_eth_xstats_get(%u) failed:"
> + " %d\n", port_id, num_xstats);

I guess there isn't really a consensus yet, as the checkpatch.sh tool
might be misconfigured, but the cesura is very awkward here.

Same for other logs.

> + return -EPERM;
> + }
> +
> + eth_xstats = malloc(sizeof(struct rte_eth_xstat) * num_xstats);
> + if (eth_xstats == NULL) {
> + TELEMETRY_LOG_ERR("Error - Failed to malloc memory for"
> + " xstats\n");
> + return -ENOMEM;
> + }
> + ret = rte_eth_xstats_get(port_id, eth_xstats, num_xstats);
> + if (ret < 0 || ret > num_xstats) {
> + free(eth_xstats);
> + TELEMETRY_LOG_ERR("Error - rte_eth_xstats_get(%u) len%i"
> + " failed: %d\n", port_id, num_xstats, ret);
> + return -EPERM;
> + }
> + struct rte_eth_xstat_name *eth_xstats_names;
> + eth_xstats_names = malloc(sizeof(struct rte_eth_xstat_name) *
> +  num_xstats);
> + if (eth_xstats_names == NULL) {

You are sometimes checking pointers against NULL, sometimes using "!".
You can choose either in your library, but it would be better to be
consistent and use a unified coding style.

> + free(eth_xstats);
> + TELEMETRY_LOG_ERR("Error - Failed to malloc memory for"
> + " xstats_names\n");
> + return -ENOMEM;
> + }
> + ret = rte_eth_xstats_get_names(port_id, eth_xstats_names,
> +  num_xstats);
> + if (ret < 0 || ret > num_xstats) {
> + free(eth_xstats);
> + free(eth_xstats_names);
> + TELEMETRY_LOG_ERR("Error - rte_eth_xstats_get_names(%u)"
> + " len%i failed: %d\n", port_id, num_xstats,
> +  ret);
> + return -EPERM;
> + }
> + const char *xstats_names[num_xstats];
> +
> + for (i = 0; i < num_xstats; i++)
> + xstats_names[i] = eth_xstats_names[eth_xstats[i].id].name;
> +
> + start_index = rte_metrics_reg_names(xstats_names, num_xstats);
> +
> + if (start_index < 0) {
> + TELEMETRY_LOG_ERR("Error - rte_metrics_reg_names failed -"
> + " metrics may already be registered\n");
> + free(eth_xstats);
> + free(eth_xstats_names);
> + return -1;
> + }
> + free(eth_xstats_names);
> + free(eth_xstats);

At this point, you have repeated 3 times those frees().
It would be cleaner to define proper labels and use go

Re: [dpdk-dev] [PATCH 01/11] telemetry: initial telemetry infrastructure

2018-08-28 Thread Van Haaren, Harry
> From: Shreyansh Jain [mailto:shreyansh.j...@nxp.com]
> Sent: Friday, August 24, 2018 2:04 PM
> To: Power, Ciara ; Van Haaren, Harry
> ; Archbold, Brian ;
> Kenny, Emma 
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 01/11] telemetry: initial telemetry
> infrastructure
> 
> On Thursday 23 August 2018 05:38 PM, Ciara Power wrote:
> > This patch adds the infrastructure and initial code for the
> > telemetry library.
> >
> > A virtual device is used for telemetry, which is included in
> > this patch. To use telemetry, a command-line argument must be
> > used - "--vdev=telemetry".
> >
> > Control threads are used to get CPU cycles for telemetry, which
> > are configured in this patch also.
> >
> > Signed-off-by: Ciara Power 
> > Signed-off-by: Brian Archbold 
> > ---
> 
> [...]
> 
> /rte_pmd_telemetry_version.map
> > new file mode 100644
> > index 000..a73e0f2
> > --- /dev/null
> > +++ b/drivers/telemetry/telemetry/rte_pmd_telemetry_version.map
> > @@ -0,0 +1,9 @@
> > +DPDK_18.05 {
> 
>  ^
> I think you want 18.11 here.

Yes indeed.


> > +   global:
> > +
> > +   telemetry_probe;
> > +   telemetry_remove;
> > +
> > +   local: *;
> > +
> > +};
> 
> 
> [...]
> 
> > diff --git a/lib/Makefile b/lib/Makefile
> > index afa604e..8cbd035 100644
> > --- a/lib/Makefile
> > +++ b/lib/Makefile
> > @@ -105,6 +105,8 @@ DEPDIRS-librte_gso := librte_eal librte_mbuf
> librte_ethdev librte_net
> >   DEPDIRS-librte_gso += librte_mempool
> >   DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf
> >   DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf
> librte_ethdev
> > +DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry
> > +DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev
> >
> >   ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
> >   DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
> > diff --git a/lib/librte_telemetry/Makefile b/lib/librte_telemetry/Makefile
> > new file mode 100644
> > index 000..bda3788
> > --- /dev/null
> > +++ b/lib/librte_telemetry/Makefile
> > @@ -0,0 +1,26 @@
> > +# SPDX-License-Identifier: BSD-3-Clause
> > +# Copyright(c) 2018 Intel Corporation
> > +
> > +include $(RTE_SDK)/mk/rte.vars.mk
> > +
> > +# library name
> > +LIB = librte_telemetry.a
> > +
> > +CFLAGS += -O3
> > +CFLAGS += -I$(SRCDIR)
> > +CFLAGS += -DALLOW_EXPERIMENTAL_API
> > +
> > +LDLIBS += -lrte_eal -lrte_ethdev
> > +LDLIBS += -lrte_metrics
> > +
> > +EXPORT_MAP := rte_telemetry_version.map
> > +
> > +LIBABIVER := 1
> > +
> > +# library source files
> > +SRCS-$(CONFIG_RTE_LIBRTE_TELEMETRY) := rte_telemetry.c
> > +
> > +# export include files
> > +SYMLINK-$(CONFIG_RTE_LIBRTE_TELEMETRY)-include := rte_telemetry.h
> > +
> > +include $(RTE_SDK)/mk/rte.lib.mk
> > diff --git a/lib/librte_telemetry/meson.build
> b/lib/librte_telemetry/meson.build
> > new file mode 100644
> > index 000..7716076
> > --- /dev/null
> > +++ b/lib/librte_telemetry/meson.build
> > @@ -0,0 +1,7 @@
> > +# SPDX-License-Identifier: BSD-3-Clause
> > +# Copyright(c) 2018 Intel Corporation
> > +
> > +sources = files('rte_telemetry.c')
> > +headers = files('rte_telemetry.h', 'rte_telemetry_internal.h')
> > +deps += ['metrics', 'ethdev']
> > +cflags += '-DALLOW_EXPERIMENTAL_API'
> > diff --git a/lib/librte_telemetry/rte_telemetry.c
> b/lib/librte_telemetry/rte_telemetry.c
> > new file mode 100644
> > index 000..8d7b0e3
> > --- /dev/null
> > +++ b/lib/librte_telemetry/rte_telemetry.c
> > @@ -0,0 +1,108 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation
> > + */
> > +
> > +#include 
> > +
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#include "rte_telemetry.h"
> > +#include "rte_telemetry_internal.h"
> > +
> > +#define SLEEP_TIME 10
> > +
> > +static telemetry_impl *static_telemetry;
> > +
> > +static int32_t
> > +rte_telemetry_run(void *userdata)
> > +{
> > +   struct telemetry_impl *telemetry = (struct telemetry_impl *)userdata;
> > +   if (!telemetry) {
> > +   TELEMETRY_LOG_WARN("Warning - TELEMETRY could not be "
> > +   "initialised\n");
> 
> Your 'TELEMETRY_LOG_WARNING' already includes a '\n' in its definition.
> This would add another one. Can you re-check?

Yes, as Stephen noted too. Will fix!

> 
> > +   return -1;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> > +static void
> > +*rte_telemetry_run_thread_func(void *userdata)
> > +{
> > +   int ret;
> > +   struct telemetry_impl *telemetry = (struct telemetry_impl *)userdata;
> > +
> > +   if (!telemetry) {
> > +   TELEMETRY_LOG_ERR("Error - %s passed a NULL instance\n",
> > +   __func__);
> 
> I might be picky - but this is an internal function spawned using
> rte_ctrl_thread_create which already has a check whether the argument
> (static_telemetry) is NULL or not. So, this is like duplicating that work.
> 
> > +   pthread_exit(0);
> > +   }
> > +
> > +   while (telemetry->thread_status) {
> > +   rte_telemetry_run(telemetry);
> > +   r

Re: [dpdk-dev] [PATCH 01/11] telemetry: initial telemetry infrastructure

2018-08-28 Thread Van Haaren, Harry
Hi Gaetan,

> -Original Message-
> From: Gaëtan Rivet [mailto:gaetan.ri...@6wind.com]
> Sent: Tuesday, August 28, 2018 12:47 PM
> To: Power, Ciara 
> Cc: Van Haaren, Harry ; Archbold, Brian
> ; Kenny, Emma ; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 01/11] telemetry: initial telemetry
> infrastructure
> 
> Hi Ciara,
> 
> On Thu, Aug 23, 2018 at 01:08:03PM +0100, Ciara Power wrote:
> > This patch adds the infrastructure and initial code for the
> > telemetry library.
> >
> > A virtual device is used for telemetry, which is included in
> > this patch. To use telemetry, a command-line argument must be
> > used - "--vdev=telemetry".
> >
> 
> Why use a virtual device?
> 
> It seems that you are only using the device semantic as a way to carry
> around a tag telling the DPDK framework to init your library once it has
> finished its initialization.
> 
> I guess you wanted to avoid having to add the call to rte_telemetry_init
> to all applications. In the absence of a proper EAL option framework,
> you can workaround by adding a --telemetry EAL parameter, setting a flag
> on, and checking this flag from librte_telemetry, within a routine
> declared with RTE_INIT_PRIO.

I suppose that an EAL --flag could work too, it would mean that EAL would
depend on this library. The --vdev trick keeps the library standalone.

I don't have a strong opinion either way. :)


> I only see afterward the selftest being triggered via kvargs. I haven't
> yet looked at the testing done, but if it is only unit test, the "test"
> app would be better suited. If it is integration testing to verify the
> behavior of the library with other PMDs, you probably need specific
> context, thus selftest being insufficient on its own and useless for
> other users.

Correct, self tests are triggered by kvargs. This same model is used
in eg: eventdev PMDs to run selftests, where the tests are pretty complex
and specific to the device under test.

Again, I don't have a strong opinion but I don't see any issue with it
being included in the vdev / telemetry library. We could write a shim
test that the "official" test binary runs the telemetry tests if that is
your concern?


> > Control threads are used to get CPU cycles for telemetry, which
> > are configured in this patch also.
> >
> > Signed-off-by: Ciara Power 
> > Signed-off-by: Brian Archbold 
> 
> Regards,
> --
> Gaëtan Rivet
> 6WIND

Thanks for review, and there's a lightning talk at Userspace so please
do provide input there too :) -Harry


Re: [dpdk-dev] [PATCH v1] testpmd: add new command for show port info

2018-08-28 Thread Stephen Hemminger
On Tue, 28 Aug 2018 15:11:34 +0100
Emma Finn  wrote:

> existing testpmd command "show port info" is too verbose.
> Added a new summary command to print brief information on ports.
> 
> Signed-off-by: Emma Finn 

Looks good.
Maybe it would be better to drop off redundant information or only add
a single header line like:

testpmd> show port summary all
PortMac NameDriver  LinkStatus
0 11:22:33:44:550001:00 ixgbe   10Gbit  Up
...





Re: [dpdk-dev] [PATCH 02/11] telemetry: add initial connection socket

2018-08-28 Thread Van Haaren, Harry
> From: Gaëtan Rivet [mailto:gaetan.ri...@6wind.com]
> Sent: Tuesday, August 28, 2018 5:40 PM
> To: Power, Ciara 
> Cc: Van Haaren, Harry ; Archbold, Brian
> ; Kenny, Emma ; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 02/11] telemetry: add initial connection
> socket



> > +int32_t
> > +rte_telemetry_check_port_activity(int port_id)
> > +{
> > +   int pid;
> > +
> > +   RTE_ETH_FOREACH_DEV(pid) {
> > +   if (pid == port_id)
> > +   return 1;
> > +   }
> > +   TELEMETRY_LOG_ERR("Error - port_id: %d is invalid, not active\n",
> > +   port_id);
> > +   return 0;
> > +}
> > +
> 
> This function seems more about ethdev than telemetry.
> Maybe add it as part of librte_ethdev.

Yep that might be a better place, making it a generic ethdev function.


> The "active" semantic is blurry however. With this implementation, this
> is checking that the device is currently attached and owned by the
> default user. Should telemetry be limited to devices owned by default
> user?

Good question - perhaps one that we can get input on from others.
Would you (app X) want App Y to read telemetry from your ethdev/cryptodev?

Perhaps keeping telemetry to an "owned" port is a feature, perhaps
a limitation.

I'd like input on this one :D


> Also, this function does not seem used in this patch, can it be added
> when used?

Sure.


> > +static int32_t
> > +rte_telemetry_reg_ethdev_to_metrics(uint16_t port_id)
> 
> "_to" might not be necessary.
> 
> > +{
> > +   int ret, num_xstats, start_index, i;
> > +   struct rte_eth_xstat *eth_xstats;
> > +
> > +   if (!rte_eth_dev_is_valid_port(port_id)) {
> > +   TELEMETRY_LOG_ERR("Error - port_id: %d is invalid\n", port_id);
> > +   return -EINVAL;
> > +   }
> > +
> > +   num_xstats = rte_eth_xstats_get(port_id, NULL, 0);
> > +   if (num_xstats < 0) {
> > +   TELEMETRY_LOG_ERR("Error - rte_eth_xstats_get(%u) failed:"
> > +   " %d\n", port_id, num_xstats);
> 
> I guess there isn't really a consensus yet, as the checkpatch.sh tool
> might be misconfigured, but the cesura is very awkward here.
> 
> Same for other logs.

Agreed, improvements possible here.


> > +   return -EPERM;
> > +   }
> > +
> > +   eth_xstats = malloc(sizeof(struct rte_eth_xstat) * num_xstats);
> > +   if (eth_xstats == NULL) {
> > +   TELEMETRY_LOG_ERR("Error - Failed to malloc memory for"
> > +   " xstats\n");
> > +   return -ENOMEM;
> > +   }
> > +   ret = rte_eth_xstats_get(port_id, eth_xstats, num_xstats);
> > +   if (ret < 0 || ret > num_xstats) {
> > +   free(eth_xstats);
> > +   TELEMETRY_LOG_ERR("Error - rte_eth_xstats_get(%u) len%i"
> > +   " failed: %d\n", port_id, num_xstats, ret);
> > +   return -EPERM;
> > +   }
> > +   struct rte_eth_xstat_name *eth_xstats_names;
> > +   eth_xstats_names = malloc(sizeof(struct rte_eth_xstat_name) *
> > +num_xstats);
> > +   if (eth_xstats_names == NULL) {
> 
> You are sometimes checking pointers against NULL, sometimes using "!".
> You can choose either in your library, but it would be better to be
> consistent and use a unified coding style.

Heh, I guess that's down to dev-style, and you'll note multiple sign-offs ;)

Can review for v2.


> > +   free(eth_xstats);
> > +   TELEMETRY_LOG_ERR("Error - Failed to malloc memory for"
> > +   " xstats_names\n");
> > +   return -ENOMEM;
> > +   }
> > +   ret = rte_eth_xstats_get_names(port_id, eth_xstats_names,
> > +num_xstats);
> > +   if (ret < 0 || ret > num_xstats) {
> > +   free(eth_xstats);
> > +   free(eth_xstats_names);
> > +   TELEMETRY_LOG_ERR("Error - rte_eth_xstats_get_names(%u)"
> > +   " len%i failed: %d\n", port_id, num_xstats,
> > +ret);
> > +   return -EPERM;
> > +   }
> > +   const char *xstats_names[num_xstats];
> > +
> > +   for (i = 0; i < num_xstats; i++)
> > +   xstats_names[i] = eth_xstats_names[eth_xstats[i].id].name;
> > +
> > +   start_index = rte_metrics_reg_names(xstats_names, num_xstats);
> > +
> > +   if (start_index < 0) {
> > +   TELEMETRY_LOG_ERR("Error - rte_metrics_reg_names failed -"
> > +   " metrics may already be registered\n");
> > +   free(eth_xstats);
> > +   free(eth_xstats_names);
> > +   return -1;
> > +   }
> > +   free(eth_xstats_names);
> > +   free(eth_xstats);
> 
> At this point, you have repeated 3 times those frees().
> It would be cleaner to define proper labels and use goto instead.

Agreed.

Thanks for review!



Re: [dpdk-dev] IXGBE throughput loss with 4+ cores

2018-08-28 Thread Saber Rezvani




On 08/28/2018 08:31 PM, Stephen Hemminger wrote:

On Tue, 28 Aug 2018 17:34:27 +0430
Saber Rezvani  wrote:


Hi,


I have run multi_process/symmetric_mp example in DPDK example directory.
For a one process its throughput is line rate but as I increase the
number of cores I see decrease in throughput. For example, If the number
of queues set to 4 and each queue assigns to a single core, then the
throughput will be something about 9.4. if 8 queues, then throughput
will be 8.5.

I have read the following, but it was not convincing.

http://mails.dpdk.org/archives/dev/2015-October/024960.html


I am eagerly looking forward to hearing from you, all.


Best wishes,

Saber



Not completely surprising. If you have more cores than packet line rate
then the number of packets returned for each call to rx_burst will be less.
With large number of cores, most of the time will be spent doing reads of
PCI registers for no packets!
Indeed pktgen says it is generating traffic at line rate, but receiving 
less than 10 Gb/s. So, it that case there should be something that 
causes the reduction in throughput :(





Re: [dpdk-dev] [PATCH 03/11] telemetry: add client feature and sockets

2018-08-28 Thread Van Haaren, Harry
> -Original Message-
> From: Hunt, David
> Sent: Tuesday, August 28, 2018 4:27 PM
> To: Stephen Hemminger ; Power, Ciara
> 
> Cc: Van Haaren, Harry ; Archbold, Brian
> ; Kenny, Emma ; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 03/11] telemetry: add client feature and
> sockets
> 
> 
> On 24/8/2018 12:27 AM, Stephen Hemminger wrote:
> > On Thu, 23 Aug 2018 13:08:05 +0100
> > Ciara Power  wrote:
> >
> >> This patch introduces clients to the telemetry API.
> >>
> >> When a client makes a connection through the initial telemetry
> >> socket, they can send a message through the socket to be
> >> parsed. Register messages are expected through this socket, to
> >> enable clients to register and have a client socket setup for
> >> future communications.
> >>
> >> A TAILQ is used to store all clients information. Using this, the
> >> client sockets are polled for messages, which will later be parsed
> >> and dealt with accordingly.
> >>
> >> Functionality that make use of the client sockets were introduced
> >> in this patch also, such as writing to client sockets, and sending
> >> error responses.
> >>
> >> Signed-off-by: Ciara Power 
> >> Signed-off-by: Brian Archbold 
> > Rather than using the rather heavyweight jansson library and creating
> > an additional dependency on an external library; may I recommend reusing
> > the json_writer library (I wrote) that is part of iproute2 and much
> > simpler.
> >
> >
> https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/tree/lib/json_w
> riter.c

Although I tend to prefer lightweight or dependency-free software, in this
case I think the requirements (parsing and writing JSON) is more than we
should implement our own version of in DPDK.

Jansson provides all we need, and as Dave notes below it is a common
library with good packaging support already.

With the Meson build system, I'd like to see it being picked up dynamically
and compiling in automatically when available.

For the existing Make system, I'm OK with Telemetry being off by default
and users switching it on if they wish to accept Jansson dep and gain the
telemetry functionality.


> I'm about to push another patchset to the mailing list in the next few
> days which
> also makes use of Jansson. I'm parsing an incoming JSON string
> containing power
> management info. The Jansson package comes pre-installed in many
> operating systems,
> although you do indeed need to install libjansson-dev to build against it.
> I would certainly like to see the community accept its use.




Re: [dpdk-dev] [PATCH 01/11] telemetry: initial telemetry infrastructure

2018-08-28 Thread Van Haaren, Harry
> From: Stephen Hemminger [mailto:step...@networkplumber.org]
> Sent: Friday, August 24, 2018 12:22 AM
> To: Power, Ciara 
> Cc: Van Haaren, Harry ; Archbold, Brian
> ; Kenny, Emma ; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 01/11] telemetry: initial telemetry
> infrastructure
> 
> On Thu, 23 Aug 2018 13:08:03 +0100
> Ciara Power  wrote:
> 
> > +/* Logging Macros */
> > +extern int telemetry_log_level;
> > +
> > +#define TELEMETRY_LOG(level, fmt, args...) \
> > +   rte_log(RTE_LOG_ ##level, telemetry_log_level, "%s(): "fmt "\n", \
> > +   __func__, ##args)
> > +
> > +#define TELEMETRY_LOG_ERR(fmt, args...) \
> > +   TELEMETRY_LOG(ERR, fmt, ## args)
> > +
> > +#define TELEMETRY_LOG_WARN(fmt, args...) \
> > +   TELEMETRY_LOG(WARNING, fmt, ## args)
> > +
> > +#define TELEMETRY_LOG_INFO(fmt, args...) \
> > +   TELEMETRY_LOG(INFO, fmt, ## args)
> > +
> > +typedef struct telemetry_impl {
> > +   pthread_t thread_id;
> > +   int thread_status;
> > +   uint32_t socket_id;
> > +} telemetry_impl;
> > +
> 
> Your logging macros follow the standard DPDK style. Including automatically
> adding a new line. But as I look at the code, many of the TELEMETRY_LOG
> calls
> have a newline in the format. Therefore your log messages will be double
> spaced.

Correct, will get that fixed.

Thanks for reviewing! Looking forward to Userspace, curious to hear of
your use-cases for Telemetry lib, assuming you have some in mind.


Re: [dpdk-dev] [PATCH] net/mlx5: fix interrupt completion queue index wrapping

2018-08-28 Thread Yongseok Koh
> On Aug 23, 2018, at 4:10 PM, Xueming Li  wrote:
> 
> Rxq cq_ci was 16 bits while hardware is expecting to wrap
> around 24 bits, this caused interrupt failure after burst of packets.
> 
> Fixes: 43e9d9794cde ("net/mlx5: support upstream rdma-core")
> Cc: Shahaf Shuler 
> 
> Signed-off-by: Xueming Li 
> ---
Acked-by: Yongseok Koh 
 
Thanks


Re: [dpdk-dev] IXGBE throughput loss with 4+ cores

2018-08-28 Thread Wiles, Keith
Which version of Pktgen? I just pushed a patch in 3.5.3 to fix a  performance 
problem.

> On Aug 28, 2018, at 12:05 PM, Saber Rezvani  wrote:
> 
> 
> 
> On 08/28/2018 08:31 PM, Stephen Hemminger wrote:
>> On Tue, 28 Aug 2018 17:34:27 +0430
>> Saber Rezvani  wrote:
>> 
>>> Hi,
>>> 
>>> 
>>> I have run multi_process/symmetric_mp example in DPDK example directory.
>>> For a one process its throughput is line rate but as I increase the
>>> number of cores I see decrease in throughput. For example, If the number
>>> of queues set to 4 and each queue assigns to a single core, then the
>>> throughput will be something about 9.4. if 8 queues, then throughput
>>> will be 8.5.
>>> 
>>> I have read the following, but it was not convincing.
>>> 
>>> http://mails.dpdk.org/archives/dev/2015-October/024960.html
>>> 
>>> 
>>> I am eagerly looking forward to hearing from you, all.
>>> 
>>> 
>>> Best wishes,
>>> 
>>> Saber
>>> 
>>> 
>> Not completely surprising. If you have more cores than packet line rate
>> then the number of packets returned for each call to rx_burst will be less.
>> With large number of cores, most of the time will be spent doing reads of
>> PCI registers for no packets!
> Indeed pktgen says it is generating traffic at line rate, but receiving less 
> than 10 Gb/s. So, it that case there should be something that causes the 
> reduction in throughput :(
> 
> 

Regards,
Keith



Re: [dpdk-dev] [RFC] ethdev: support metadata as flow rule criteria

2018-08-28 Thread Yongseok Koh
> On Aug 24, 2018, at 3:11 AM, Ananyev, Konstantin 
>  wrote:
> 
> 
> 
>> -Original Message-
>> From: Yongseok Koh [mailto:ys...@mellanox.com]
>> Sent: Thursday, August 23, 2018 10:32 PM
>> To: Andrew Rybchenko 
>> Cc: Dekel Peled ; dev@dpdk.org; or...@mellanox.com; 
>> shah...@mellanox.com; Thomas Monjalon
>> ; Ananyev, Konstantin ; 
>> Yigit, Ferruh ; Adrien
>> Mazarguil ; Olivier Matz 
>> Subject: Re: [dpdk-dev] [RFC] ethdev: support metadata as flow rule criteria
>> 
>> On Wed, Aug 22, 2018 at 04:31:14PM +0300, Andrew Rybchenko wrote:
>>> On 13.08.2018 10:46, Dekel Peled wrote:
 Current implementation of rte_flow allows match pattern of flow rule,
 based on packet data or header fields.
 This limits the application use of match patterns.
 
 For example, consider a vswitch application which controls a set of VMs,
 connected with virtio, in a fabric with overlay of VXLAN.
 Several VMs can have the same inner tuple, while the outer tuple is
 different and controlled by the vswitch (encap action).
 For the vswtich to be able to offload the rule to the NIC, it must use a
 unique match criteria, independent from the inner tuple, to perform the
 encap action.
 
 This RFC adds support for additional metadata to use as match pattern.
 The metadata is an opaque item, fully controlled by the application.
 
 The use of metadata is relevant for egress rules only.
 It can be set in the flow rule using the RTE_FLOW_ITEM_META.
 
 Application should set the packet metdata in the mbuf->metadata field,
 and set the PKT_TX_METADATA flag in the mbuf->ol_flags.
 The NIC will use the packet metadata as match criteria for relevant flow
 rules.
 
 For example, to do an encap action depending on the VM id, the
 application needs to configure 'match on metadata' rte_flow rule with
 VM id as metadata, along with desired encap action.
 When preparing an egress data packet, application will set VM id data in
 mbuf metadata field and set PKT_TX_METADATA flag.
 
 PMD will send data packets to NIC, with VM id as metadata.
 Egress flow on NIC will match metadata as done with other criteria.
 Upon match on metadata (VM id) the appropriate encap action will be
 performed.
 
 This RFC introduces metadata item type for rte_flow RTE_FLOW_ITEM_META,
 along with corresponding struct rte_flow_item_meta and ol_flag
 PKT_TX_METADATA.
 It also enhances struct rte_mbuf with new data item, uint64_t metadata.
 
 Comments are welcome.
 
 Signed-off-by: Dekel Peled 
 ---
  doc/guides/prog_guide/rte_flow.rst | 21 +
  lib/librte_ethdev/rte_flow.c   |  1 +
  lib/librte_ethdev/rte_flow.h   | 25 +
  lib/librte_mbuf/rte_mbuf.h | 11 +++
  4 files changed, 58 insertions(+)
 
 diff --git a/doc/guides/prog_guide/rte_flow.rst 
 b/doc/guides/prog_guide/rte_flow.rst
 index b305a72..b6e35f1 100644
 --- a/doc/guides/prog_guide/rte_flow.rst
 +++ b/doc/guides/prog_guide/rte_flow.rst
 @@ -1191,6 +1191,27 @@ Normally preceded by any of:
  - `Item: ICMP6_ND_NS`_
  - `Item: ICMP6_ND_OPT`_
 +Item: ``META``
 +^^
 +
 +Matches an application specific 64 bit metadata item.
 +
 +- Default ``mask`` matches any 64 bit value.
 +
 +.. _table_rte_flow_item_meta:
 +
 +.. table:: META
 +
 +   +--+--+---+
 +   | Field| Subfield | Value |
 +   +==+==+===+
 +   | ``spec`` | ``data`` | 64 bit metadata value |
 +   +--+--+
 +   | ``last`` | ``data`` | upper range value |
 +   +--+--+---+
 +   | ``mask`` | ``data`` | zeroed to match any value |
 +   +--+--+---+
 +
  Actions
  ~~~
 diff --git a/lib/librte_ethdev/rte_flow.c b/lib/librte_ethdev/rte_flow.c
 index cff4b52..54e5ef8 100644
 --- a/lib/librte_ethdev/rte_flow.c
 +++ b/lib/librte_ethdev/rte_flow.c
 @@ -66,6 +66,7 @@ struct rte_flow_desc_data {
 sizeof(struct rte_flow_item_icmp6_nd_opt_sla_eth)),
MK_FLOW_ITEM(ICMP6_ND_OPT_TLA_ETH,
 sizeof(struct rte_flow_item_icmp6_nd_opt_tla_eth)),
 +  MK_FLOW_ITEM(META, sizeof(struct rte_flow_item_meta)),
  };
  /** Generate flow_action[] entry. */
 diff --git a/lib/librte_ethdev/rte_flow.h b/lib/librte_ethdev/rte_flow.h
 index f8ba71c..b81c816 100644
 --- a/lib/librte_ethdev/rte_flow.h
 +++ b/lib/librte_ethdev/rte_flow.h
 @@ -413,6 +413,15 @@ enum rte_flow_item_type {
 * See struct rte_flow_item_mark.
 */
RTE_FLOW_ITEM_TYPE_MARK,
 +
>

Re: [dpdk-dev] IXGBE throughput loss with 4+ cores

2018-08-28 Thread Saber Rezvani




On 08/28/2018 11:39 PM, Wiles, Keith wrote:

Which version of Pktgen? I just pushed a patch in 3.5.3 to fix a  performance 
problem.
I use Pktgen verion 3.0.0, indeed it is O.k as far as I  have one core. 
(10 Gb/s) but when I increase the number of core (one core per queue) 
then I loose some performance (roughly 8.5 Gb/s for 8-core). In my 
scenario Pktgen shows it is generating at line rate, but receiving 8.5 Gb/s.

Is it because of Pktgen???



On Aug 28, 2018, at 12:05 PM, Saber Rezvani  wrote:



On 08/28/2018 08:31 PM, Stephen Hemminger wrote:

On Tue, 28 Aug 2018 17:34:27 +0430
Saber Rezvani  wrote:


Hi,


I have run multi_process/symmetric_mp example in DPDK example directory.
For a one process its throughput is line rate but as I increase the
number of cores I see decrease in throughput. For example, If the number
of queues set to 4 and each queue assigns to a single core, then the
throughput will be something about 9.4. if 8 queues, then throughput
will be 8.5.

I have read the following, but it was not convincing.

http://mails.dpdk.org/archives/dev/2015-October/024960.html


I am eagerly looking forward to hearing from you, all.


Best wishes,

Saber



Not completely surprising. If you have more cores than packet line rate
then the number of packets returned for each call to rx_burst will be less.
With large number of cores, most of the time will be spent doing reads of
PCI registers for no packets!

Indeed pktgen says it is generating traffic at line rate, but receiving less 
than 10 Gb/s. So, it that case there should be something that causes the 
reduction in throughput :(



Regards,
Keith







Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

2018-08-28 Thread Gavin Hu
Assuming reader and writer may execute on different CPU's, this become standard 
multithreaded programming.
We are concerned about that update the reader pointer too early(weak ordering 
may reorder it before reading from the slots), that means the slots are 
released and may immediately overwritten by the writer then you get "too new" 
data and get lost of the old data.

From: Kokkilagadda, Kiran 
Sent: Tuesday, August 28, 2018 6:44 PM
To: Gavin Hu ; Ferruh Yigit ; Jacob, 
Jerin 
Cc: dev@dpdk.org; Honnappa Nagarahalli 
Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization


In this instance there won't be any problem, as until the value of fifo->write 
changes, this loop won't get executed. As of now we didn't see any issue with 
it and for performance reasons, we don't want to keep read barrier.




From: Gavin Hu mailto:gavin...@arm.com>>
Sent: Monday, August 27, 2018 9:10 PM
To: Ferruh Yigit; Kokkilagadda, Kiran; Jacob, Jerin
Cc: dev@dpdk.org; Honnappa Nagarahalli
Subject: RE: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

External Email

This fix is not complete, kni_fifo_get requires a read fence also, otherwise it 
probably gets stale data on a weak ordering platform.

> -Original Message-
> From: dev mailto:dev-boun...@dpdk.org>> On Behalf Of 
> Ferruh Yigit
> Sent: Monday, August 27, 2018 10:08 PM
> To: Kiran Kumar 
> mailto:kkokkilaga...@caviumnetworks.com>>;
> jerin.ja...@caviumnetworks.com
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer
> synchronization
>
> On 8/16/2018 10:55 AM, Kiran Kumar wrote:
> > With existing code in kni_fifo_put, rx_q values are not being updated
> > before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> > This is causing the sync issue on other core. So adding a write
> > barrier to make sure the values being synced before updating fifo_write.
> >
> > Fixes: 3fc5ca2f6352 ("kni: initial import")
> >
> > Signed-off-by: Kiran Kumar 
> > mailto:kkokkilaga...@caviumnetworks.com>>
> > Acked-by: Jerin Jacob 
> > mailto:jerin.ja...@caviumnetworks.com>>
>
> Acked-by: Ferruh Yigit mailto:ferruh.yi...@intel.com>>
IMPORTANT NOTICE: The contents of this email and any attachments are 
confidential and may also be privileged. If you are not the intended recipient, 
please notify the sender immediately and do not disclose the contents to any 
other person, use it for any purpose, or store or copy the information in any 
medium. Thank you.


Re: [dpdk-dev] [RFC v2] ethdev: support metadata as flow rule criteria

2018-08-28 Thread Yongseok Koh
> On Aug 26, 2018, at 7:09 AM, Dekel Peled  wrote:
> 
> Current implementation of rte_flow allows match pattern of flow rule,
> based on packet data or header fields.
> This limits the application use of match patterns.
> 
> For example, consider a vswitch application which controls a set of VMs,
> connected with virtio, in a fabric with overlay of VXLAN.
> Several VMs can have the same inner tuple, while the outer tuple is
> different and controlled by the vswitch (encap action).
> For the vswtich to be able to offload the rule to the NIC, it must use a
> unique match criteria, independent from the inner tuple, to perform the
> encap action.
> 
> This RFC adds support for additional metadata to use as match pattern.
> The metadata is an opaque item, fully controlled by the application.
> 
> The use of metadata is relevant for egress rules only.
> It can be set in the flow rule using the RTE_FLOW_ITEM_META.
> 
> In order to avoid change in mbuf API, exisitng field mbuf.hash.fdir.hi
> will be used to carry the metadata item. This field is used only in
> ingress packets, so using it for egress metadata will not cause conflicts.
> 
> Application should set the packet metdata in the mbuf dedicated field,
> and set the PKT_TX_METADATA flag in the mbuf->ol_flags.
> The NIC will use the packet metadata as match criteria for relevant flow
> rules.
> 
> For example, to do an encap action depending on the VM id, the
> application needs to configure 'match on metadata' rte_flow rule with
> VM id as metadata, along with desired encap action.
> When preparing an egress data packet, application will set VM id data in
> mbuf dedicated field, and set PKT_TX_METADATA flag.
> 
> PMD will send data packets to NIC, with VM id as metadata.
> Egress flow on NIC will match metadata as done with other criteria.
> Upon match on metadata (VM id) the appropriate encap action will be
> performed.
> 
> This RFC introduces metadata item type for rte_flow RTE_FLOW_ITEM_META,
> along with corresponding struct rte_flow_item_meta and ol_flag
> PKT_TX_METADATA.
> 
> Comments are welcome.
> 
> Signed-off-by: Dekel Peled 
> ---
> v2: Use existing field in mbuf for metadata item, as suggested, instead 
>of adding a new field.
>Metadata item size adjusted to 32 bits.
> ---
> doc/guides/prog_guide/rte_flow.rst | 21 +
> lib/librte_ethdev/rte_flow.c   |  1 +
> lib/librte_ethdev/rte_flow.h   | 25 +
> lib/librte_mbuf/rte_mbuf.h | 13 +
> 4 files changed, 60 insertions(+)
> 
> diff --git a/doc/guides/prog_guide/rte_flow.rst 
> b/doc/guides/prog_guide/rte_flow.rst
> index b305a72..560e45a 100644
> --- a/doc/guides/prog_guide/rte_flow.rst
> +++ b/doc/guides/prog_guide/rte_flow.rst
> @@ -1191,6 +1191,27 @@ Normally preceded by any of:
> - `Item: ICMP6_ND_NS`_
> - `Item: ICMP6_ND_OPT`_
> 
> +Item: ``META``
> +^^
> +
> +Matches an application specific 32 bit metadata item.
> +
> +- Default ``mask`` matches any 32 bit value.
> +
> +.. _table_rte_flow_item_meta:
> +
> +.. table:: META
> +
> +   +--+--+---+
> +   | Field| Subfield | Value |
> +   +==+==+===+
> +   | ``spec`` | ``data`` | 32 bit metadata value |
> +   +--+--+
> +   | ``last`` | ``data`` | upper range value |
> +   +--+--+---+
> +   | ``mask`` | ``data`` | zeroed to match any value |
> +   +--+--+---+
> +
> Actions
> ~~~
> 
> diff --git a/lib/librte_ethdev/rte_flow.c b/lib/librte_ethdev/rte_flow.c
> index cff4b52..54e5ef8 100644
> --- a/lib/librte_ethdev/rte_flow.c
> +++ b/lib/librte_ethdev/rte_flow.c
> @@ -66,6 +66,7 @@ struct rte_flow_desc_data {
>sizeof(struct rte_flow_item_icmp6_nd_opt_sla_eth)),
>   MK_FLOW_ITEM(ICMP6_ND_OPT_TLA_ETH,
>sizeof(struct rte_flow_item_icmp6_nd_opt_tla_eth)),
> + MK_FLOW_ITEM(META, sizeof(struct rte_flow_item_meta)),
> };
> 
> /** Generate flow_action[] entry. */
> diff --git a/lib/librte_ethdev/rte_flow.h b/lib/librte_ethdev/rte_flow.h
> index f8ba71c..eba3cc4 100644
> --- a/lib/librte_ethdev/rte_flow.h
> +++ b/lib/librte_ethdev/rte_flow.h
> @@ -413,6 +413,15 @@ enum rte_flow_item_type {
>* See struct rte_flow_item_mark.
>*/
>   RTE_FLOW_ITEM_TYPE_MARK,
> +
> + /**
> +  * [META]
> +  *
> +  * Matches a metadata value specified in mbuf metadata field.
> +  *
> +  * See struct rte_flow_item_meta.
> +  */
> + RTE_FLOW_ITEM_TYPE_META,
> };
> 
> /**
> @@ -849,6 +858,22 @@ struct rte_flow_item_gre {
> #endif
> 
> /**
> + * RTE_FLOW_ITEM_TYPE_META.
> + *
> + * Matches a specified metadata value.
> + */
> +struct rte_flow_item_meta {
> + uint32_t data;
> +};
> +
> +/** Default mask for RTE_FLOW_ITEM_TYPE_META. */
> +#ifndef __cplusplus
> 

Re: [dpdk-dev] [PATCH] net/mlx5: handle expected errno properly

2018-08-28 Thread Yongseok Koh
On Fri, Aug 24, 2018 at 02:45:00PM +0800, Jack MIN wrote:
> On Thu, Aug 23, 2018 at 02:08:09PM -0700, Yongseok Koh wrote:
> > On Thu, Aug 23, 2018 at 02:38:51PM +0800, Xiaoyu Min wrote:
> > > rte_errno is a per thread variable and is widely used as an
> > > error indicator, which means a function could affect
> > > other functions' results by setting rte_errno carelessly
> > > 
> > > During rxq setup, an EINVAL rte_errno is expected since
> > > the queues are not created yet
> > > So rte_errno is cleared when it is EINVAL as expected
> > > 
> > > Signed-off-by: Xiaoyu Min 
> > > ---
> > >  drivers/net/mlx5/mlx5_rxq.c | 20 +++-
> > >  1 file changed, 15 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> > > index 1f7bfd4..e7056e8 100644
> > > --- a/drivers/net/mlx5/mlx5_rxq.c
> > > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > > @@ -443,6 +443,7 @@
> > >   struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
> > >   struct mlx5_rxq_ctrl *rxq_ctrl =
> > >   container_of(rxq, struct mlx5_rxq_ctrl, rxq);
> > > + int ret = 0;
> > >  
> > >   if (!rte_is_power_of_2(desc)) {
> > >   desc = 1 << log2above(desc);
> > > @@ -459,13 +460,21 @@
> > >   rte_errno = EOVERFLOW;
> > >   return -rte_errno;
> > >   }
> > > - if (!mlx5_rxq_releasable(dev, idx)) {
> > > + ret = mlx5_rxq_releasable(dev, idx);
> > > + if (!ret) {
> > >   DRV_LOG(ERR, "port %u unable to release queue index %u",
> > >   dev->data->port_id, idx);
> > >   rte_errno = EBUSY;
> > >   return -rte_errno;
> > > + } else if (ret == -EINVAL) {
> > > + /**
> > > +  * on the first time, rx queue doesn't exist,
> > > +  * so just ignore this error and reset rte_errno.
> > > +  */
> > > + rte_errno = 0;
> > 
> > Unless this function returns failure, the rte_errno will be ignored by 
> > caller
> > and caller shouldn't assume rte_errno has 0. Caller will assume it is 
> > garbage
> > data in case of success. So we can silently ignore this case. Does it cause 
> > any
> > issue in application side?
> > 
> Not application side but mlx5 PMD this time:
> **mlx5_fdir_filter_delete** 
> which just _return -rte_errno;_

Looks like an error. mlx5_fdir_filter_delete() can't be like that. We seem to
have lost the code while refactoring it. Let take it offline.

> For sure, _mlx5_fdir_filter_delete_ should be more defensive, should not 
> assume
> rte_errno is zero if no error happened.
> However if the caller know that an error will happen and rte_errno will become
> meaningless (garbage) for the succeeding functions, Catching the expected 
> error
> and resetting rte_errno will be better. What do you think?

Still don't understand clearly. There would be many other similar cases where we
don't clear rte_errno when returning success. I don't understand why this case
should be taken as a special one??

Thanks
Yongseok


[dpdk-dev] [PATCH] test/bpf: use hton instead of __builtin_bswap

2018-08-28 Thread Malvika Gupta
Convert host machine endianness to networking endianness for
comparison of incoming packets with BPF filter


Signed-off-by: Malvika Gupta 
Reviewed-by: Gavin Hu 
Reviewed-by: Brian Brooks 
Suggested-by: Brian Brooks 
---
 test/bpf/t1.c | 7 ---
 test/bpf/t3.c | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/test/bpf/t1.c b/test/bpf/t1.c
index 60f9434ab..7943fcf34 100644
--- a/test/bpf/t1.c
+++ b/test/bpf/t1.c
@@ -28,24 +28,25 @@
 #include 
 #include 
 #include 
+#include 
 
 uint64_t
 entry(void *pkt)
 {
struct ether_header *ether_header = (void *)pkt;
 
-   if (ether_header->ether_type != __builtin_bswap16(0x0800))
+   if (ether_header->ether_type != htons(0x0800))
return 0;
 
struct iphdr *iphdr = (void *)(ether_header + 1);
if (iphdr->protocol != 17 || (iphdr->frag_off & 0x1) != 0 ||
-   iphdr->daddr != __builtin_bswap32(0x1020304))
+   iphdr->daddr != htonl(0x1020304))
return 0;
 
int hlen = iphdr->ihl * 4;
struct udphdr *udphdr = (void *)iphdr + hlen;
 
-   if (udphdr->dest !=  __builtin_bswap16(5000))
+   if (udphdr->dest != htons(5000))
return 0;
 
return 1;
diff --git a/test/bpf/t3.c b/test/bpf/t3.c
index 531b9cb8c..24298b7c7 100644
--- a/test/bpf/t3.c
+++ b/test/bpf/t3.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include "mbuf.h"
+#include 
 
 extern void rte_pktmbuf_dump(FILE *, const struct rte_mbuf *, unsigned int);
 
@@ -29,7 +30,7 @@ entry(const void *pkt)
mb = pkt;
eth = rte_pktmbuf_mtod(mb, const struct ether_header *);
 
-   if (eth->ether_type == __builtin_bswap16(ETHERTYPE_ARP))
+   if (eth->ether_type == htons(ETHERTYPE_ARP))
rte_pktmbuf_dump(stdout, mb, 64);
 
return 1;
-- 
2.17.1



Re: [dpdk-dev] [PATCH] test/bpf: use hton instead of __builtin_bswap

2018-08-28 Thread Honnappa Nagarahalli



-Original Message-
From: Malvika Gupta  
Sent: Tuesday, August 28, 2018 3:46 PM
To: konstantin.anan...@intel.com
Cc: dev@dpdk.org; Gavin Hu ; Honnappa Nagarahalli 
; Brian Brooks ; nd 
; Malvika Gupta ; Malvika Gupta 

Subject: [PATCH] test/bpf: use hton instead of __builtin_bswap

Convert host machine endianness to networking endianness for comparison of 
incoming packets with BPF filter


Signed-off-by: Malvika Gupta 
Reviewed-by: Gavin Hu 
Reviewed-by: Brian Brooks 
Suggested-by: Brian Brooks 
---
 test/bpf/t1.c | 7 ---
 test/bpf/t3.c | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/test/bpf/t1.c b/test/bpf/t1.c index 60f9434ab..7943fcf34 100644
--- a/test/bpf/t1.c
+++ b/test/bpf/t1.c
@@ -28,24 +28,25 @@
 #include 
 #include 
 #include 
+#include 
 
 uint64_t
 entry(void *pkt)
 {
struct ether_header *ether_header = (void *)pkt;
 
-   if (ether_header->ether_type != __builtin_bswap16(0x0800))
+   if (ether_header->ether_type != htons(0x0800))
return 0;
 
struct iphdr *iphdr = (void *)(ether_header + 1);
if (iphdr->protocol != 17 || (iphdr->frag_off & 0x1) != 0 ||
-   iphdr->daddr != __builtin_bswap32(0x1020304))
+   iphdr->daddr != htonl(0x1020304))
return 0;
 
int hlen = iphdr->ihl * 4;
struct udphdr *udphdr = (void *)iphdr + hlen;
 
-   if (udphdr->dest !=  __builtin_bswap16(5000))
+   if (udphdr->dest != htons(5000))
return 0;
 
return 1;
diff --git a/test/bpf/t3.c b/test/bpf/t3.c index 531b9cb8c..24298b7c7 100644
--- a/test/bpf/t3.c
+++ b/test/bpf/t3.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include "mbuf.h"
+#include 
 
 extern void rte_pktmbuf_dump(FILE *, const struct rte_mbuf *, unsigned int);
 
@@ -29,7 +30,7 @@ entry(const void *pkt)
mb = pkt;
eth = rte_pktmbuf_mtod(mb, const struct ether_header *);
 
-   if (eth->ether_type == __builtin_bswap16(ETHERTYPE_ARP))
+   if (eth->ether_type == htons(ETHERTYPE_ARP))
rte_pktmbuf_dump(stdout, mb, 64);
 
return 1;
--
2.17.1

Compiled and tested.
Acked-by: Honnappa Nagarahalli 



Re: [dpdk-dev] IXGBE throughput loss with 4+ cores

2018-08-28 Thread Wiles, Keith


> On Aug 28, 2018, at 2:16 PM, Saber Rezvani  wrote:
> 
> 
> 
> On 08/28/2018 11:39 PM, Wiles, Keith wrote:
>> Which version of Pktgen? I just pushed a patch in 3.5.3 to fix a  
>> performance problem.
> I use Pktgen verion 3.0.0, indeed it is O.k as far as I  have one core. (10 
> Gb/s) but when I increase the number of core (one core per queue) then I 
> loose some performance (roughly 8.5 Gb/s for 8-core). In my scenario Pktgen 
> shows it is generating at line rate, but receiving 8.5 Gb/s.
> Is it because of Pktgen???

Normally Pktgen can receive at line rate up to 10G 64 byte frames, which means 
Pktgen should not be the problem. You can verify that by looping the cable from 
one port to another on the pktgen machine to create a external loopback. Then 
send traffic what ever you can send from one port you should be able to receive 
those packets unless something is configured wrong.

Please send me the command line for pktgen.


In pktgen if you have this config -m “[1-4:5-8].0” then you have 4 cores 
sending traffic and 4 core receiving packets.

In this case the TX cores will be sending the packets on all 4 lcores to the 
same port. On the rx side you have 4 cores polling 4 rx queues. The rx queues 
are controlled by RSS, which means the RX traffic 5 tuples hash must divide the 
inbound packets across all 4 queues to make sure each core is doing the same 
amount of work. If you are sending only a single packet on the Tx cores then 
only one rx queue be used.

I hope that makes sense.

>> 
>>> On Aug 28, 2018, at 12:05 PM, Saber Rezvani  wrote:
>>> 
>>> 
>>> 
>>> On 08/28/2018 08:31 PM, Stephen Hemminger wrote:
 On Tue, 28 Aug 2018 17:34:27 +0430
 Saber Rezvani  wrote:
 
> Hi,
> 
> 
> I have run multi_process/symmetric_mp example in DPDK example directory.
> For a one process its throughput is line rate but as I increase the
> number of cores I see decrease in throughput. For example, If the number
> of queues set to 4 and each queue assigns to a single core, then the
> throughput will be something about 9.4. if 8 queues, then throughput
> will be 8.5.
> 
> I have read the following, but it was not convincing.
> 
> http://mails.dpdk.org/archives/dev/2015-October/024960.html
> 
> 
> I am eagerly looking forward to hearing from you, all.
> 
> 
> Best wishes,
> 
> Saber
> 
> 
 Not completely surprising. If you have more cores than packet line rate
 then the number of packets returned for each call to rx_burst will be less.
 With large number of cores, most of the time will be spent doing reads of
 PCI registers for no packets!
>>> Indeed pktgen says it is generating traffic at line rate, but receiving 
>>> less than 10 Gb/s. So, it that case there should be something that causes 
>>> the reduction in throughput :(
>>> 
>>> 
>> Regards,
>> Keith
>> 
> 
> 
> 

Regards,
Keith



Re: [dpdk-dev] [PATCH] net/ixgbe: Strip SR-IOV transparent VLANs in VF

2018-08-28 Thread Chas Williams
On Fri, Aug 24, 2018 at 12:45 PM  wrote:

> From: Robert Shearman 
>
> SR-IOV VFs support "transparent" VLANs. Traffic from/to a VM
> associated with a VF has a VLAN tag inserted/stripped in a manner
> intended to be totally transparent to the VM.  On a Linux hypervisor
> the vlan can be specified by "ip link set  vf  vlan ".
> The VM VF driver is not configured to use any VLAN and the VM should
> never see the transparent VLAN for that reason.  However, in practice
> these VLAN headers are being received by the VM which discards the
> packets as that VLAN is unknown to it.  The Linux kernel ixbge driver
> explicitly removes the VLAN in this case (presumably due to the
> hardware not being able to do this) but the DPDK driver does not.
>
> This patch mirrors the kernel driver behaviour by removing the VLAN on
> the VF side. This is done by checking the VLAN in the VFTA, where the
> hypervisor will have set the bit in the VFTA corresponding to the VLAN
> if transparent VLANs were being used for the VF. If the VLAN is set in
> the VFTA then it is known that it's a transparent VLAN case and so the
> VLAN is stripped from the mbuf. To limit any potential performance
> impact on the PF data path, the RX path is split into PF and VF
> versions with the transparent VLAN stripping only done in the VF
> path. Measurements with our application show ~2% performance hit for
> the VF case and none for the PF case.
>
> Signed-off-by: Robert Shearman 
>

Reviewed-by: Chas Williams 



> ---
>  drivers/net/ixgbe/ixgbe_ethdev.c| 18 +++
>  drivers/net/ixgbe/ixgbe_ethdev.h| 38 +++
>  drivers/net/ixgbe/ixgbe_rxtx.c  | 83
> +---
>  drivers/net/ixgbe/ixgbe_rxtx.h  | 31 +++-
>  drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c |  7 +++
>  drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c  | 84
> ++---
>  6 files changed, 238 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c
> b/drivers/net/ixgbe/ixgbe_ethdev.c
> index 26b1927..3f88a02 100644
> --- a/drivers/net/ixgbe/ixgbe_ethdev.c
> +++ b/drivers/net/ixgbe/ixgbe_ethdev.c
> @@ -604,7 +604,7 @@ static const struct eth_dev_ops ixgbevf_eth_dev_ops = {
> .vlan_filter_set  = ixgbevf_vlan_filter_set,
> .vlan_strip_queue_set = ixgbevf_vlan_strip_queue_set,
> .vlan_offload_set = ixgbevf_vlan_offload_set,
> -   .rx_queue_setup   = ixgbe_dev_rx_queue_setup,
> +   .rx_queue_setup   = ixgbevf_dev_rx_queue_setup,
> .rx_queue_release = ixgbe_dev_rx_queue_release,
> .rx_descriptor_done   = ixgbe_dev_rx_descriptor_done,
> .rx_descriptor_status = ixgbe_dev_rx_descriptor_status,
> @@ -1094,7 +1094,7 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev, void
> *init_params __rte_unused)
>  "Using default TX function.");
> }
>
> -   ixgbe_set_rx_function(eth_dev);
> +   ixgbe_set_rx_function(eth_dev, true);
>
> return 0;
> }
> @@ -1576,7 +1576,7 @@ eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev)
>  "No TX queues configured yet. Using
> default TX function.");
> }
>
> -   ixgbe_set_rx_function(eth_dev);
> +   ixgbe_set_rx_function(eth_dev, true);
>
> return 0;
> }
> @@ -1839,8 +1839,8 @@ ixgbe_vlan_filter_set(struct rte_eth_dev *dev,
> uint16_t vlan_id, int on)
> uint32_t vid_idx;
> uint32_t vid_bit;
>
> -   vid_idx = (uint32_t) ((vlan_id >> 5) & 0x7F);
> -   vid_bit = (uint32_t) (1 << (vlan_id & 0x1F));
> +   vid_idx = ixgbe_vfta_index(vlan_id);
> +   vid_bit = ixgbe_vfta_bit(vlan_id);
> vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(vid_idx));
> if (on)
> vfta |= vid_bit;
> @@ -3807,7 +3807,9 @@ ixgbe_dev_supported_ptypes_get(struct rte_eth_dev
> *dev)
>
>  #if defined(RTE_ARCH_X86)
> if (dev->rx_pkt_burst == ixgbe_recv_pkts_vec ||
> -   dev->rx_pkt_burst == ixgbe_recv_scattered_pkts_vec)
> +   dev->rx_pkt_burst == ixgbe_recv_scattered_pkts_vec ||
> +   dev->rx_pkt_burst == ixgbevf_recv_pkts_vec ||
> +   dev->rx_pkt_burst == ixgbevf_recv_scattered_pkts_vec)
> return ptypes;
>  #endif
> return NULL;
> @@ -5231,8 +5233,8 @@ ixgbevf_vlan_filter_set(struct rte_eth_dev *dev,
> uint16_t vlan_id, int on)
> PMD_INIT_LOG(ERR, "Unable to set VF vlan");
> return ret;
> }
> -   vid_idx = (uint32_t) ((vlan_id >> 5) & 0x7F);
> -   vid_bit = (uint32_t) (1 << (vlan_id & 0x1F));
> +   vid_idx = ixgbe_vfta_index(vlan_id);
> +   vid_bit = ixgbe_vfta_bit(vlan_id);
>
> /* Save what we set and retore it after device reset */
> if (on)
> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.h
> b/drivers/net/ixgbe/ixgbe_ethdev.h
> index d0b9396..483d2cd 

Re: [dpdk-dev] [RFC] ethdev: add generic MAC address rewrite actions

2018-08-28 Thread Jack MIN
On Tue, Aug 28, 2018 at 01:15:09PM +0300, Andrew Rybchenko wrote:
> On 08/07/2018 05:20 PM, Jack Min wrote:
> > There is a need to offload rewrite MAC address for both destination and 
> > source
> > from the matched flow
> > 
> > The proposed actions could make above easily achieved
> > 
> > 
> > Signed-off-by: Xiaoyu Min 
> > mailto:jack...@mellanox.com>>
> 
> The only question I have is which Ethernet header is edited, if there are
> many
> Ethernet headers in the case of tunnels. I guess the outermost, but it
> should
> be stated in the description.
> 
OK. I'll add it in V2.

Thanks
-Jack


Re: [dpdk-dev] [PATCH] net/mlx5: handle expected errno properly

2018-08-28 Thread Jack MIN
On Tue, Aug 28, 2018 at 01:42:18PM -0700, Yongseok Koh wrote:
> On Fri, Aug 24, 2018 at 02:45:00PM +0800, Jack MIN wrote:
> > On Thu, Aug 23, 2018 at 02:08:09PM -0700, Yongseok Koh wrote:
> > > On Thu, Aug 23, 2018 at 02:38:51PM +0800, Xiaoyu Min wrote:
> > > > rte_errno is a per thread variable and is widely used as an
> > > > error indicator, which means a function could affect
> > > > other functions' results by setting rte_errno carelessly
> > > > 
> > > > During rxq setup, an EINVAL rte_errno is expected since
> > > > the queues are not created yet
> > > > So rte_errno is cleared when it is EINVAL as expected
> > > > 
> > > > Signed-off-by: Xiaoyu Min 
> > > > ---
> > > >  drivers/net/mlx5/mlx5_rxq.c | 20 +++-
> > > >  1 file changed, 15 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> > > > index 1f7bfd4..e7056e8 100644
> > > > --- a/drivers/net/mlx5/mlx5_rxq.c
> > > > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > > > @@ -443,6 +443,7 @@
> > > > struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
> > > > struct mlx5_rxq_ctrl *rxq_ctrl =
> > > > container_of(rxq, struct mlx5_rxq_ctrl, rxq);
> > > > +   int ret = 0;
> > > >  
> > > > if (!rte_is_power_of_2(desc)) {
> > > > desc = 1 << log2above(desc);
> > > > @@ -459,13 +460,21 @@
> > > > rte_errno = EOVERFLOW;
> > > > return -rte_errno;
> > > > }
> > > > -   if (!mlx5_rxq_releasable(dev, idx)) {
> > > > +   ret = mlx5_rxq_releasable(dev, idx);
> > > > +   if (!ret) {
> > > > DRV_LOG(ERR, "port %u unable to release queue index %u",
> > > > dev->data->port_id, idx);
> > > > rte_errno = EBUSY;
> > > > return -rte_errno;
> > > > +   } else if (ret == -EINVAL) {
> > > > +   /**
> > > > +* on the first time, rx queue doesn't exist,
> > > > +* so just ignore this error and reset rte_errno.
> > > > +*/
> > > > +   rte_errno = 0;
> > > 
> > > Unless this function returns failure, the rte_errno will be ignored by 
> > > caller
> > > and caller shouldn't assume rte_errno has 0. Caller will assume it is 
> > > garbage
> > > data in case of success. So we can silently ignore this case. Does it 
> > > cause any
> > > issue in application side?
> > > 
> > Not application side but mlx5 PMD this time:
> > **mlx5_fdir_filter_delete** 
> > which just _return -rte_errno;_
> 
> Looks like an error. mlx5_fdir_filter_delete() can't be like that. We seem to
> have lost the code while refactoring it. Let take it offline.
> 
Sure ~

> > For sure, _mlx5_fdir_filter_delete_ should be more defensive, should not 
> > assume
> > rte_errno is zero if no error happened.
> > However if the caller know that an error will happen and rte_errno will 
> > become
> > meaningless (garbage) for the succeeding functions, Catching the expected 
> > error
> > and resetting rte_errno will be better. What do you think?
> 
> Still don't understand clearly. There would be many other similar cases where 
> we
> don't clear rte_errno when returning success. I don't understand why this case
> should be taken as a special one??
> 
No, the _mlx5_rxq_releasable()_ returns *error* (-EINVAL)(the rxq doesn't exist)
as my understanding
We only check if rxq is not releasable(==0) in previouse version but we didn't
check if function returns error or success
> Thanks
> Yongseok


Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

2018-08-28 Thread Honnappa Nagarahalli
I agree with Gavin here. Store to fifo->write and fifo->read can get hoisted 
resulting in accessing invalid buffer array entries or over writing of the 
buffer array entries.
IMO, we should solve this using c11 atomics. This will also help remove the use 
of 'volatile' from 'rte_kni_fifo' structure.

If you want us to put together a patch with this idea, please let us know.

Thank you,
Honnappa

From: Gavin Hu
Sent: Tuesday, August 28, 2018 2:31 PM
To: Kokkilagadda, Kiran ; Ferruh Yigit 
; Jacob, Jerin 
Cc: dev@dpdk.org; Honnappa Nagarahalli ; nd 
; Ola Liljedahl ; Steve Capper 

Subject: RE: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

Assuming reader and writer may execute on different CPU's, this become standard 
multithreaded programming.
We are concerned about that update the reader pointer too early(weak ordering 
may reorder it before reading from the slots), that means the slots are 
released and may immediately overwritten by the writer then you get "too new" 
data and get lost of the old data.

From: Kokkilagadda, Kiran 
mailto:kiran.kokkilaga...@cavium.com>>
Sent: Tuesday, August 28, 2018 6:44 PM
To: Gavin Hu mailto:gavin...@arm.com>>; Ferruh Yigit 
mailto:ferruh.yi...@intel.com>>; Jacob, Jerin 
mailto:jerin.jacobkollanukka...@cavium.com>>
Cc: dev@dpdk.org; Honnappa Nagarahalli 
mailto:honnappa.nagaraha...@arm.com>>
Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization


In this instance there won't be any problem, as until the value of fifo->write 
changes, this loop won't get executed. As of now we didn't see any issue with 
it and for performance reasons, we don't want to keep read barrier.




From: Gavin Hu mailto:gavin...@arm.com>>
Sent: Monday, August 27, 2018 9:10 PM
To: Ferruh Yigit; Kokkilagadda, Kiran; Jacob, Jerin
Cc: dev@dpdk.org; Honnappa Nagarahalli
Subject: RE: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

External Email

This fix is not complete, kni_fifo_get requires a read fence also, otherwise it 
probably gets stale data on a weak ordering platform.

> -Original Message-
> From: dev mailto:dev-boun...@dpdk.org>> On Behalf Of 
> Ferruh Yigit
> Sent: Monday, August 27, 2018 10:08 PM
> To: Kiran Kumar 
> mailto:kkokkilaga...@caviumnetworks.com>>;
> jerin.ja...@caviumnetworks.com
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer
> synchronization
>
> On 8/16/2018 10:55 AM, Kiran Kumar wrote:
> > With existing code in kni_fifo_put, rx_q values are not being updated
> > before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> > This is causing the sync issue on other core. So adding a write
> > barrier to make sure the values being synced before updating fifo_write.
> >
> > Fixes: 3fc5ca2f6352 ("kni: initial import")
> >
> > Signed-off-by: Kiran Kumar 
> > mailto:kkokkilaga...@caviumnetworks.com>>
> > Acked-by: Jerin Jacob 
> > mailto:jerin.ja...@caviumnetworks.com>>
>
> Acked-by: Ferruh Yigit mailto:ferruh.yi...@intel.com>>
IMPORTANT NOTICE: The contents of this email and any attachments are 
confidential and may also be privileged. If you are not the intended recipient, 
please notify the sender immediately and do not disclose the contents to any 
other person, use it for any purpose, or store or copy the information in any 
medium. Thank you.


Re: [dpdk-dev] [PATCH] mbuf: add IGMP packet type

2018-08-28 Thread Honnappa Nagarahalli



-Original Message-
From: dev  On Behalf Of Jerin Jacob
Sent: Monday, August 27, 2018 7:39 AM
To: dev@dpdk.org
Cc: olivier.m...@6wind.com; Jerin Jacob 
Subject: [dpdk-dev] [PATCH] mbuf: add IGMP packet type

Add support for IGMP packet type.

Signed-off-by: Jerin Jacob 
---
 lib/librte_mbuf/rte_mbuf_ptype.c | 1 +
 lib/librte_mbuf/rte_mbuf_ptype.h | 8 
 2 files changed, 9 insertions(+)

diff --git a/lib/librte_mbuf/rte_mbuf_ptype.c b/lib/librte_mbuf/rte_mbuf_ptype.c
index d7835e283..b483a609d 100644
--- a/lib/librte_mbuf/rte_mbuf_ptype.c
+++ b/lib/librte_mbuf/rte_mbuf_ptype.c
@@ -47,6 +47,7 @@ const char *rte_get_ptype_l4_name(uint32_t ptype)
case RTE_PTYPE_L4_SCTP: return "L4_SCTP";
case RTE_PTYPE_L4_ICMP: return "L4_ICMP";
case RTE_PTYPE_L4_NONFRAG: return "L4_NONFRAG";
+   case RTE_PTYPE_L4_IGMP: return "L4_IGMP";
default: return "L4_UNKNOWN";
}
 }
diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h b/lib/librte_mbuf/rte_mbuf_ptype.h
index 01acc66e2..00db3eeed 100644
--- a/lib/librte_mbuf/rte_mbuf_ptype.h
+++ b/lib/librte_mbuf/rte_mbuf_ptype.h
@@ -286,6 +286,14 @@ extern "C" {
  * | 'version'=6, 'next header'!=[6|17|44|132|1]>
  */
 #define RTE_PTYPE_L4_NONFRAG0x0600
+/**
+ * IGMP (Internet Group Management Protocol) packet type.
+ *
+ * Packet format:
+ * <'ether type'=0x0800
+ * | 'version'=4, 'protocol'=2, 'MF'=0, 'frag_offset'=0>  */
+#define RTE_PTYPE_L4_IGMP   0x0700
 /**
  * Mask of layer 4 packet types.
  * It is used for outer packet for tunneling cases.
--
2.18.0

Acked-by: Honnappa Nagarahalli 


Re: [dpdk-dev] [PATCH v1] testpmd: add new command for show port info

2018-08-28 Thread Rami Rosen
Hi,
Thanks for this patch, this patch is something needed.

It seems to me that adding get_valid_ports() is not necessary, as you have
rte_eth_dev_count_avail(), which is implemented the same; please look
in rte_ethdev.[c,h; and actually, testpmd uses
rte_eth_dev_count_avail(), for example, in attach_port() in testpmd.c.
Besides, it should return uint16_t and not int, as DPDK ports are 16
bits, and count is defined as uint16_t.

+int get_valid_ports(void)
+{
+ portid_t pid;
+ uint16_t count = 0;
+
+ RTE_ETH_FOREACH_DEV(pid) {
+ count ++;
+ }
+ return count;
+}

and in rte_ethdev.c you have:
uint16_t
rte_eth_dev_count_avail(void)
{
uint16_t p;
uint16_t count;

count = 0;

RTE_ETH_FOREACH_DEV(p)
count++;

return count;
}


Regards,
Rami Rosen


Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization

2018-08-28 Thread Kokkilagadda, Kiran
Agreed. Please go a head and make the changes. You need to make same change in 
kernel side also. And please use c11 ring (see rte_ring) mechanism so that it 
won't impact other platforms like intel. We need this change just for arm and 
ppc.



From: Honnappa Nagarahalli 
Sent: Wednesday, August 29, 2018 10:29 AM
To: Gavin Hu; Kokkilagadda, Kiran; Ferruh Yigit; Jacob, Jerin
Cc: dev@dpdk.org; nd; Ola Liljedahl; Steve Capper
Subject: RE: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization


External Email

I agree with Gavin here. Store to fifo->write and fifo->read can get hoisted 
resulting in accessing invalid buffer array entries or over writing of the 
buffer array entries.

IMO, we should solve this using c11 atomics. This will also help remove the use 
of ‘volatile’ from ‘rte_kni_fifo’ structure.



If you want us to put together a patch with this idea, please let us know.



Thank you,

Honnappa



From: Gavin Hu
Sent: Tuesday, August 28, 2018 2:31 PM
To: Kokkilagadda, Kiran ; Ferruh Yigit 
; Jacob, Jerin 
Cc: dev@dpdk.org; Honnappa Nagarahalli ; nd 
; Ola Liljedahl ; Steve Capper 

Subject: RE: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization



Assuming reader and writer may execute on different CPU's, this become standard 
multithreaded programming.

We are concerned about that update the reader pointer too early(weak ordering 
may reorder it before reading from the slots), that means the slots are 
released and may immediately overwritten by the writer then you get “too new” 
data and get lost of the old data.



From: Kokkilagadda, Kiran 
mailto:kiran.kokkilaga...@cavium.com>>
Sent: Tuesday, August 28, 2018 6:44 PM
To: Gavin Hu mailto:gavin...@arm.com>>; Ferruh Yigit 
mailto:ferruh.yi...@intel.com>>; Jacob, Jerin 
mailto:jerin.jacobkollanukka...@cavium.com>>
Cc: dev@dpdk.org; Honnappa Nagarahalli 
mailto:honnappa.nagaraha...@arm.com>>
Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization



In this instance there won't be any problem, as until the value of fifo->write 
changes, this loop won't get executed. As of now we didn't see any issue with 
it and for performance reasons, we don't want to keep read barrier.







From: Gavin Hu mailto:gavin...@arm.com>>
Sent: Monday, August 27, 2018 9:10 PM
To: Ferruh Yigit; Kokkilagadda, Kiran; Jacob, Jerin
Cc: dev@dpdk.org; Honnappa Nagarahalli
Subject: RE: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer synchronization



External Email

This fix is not complete, kni_fifo_get requires a read fence also, otherwise it 
probably gets stale data on a weak ordering platform.

> -Original Message-
> From: dev mailto:dev-boun...@dpdk.org>> On Behalf Of 
> Ferruh Yigit
> Sent: Monday, August 27, 2018 10:08 PM
> To: Kiran Kumar 
> mailto:kkokkilaga...@caviumnetworks.com>>;
> jerin.ja...@caviumnetworks.com
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2] kni: fix kni Rx fifo producer
> synchronization
>
> On 8/16/2018 10:55 AM, Kiran Kumar wrote:
> > With existing code in kni_fifo_put, rx_q values are not being updated
> > before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> > This is causing the sync issue on other core. So adding a write
> > barrier to make sure the values being synced before updating fifo_write.
> >
> > Fixes: 3fc5ca2f6352 ("kni: initial import")
> >
> > Signed-off-by: Kiran Kumar 
> > mailto:kkokkilaga...@caviumnetworks.com>>
> > Acked-by: Jerin Jacob 
> > mailto:jerin.ja...@caviumnetworks.com>>
>
> Acked-by: Ferruh Yigit mailto:ferruh.yi...@intel.com>>
IMPORTANT NOTICE: The contents of this email and any attachments are 
confidential and may also be privileged. If you are not the intended recipient, 
please notify the sender immediately and do not disclose the contents to any 
other person, use it for any purpose, or store or copy the information in any 
medium. Thank you.


Re: [dpdk-dev] [RFC v2] ethdev: support metadata as flow rule criteria

2018-08-28 Thread Dekel Peled



> -Original Message-
> From: Yongseok Koh
> Sent: Tuesday, August 28, 2018 10:44 PM
> To: Dekel Peled 
> Cc: dev ; Shahaf Shuler ; Ori Kam
> ; Andrew Rybchenko ;
> Yigit, Ferruh ; Thomas Monjalon
> ; Ananyev, Konstantin
> ; Adrien Mazarguil
> ; Olivier Matz ;
> Alex Rosenbaum 
> Subject: Re: [RFC v2] ethdev: support metadata as flow rule criteria
> 
> > On Aug 26, 2018, at 7:09 AM, Dekel Peled  wrote:
> >
> > Current implementation of rte_flow allows match pattern of flow rule,
> > based on packet data or header fields.
> > This limits the application use of match patterns.
> >
> > For example, consider a vswitch application which controls a set of
> > VMs, connected with virtio, in a fabric with overlay of VXLAN.
> > Several VMs can have the same inner tuple, while the outer tuple is
> > different and controlled by the vswitch (encap action).
> > For the vswtich to be able to offload the rule to the NIC, it must use
> > a unique match criteria, independent from the inner tuple, to perform
> > the encap action.
> >
> > This RFC adds support for additional metadata to use as match pattern.
> > The metadata is an opaque item, fully controlled by the application.
> >
> > The use of metadata is relevant for egress rules only.
> > It can be set in the flow rule using the RTE_FLOW_ITEM_META.
> >
> > In order to avoid change in mbuf API, exisitng field mbuf.hash.fdir.hi
> > will be used to carry the metadata item. This field is used only in
> > ingress packets, so using it for egress metadata will not cause conflicts.
> >
> > Application should set the packet metdata in the mbuf dedicated field,
> > and set the PKT_TX_METADATA flag in the mbuf->ol_flags.
> > The NIC will use the packet metadata as match criteria for relevant
> > flow rules.
> >
> > For example, to do an encap action depending on the VM id, the
> > application needs to configure 'match on metadata' rte_flow rule with
> > VM id as metadata, along with desired encap action.
> > When preparing an egress data packet, application will set VM id data
> > in mbuf dedicated field, and set PKT_TX_METADATA flag.
> >
> > PMD will send data packets to NIC, with VM id as metadata.
> > Egress flow on NIC will match metadata as done with other criteria.
> > Upon match on metadata (VM id) the appropriate encap action will be
> > performed.
> >
> > This RFC introduces metadata item type for rte_flow
> > RTE_FLOW_ITEM_META, along with corresponding struct
> rte_flow_item_meta
> > and ol_flag PKT_TX_METADATA.
> >
> > Comments are welcome.
> >
> > Signed-off-by: Dekel Peled 
> > ---
> > v2: Use existing field in mbuf for metadata item, as suggested, instead
> >of adding a new field.
> >Metadata item size adjusted to 32 bits.
> > ---
> > doc/guides/prog_guide/rte_flow.rst | 21 +
> > lib/librte_ethdev/rte_flow.c   |  1 +
> > lib/librte_ethdev/rte_flow.h   | 25 +
> > lib/librte_mbuf/rte_mbuf.h | 13 +
> > 4 files changed, 60 insertions(+)
> >
> > diff --git a/doc/guides/prog_guide/rte_flow.rst
> > b/doc/guides/prog_guide/rte_flow.rst
> > index b305a72..560e45a 100644
> > --- a/doc/guides/prog_guide/rte_flow.rst
> > +++ b/doc/guides/prog_guide/rte_flow.rst
> > @@ -1191,6 +1191,27 @@ Normally preceded by any of:
> > - `Item: ICMP6_ND_NS`_
> > - `Item: ICMP6_ND_OPT`_
> >
> > +Item: ``META``
> > +^^
> > +
> > +Matches an application specific 32 bit metadata item.
> > +
> > +- Default ``mask`` matches any 32 bit value.
> > +
> > +.. _table_rte_flow_item_meta:
> > +
> > +.. table:: META
> > +
> > +   +--+--+---+
> > +   | Field| Subfield | Value |
> > +   +==+==+===+
> > +   | ``spec`` | ``data`` | 32 bit metadata value |
> > +   +--+--+
> > +   | ``last`` | ``data`` | upper range value |
> > +   +--+--+---+
> > +   | ``mask`` | ``data`` | zeroed to match any value |
> > +   +--+--+---+
> > +
> > Actions
> > ~~~
> >
> > diff --git a/lib/librte_ethdev/rte_flow.c
> > b/lib/librte_ethdev/rte_flow.c index cff4b52..54e5ef8 100644
> > --- a/lib/librte_ethdev/rte_flow.c
> > +++ b/lib/librte_ethdev/rte_flow.c
> > @@ -66,6 +66,7 @@ struct rte_flow_desc_data {
> >  sizeof(struct rte_flow_item_icmp6_nd_opt_sla_eth)),
> > MK_FLOW_ITEM(ICMP6_ND_OPT_TLA_ETH,
> >  sizeof(struct rte_flow_item_icmp6_nd_opt_tla_eth)),
> > +   MK_FLOW_ITEM(META, sizeof(struct rte_flow_item_meta)),
> > };
> >
> > /** Generate flow_action[] entry. */
> > diff --git a/lib/librte_ethdev/rte_flow.h
> > b/lib/librte_ethdev/rte_flow.h index f8ba71c..eba3cc4 100644
> > --- a/lib/librte_ethdev/rte_flow.h
> > +++ b/lib/librte_ethdev/rte_flow.h
> > @@ -413,6 +413,15 @@ enum rte_flow_item_type {
> >  * See struct rte_flow_item_mark.
> >  */