Re: [dpdk-dev] [PATCH v2] vfio: noiommu check error handling

2018-01-13 Thread Burakov, Anatoly

On 31-Oct-17 3:59 PM, Jonas Pfefferle wrote:

Check and report errors on open/read in noiommu check.

Signed-off-by: Jonas Pfefferle 
---


LGTM

Acked-by: Anatoly Burakov 

--
Thanks,
Anatoly


Re: [dpdk-dev] [PATCH v2] vfio: noiommu check error handling

2018-01-13 Thread Burakov, Anatoly

On 11-Jan-18 11:45 PM, Thomas Monjalon wrote:

07/11/2017 10:50, Jonas Pfefferle1:

Is there something urgent for 17.11?
Or can it be refined in 18.02?


Nothing urgent. We can refine this for 18.02.


Anatoly, any thought?


Anatoly, Jonas, how do you want to proceed with this patch?



I don't see anything to be refined here, it's a simple bug fix - code 
assumes noiommu mode support is always available, when it might not be 
the case on older kernels.


--
Thanks,
Anatoly


Re: [dpdk-dev] [PATCH v2 1/4] eal: add channel for multi-process communication

2018-01-13 Thread Burakov, Anatoly

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:

diff --git a/lib/librte_eal/common/eal_common_proc.c 
b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..d700e9e 100644





+
  int
  rte_eal_primary_proc_alive(const char *config_file_path)
  {
@@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)
  
  	return !!ret;

  }
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+   int len = strlen(name);


Why do strlen() here? You already have MAX_ACTION_NAME_LEN, strncmp will 
take care of the rest, no?



+   struct action_entry *entry;
+
+   TAILQ_FOREACH(entry, &action_entry_list, next) {
+   if (strncmp(entry->action_name, name, len) == 0)
+   break;
+   }
+
+   return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+   struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+   if (entry == NULL) {
+   rte_errno = -ENOMEM;
+   return -1;
+   }
+
+   if (strlen(action_name) > MAX_ACTION_NAME_LEN) {
+   rte_errno = -E2BIG;
+   return -1;
+   }


strnlen perhaps? strnlen(action_name) == MAX_ACTION_NAME_LEN will be an 
error condition, and unlike strlen you won't have to scan the entire 
memory if your string was corrupted.



+
+   pthread_mutex_lock(&mp_mutex_action);
+   if (find_action_entry_by_name(action_name) != NULL) {
+   free(entry);
+   rte_errno = -EEXIST;
+   return -1;
+   }
+   strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+   entry->action = action;
+   TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+   pthread_mutex_unlock(&mp_mutex_action);
+   return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{


name == NULL? You do a find_action_entry_by_name with it, which calls 
strlen, which IIRC segfaults on NULL pointer. Also, maybe add an strlen 
(or better yet, strnlen) check here like in action_register, so that 
find_action_entry_by_name doesn't need to care about string lengths and 
can work off MAX_ACTION_NAME_LEN instead.



+   struct action_entry *entry;
+
+   pthread_mutex_lock(&mp_mutex_action);
+   entry = find_action_entry_by_name(name);


entry == NULL?


+   TAILQ_REMOVE(&action_entry_list, entry, next);
+   free(entry);
+   pthread_mutex_unlock(&mp_mutex_action);
+}
+
+static int
+read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+{
+   int ret;
+   struct iovec iov;
+   struct msghdr msgh;





+   return -1;
+   }
+
+   params_len = len - sizeof(struct mp_msghdr);
+   ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
+   pthread_mutex_unlock(&mp_mutex_action);
+   return ret;
+


unnecessary newline.


+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+   int len;
+   int fds[SCM_MAX_FD];
+   char buf[MAX_MSG_LENGTH];
+
+   while (1) {





+   goto error;
+   }
+
+   return 0;
+error:
+   close(mp_fd);
+   mp_fd = -1;
+   return -1;
+}
+
+static inline struct mp_msghdr *
+format_msg(const char *act_name, const void *p, int len_params, int fds_num)


The name is slightly misleading, as this function actually *creates* a 
message, not just formats it. create_msg? alloc_msg?



+{
+   int len_msg;
+   struct mp_msghdr *msg;
+
+   len_msg = sizeof(struct mp_msghdr) + len_params;
+   if (len_msg > MAX_MSG_LENGTH) {
+   RTE_LOG(ERR, EAL, "Message is too long\n");
+   rte_errno = -EINVAL;
+   return NULL;
+   }
+
+   msg = malloc(len_msg);
+   if (!msg) {
+   RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+   rte_errno = -ENOMEM;
+   return NULL;
+   }
+   memset(msg, 0, len_msg);
+   strcpy(msg->action_name, act_name);


strncpy?


+   msg->fds_num = fds_num;
+   msg->len_params = len_params;
+   memcpy(msg->params, p, len_params);
+   return msg;
+}
+
+static int
+send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
+{
+   int ret;
+   struct msghdr msgh;
+   struct iovec iov;
+   size_t fd_size = msg->fds_num * sizeof(int);





+   return mp_send(action_name, params, len_params, fds, fds_num);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h 
b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..e95399b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
return buffer;
  }
  
+/** Path of primary/secondary communication unix socket file. */

+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)


perhaps eal_mp_socket_path would've been more descriptive? API doesn't 
need to care what 

Re: [dpdk-dev] [PATCH v2 2/4] eal: add and del secondary processes in the primary

2018-01-13 Thread Burakov, Anatoly

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:

By the multi-process channel, we add an mp action named "proc".

As a secondary process starts, it sends a "proc add" message to
the primary.

As the primary finds a failure in sending message to a specific
secondary process, that secondary process is treated as exited;
and we remove it from the secondary array by sending a "proc del"
message to the primary itself.

Test:
   1. Start the primary and the secondary process
 $ (testpmd) -c 0x3 -n 4 -- -i
 $ (helloworld) -c 0xc -n 4 --proc-type=auto --

   2. Check the log of testpmd:
 ...
 EAL: bind to /var/run/.rte_unix
 ...
 EAL: add secondary: /var/run/.testpmd_unix_(xxx)
 ...

   3. Check the log of helloworld:
 ...
 EAL: bind to /var/run/.testpmd_unix_xxx
 EAL: bind to /var/run/.testpmd_unix_c_xxx
 ...


it says "unix" all over the place, but that's an internal implementation 
detail. "mp_socket" or similar should do, no?




Signed-off-by: Jianfeng Tan 
---
  lib/librte_eal/common/eal_common_proc.c | 88 -
  1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c 
b/lib/librte_eal/common/eal_common_proc.c
index d700e9e..70519cc 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -54,6 +54,13 @@ struct mp_msghdr {
char params[0];
  } __rte_packed;
  
+struct proc_request {

+#define MP_PROC_ADD0
+#define MP_PROC_DEL1
+   int type;
+   char path[MAX_UNIX_PATH_LEN];
+};
+
  int
  rte_eal_primary_proc_alive(const char *config_file_path)
  {
@@ -214,6 +221,58 @@ mp_handle(void *arg __rte_unused)
return NULL;
  }
  
+static int

+add_sec_proc(const char *path)
+{
+   int i;
+
+   for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+   if (mp_sec_sockets[i] == NULL)
+   break;
+   if (i < MAX_SECONDARY_PROCS)
+   mp_sec_sockets[i] = strdup(path);
+
+   return i < MAX_SECONDARY_PROCS;
+}


While it's equivalent, the intent behind this isn't clear, it's 
needlessly complicating the more common idiom of


for (i = 0; i < MAX; i++) {}
if (i == MAX)
   return error;
do_something;
return success;


+
+static int
+del_sec_proc(const char *path)
+{
+   int i;
+
+   for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+   if (!strcmp(mp_sec_sockets[i], path)) {
+   free(mp_sec_sockets[i]);
+   mp_sec_sockets[i] = NULL;
+   break;
+   }
+   }
+
+   return i < MAX_SECONDARY_PROCS;
+}


Same as above - maybe rewrite it as a more commonly used idiom. Also, 
you probably want to use strncmp(), and check for NULL pointers, IIRC 
strncmp(NULL) is undefined behavior.



+
+static int
+mp_primary_proc(const void *params,
+   int len __rte_unused,
+   int fds[] __rte_unused,
+   int fds_num __rte_unused)
+{
+   const struct proc_request *r = (const struct proc_request *)params;
+
+   switch (r->type) {
+   case MP_PROC_ADD:
+   RTE_LOG(INFO, EAL, "add secondary: %s\n", r->path);
+   return add_sec_proc(r->path);
+   case MP_PROC_DEL:
+   RTE_LOG(INFO, EAL, "del secondary: %s\n", r->path);
+   return del_sec_proc(r->path);
+   default:
+   RTE_LOG(ERR, EAL, "invalid type: %d\n", r->type);
+   }
+
+   return -1;
+}
+
  static inline const char *
  get_unix_path(int is_server)
  {
@@ -267,6 +326,22 @@ rte_eal_mp_channel_init(void)
if (mp_fd < 0)
return -1;
  
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {

+   if (rte_eal_mp_action_register("proc", mp_primary_proc) < 0) {
+   RTE_LOG(ERR, EAL, "failed to register handler\n");
+   goto error;
+   }
+   } else {
+   struct proc_request r;
+
+   r.type = MP_PROC_ADD;
+   snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", get_unix_path(1));


Nitpicking, but maybe just send PID instead of the whole path? 
Primary/secondary share their prefix and most of their socket path 
anyway, so the real difference is the PID. This would also eliminate the 
need for using strings in many places.



+   if (rte_eal_mp_sendmsg("proc", &r, sizeof(r), NULL, 0) < 0) {
+   RTE_LOG(ERR, EAL, "failed to add into primary\n");
+   goto error;
+   }
+   }
+
if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
strerror(errno));
@@ -354,10 +429,19 @@ send_msg(int fd, const char *dst_path, struct mp_msghdr 
*msg, int fds[])
if (ret < 0) {
RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
  
-		if (rte_eal_process_type() == RT

Re: [dpdk-dev] [PATCH v2 3/4] eal: add synchronous multi-process communication

2018-01-13 Thread Burakov, Anatoly

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:

---
  lib/librte_eal/common/eal_common_proc.c | 144 +---
  lib/librte_eal/common/include/rte_eal.h |  73 +++-
  lib/librte_eal/rte_eal_version.map  |   2 +
  3 files changed, 206 insertions(+), 13 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c 
b/lib/librte_eal/common/eal_common_proc.c
index 70519cc..f194a52 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -32,6 +32,7 @@
  static int mp_fd = -1;
  static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
  static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t mp_mutex_request = PTHREAD_MUTEX_INITIALIZER;
  
  struct action_entry {

TAILQ_ENTRY(action_entry) next;  /**< Next attached action entry */
@@ -49,6 +50,10 @@ static struct action_entry_list action_entry_list =
  
  struct mp_msghdr {

char action_name[MAX_ACTION_NAME_LEN];
+#define MP_MSG 0 /* Share message with peers, will not block */
+#define MP_REQ 1 /* Request for information, Will block for a reply */
+#define MP_REP 2 /* Reply to previously-received request */


nitpicking, but... response instead of reply?


+   int type;
int fds_num;
int len_params;
char params[0];
@@ -138,7 +143,8 @@ rte_eal_mp_action_unregister(const char *name)
  }
  
  static int

-read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+read_msg(int fd, char *buf, int buflen,
+int *fds, int fds_num, struct sockaddr_un *s)





+   return mp_send(action_name, params, len_params,
+   fds, fds_num, MP_MSG, NULL);
+}
+
+int
+rte_eal_mp_request(const char *action_name,
+  void *params,
+  int len_p,
+  int fds[],
+  int fds_in,
+  int fds_out)


name == NULL? name too long?


+{
+   int i, j;
+   int sockfd;
+   int nprocs;
+   int ret = 0;
+   struct mp_msghdr *req;
+   struct timeval tv;
+   char buf[MAX_MSG_LENGTH];
+   struct mp_msghdr *hdr;
+
+   RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+   if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+   RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+   rte_errno = -E2BIG;


(this also applies to previous patches) you set rte_errno to -EINVAL in 
format_msg when message with parameters is too big - should that be 
setting -E2BIG as well? Also, maybe not set rte_errno in multiple 
places, and put all parameter checking (or at least errno setting) in 
rte_eal_mp_* functions?



+   return 0;
+   }
+
+   req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+   if (req == NULL)
+   return 0;
+
+   if ((sockfd = open_unix_fd(0)) < 0) {
+   free(req);
+   return 0;
+   }
+
+   tv.tv_sec = 5;  /* 5 Secs Timeout */
+   tv.tv_usec = 0;
+   if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+   (const void *)&tv, sizeof(struct timeval)) < 0)
+   RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
+
+   /* Only allow one req at a time */
+   pthread_mutex_lock(&mp_mutex_request);
+
+   if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+   nprocs = 0;
+   for (i = 0; i < MAX_SECONDARY_PROCS; ++i)


What follows is a bit confusing, some comments explaining what happens 
and maybe more informative variable names would've been helpful.



+   if (!mp_sec_sockets[i]) {
+   j = i;
+   nprocs++;
+   }
+
+   if (nprocs > 1) {
+   RTE_LOG(ERR, EAL,
+   "multi secondary processes not supported\n");
+   goto free_and_ret;
+   }
+





--
Thanks,
Anatoly


Re: [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb()

2018-01-13 Thread Wiles, Keith


> On Dec 18, 2017, at 9:34 AM, Konstantin Ananyev 
>  wrote:
> 
> Simple functional test for rte_smp_mb() implementations.
> Also when executed on a single lcore could be used as rough
> estimation how many cycles particular implementation of rte_smp_mb()
> might take.
> 
> Signed-off-by: Konstantin Ananyev 
> ---
> test/test/Makefile  |   1 +
> test/test/test_mb.c | 315 
> 2 files changed, 316 insertions(+)
> create mode 100644 test/test/test_mb.c
> 
> diff --git a/test/test/Makefile b/test/test/Makefile
> index bb54c9808..3134283a8 100644
> --- a/test/test/Makefile
> +++ b/test/test/Makefile
> @@ -95,6 +95,7 @@ SRCS-y += test_spinlock.c
> SRCS-y += test_memory.c
> SRCS-y += test_memzone.c
> SRCS-y += test_bitmap.c
> +SRCS-y += test_mb.c
> 
> SRCS-y += test_ring.c
> SRCS-y += test_ring_perf.c
> diff --git a/test/test/test_mb.c b/test/test/test_mb.c
> new file mode 100644
> index 0..52c73fb6b
> --- /dev/null
> +++ b/test/test/test_mb.c
> @@ -0,0 +1,315 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + *   notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + *   notice, this list of conditions and the following disclaimer in
> + *   the documentation and/or other materials provided with the
> + *   distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + *   contributors may be used to endorse or promote products derived
> + *   from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */


SPDX header should be used, correct?

> +
> + /*
> +  * This is a simple functional test for rte_smp_mb() implementation.
> +  * I.E. make sure that LOAD and STORE operations that precede the
> +  * rte_smp_mb() call are globally visible across the lcores
> +  * before the the LOAD and STORE operations that follows it.
> +  * The test uses simple implementation of Peterson's lock algorithm
> +  * (https://en.wikipedia.org/wiki/Peterson%27s_algorithm)
> +  * for two execution units to make sure that rte_smp_mb() prevents
> +  * store-load reordering to happen.
> +  * Also when executed on a single lcore could be used as a approxiamate
> +  * estimation of number of cycles particular implementation of rte_smp_mb()
> +  * will take.
> +  */
> +
> 

Regards,
Keith



Re: [dpdk-dev] [PATCH v2 1/2] test/test: introduce new test-case for rte_smp_mb()

2018-01-13 Thread Wiles, Keith


> On Dec 18, 2017, at 9:34 AM, Konstantin Ananyev 
>  wrote:
> 
> Simple functional test for rte_smp_mb() implementations.
> Also when executed on a single lcore could be used as rough
> estimation how many cycles particular implementation of rte_smp_mb()
> might take.
> 
> Signed-off-by: Konstantin Ananyev 
> ---
> test/test/Makefile  |   1 +
> test/test/test_mb.c | 315 
> 2 files changed, 316 insertions(+)
> create mode 100644 test/test/test_mb.c
> 
> diff --git a/test/test/Makefile b/test/test/Makefile
> index bb54c9808..3134283a8 100644
> --- a/test/test/Makefile
> +++ b/test/test/Makefile
> @@ -95,6 +95,7 @@ SRCS-y += test_spinlock.c
> SRCS-y += test_memory.c
> SRCS-y += test_memzone.c
> SRCS-y += test_bitmap.c
> +SRCS-y += test_mb.c
> 
> SRCS-y += test_ring.c
> SRCS-y += test_ring_perf.c
> diff --git a/test/test/test_mb.c b/test/test/test_mb.c
> new file mode 100644
> index 0..52c73fb6b
> --- /dev/null
> +++ b/test/test/test_mb.c
> @@ -0,0 +1,315 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + *   notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + *   notice, this list of conditions and the following disclaimer in
> + *   the documentation and/or other materials provided with the
> + *   distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + *   contributors may be used to endorse or promote products derived
> + *   from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */


SPDX header should be used, correct?

> +
> + /*
> +  * This is a simple functional test for rte_smp_mb() implementation.
> +  * I.E. make sure that LOAD and STORE operations that precede the
> +  * rte_smp_mb() call are globally visible across the lcores
> +  * before the the LOAD and STORE operations that follows it.
> +  * The test uses simple implementation of Peterson's lock algorithm
> +  * (https://en.wikipedia.org/wiki/Peterson%27s_algorithm)
> +  * for two execution units to make sure that rte_smp_mb() prevents
> +  * store-load reordering to happen.
> +  * Also when executed on a single lcore could be used as a approxiamate
> +  * estimation of number of cycles particular implementation of rte_smp_mb()
> +  * will take.
> +  */
> +
> 

Regards,
Keith



Re: [dpdk-dev] [PATCH v2 4/4] vfio: use the generic multi-process channel

2018-01-13 Thread Burakov, Anatoly

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:




-   }
-   /* fall-through on error */
-   default:
-   RTE_LOG(ERR, EAL, "  cannot get container fd!\n");
-   close(socket_fd);
-   return -1;
-   }
+   vfio_group_fd = -1;
+   ret = rte_eal_mp_request("vfio", &p, sizeof(p), &vfio_group_fd, 0, 1);
+   if (ret > 0 && p.result == SOCKET_OK) {


Thanks, this looks much more clear than the previous revision! In an 
ideal world we would've been able to have separate response and reply 
(as it's perfectly possible to imagine a situation where the request 
would be small but the response would be huge), but for now this works 
as well. Maybe put this API down under EXPERIMENTAL tag? (btw wasn't 
this official policy now?)



+   cur_grp->group_no = iommu_group_no;
+   cur_grp->fd = vfio_group_fd;
+   vfio_cfg.vfio_active_groups++;
+   return vfio_group_fd;
}
+
+   RTE_LOG(ERR, EAL, "  cannot request group fd\n");
return -1;


check for SOCKET_NO_FD? Previously, that branch returned 0, now it will 
return -1.



  }
  
@@ -200,7 +174,8 @@ int

  clear_group(int vfio_group_fd)
  {
int i;
-   int socket_fd, ret;
+   int ret;
+   struct vfio_mp_param p;
  
  	if (internal_config.process_type == RTE_PROC_PRIMARY) {
  
@@ -214,43 +189,14 @@ clear_group(int vfio_group_fd)

return 0;
}
  
-	/* This is just for SECONDARY processes */

-   socket_fd = vfio_mp_sync_connect_to_primary();
-
-   if (socket_fd < 0) {
-   RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
-   return -1;
-   }
-
-   if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
-   RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
-   close(socket_fd);
-   return -1;
-   }
-
-   if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
-   RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
-   close(socket_fd);
-   return -1;
-   }
+   p.req = SOCKET_CLR_GROUP;
+   p.group_no = vfio_group_fd;
  
-	ret = vfio_mp_sync_receive_request(socket_fd);

-   switch (ret) {
-   case SOCKET_NO_FD:
-   RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
-   close(socket_fd);
-   break;
-   case SOCKET_OK:
-   close(socket_fd);
+   ret = rte_eal_mp_request("vfio", &p, sizeof(p), NULL, 0, 0);
+   if (ret > 0 && p.result == SOCKET_OK)
return 0;
-   case SOCKET_ERR:
-   RTE_LOG(ERR, EAL, "  Socket error\n");
-   close(socket_fd);
-   break;
-   default:
-   RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
-   close(socket_fd);
-   }
+
+   RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");


The error message lumps together two cases - bad VFIO group fd, and a 
socket error.



return -1;
  }
  
@@ -561,6 +507,7 @@ int

  vfio_get_container_fd(void)
  {
int ret, vfio_container_fd;
+   struct vfio_mp_param p;
  
  	/* if we're in a primary process, try to open the container */

if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +538,19 @@ vfio_get_container_fd(void)
}





--
Thanks,
Anatoly


Re: [dpdk-dev] [RFC v2 00/23] Dynamic memory allocation for DPDK

2018-01-13 Thread Burakov, Anatoly

On 19-Dec-17 11:14 AM, Anatoly Burakov wrote:

This patchset introduces a prototype implementation of dynamic memory allocation
for DPDK. It is intended to start a conversation and build consensus on the best
way to implement this functionality. The patchset works well enough to pass all
unit tests, and to work with traffic forwarding, provided the device drivers are
adjusted to ensure contiguous memory allocation where it matters.

The vast majority of changes are in the EAL and malloc, the external API
disruption is minimal: a new set of API's are added for contiguous memory
allocation (for rte_malloc and rte_memzone), and a few API additions in
rte_memory. Every other API change is internal to EAL, and all of the memory
allocation/freeing is handled through rte_malloc, with no externally visible
API changes, aside from a call to get physmem layout, which no longer makes
sense given that there are multiple memseg lists.

Quick outline of all changes done as part of this patchset:

  * Malloc heap adjusted to handle holes in address space
  * Single memseg list replaced by multiple expandable memseg lists
  * VA space for hugepages is preallocated in advance
  * Added dynamic alloc/free for pages, happening as needed on malloc/free
  * Added contiguous memory allocation API's for rte_malloc and rte_memzone
  * Integrated Pawel Wodkowski's patch [1] for registering/unregistering memory
with VFIO

The biggest difference is a "memseg" now represents a single page (as opposed to
being a big contiguous block of pages). As a consequence, both memzones and
malloc elements are no longer guaranteed to be physically contiguous, unless
the user asks for it. To preserve whatever functionality that was dependent
on previous behavior, a legacy memory option is also provided, however it is
expected to be temporary solution. The drivers weren't adjusted in this 
patchset,
and it is expected that whoever shall test the drivers with this patchset will
modify their relevant drivers to support the new set of API's. Basic testing
with forwarding traffic was performed, both with UIO and VFIO, and no 
performance
degradation was observed.

Why multiple memseg lists instead of one? It makes things easier on a number of
fronts. Since memseg is a single page now, the list will get quite big, and we
need to locate pages somehow when we allocate and free them. We could of course
just walk the list and allocate one contiguous chunk of VA space for memsegs,
but i chose to use separate lists instead, to speed up many operations with the
list.

It would be great to see the following discussions within the community 
regarding
both current implementation and future work:

  * Any suggestions to improve current implementation. The whole system with
multiple memseg lists is kind of unweildy, so maybe there are better ways to
do the same thing. Maybe use a single list after all? We're not expecting
malloc/free on hot path, so maybe it doesn't matter that we have to walk
the list of potentially thousands of pages?
  * Pluggable memory allocators. Right now, allocators are hardcoded, but down
the line it would be great to have custom allocators (e.g. for externally
allocated memory). I've tried to keep the memalloc API minimal and generic
enough to be able to easily change it down the line, but suggestions are
welcome. Memory drivers, with ops for alloc/free etc.?
  * Memory tagging. This is related to previous item. Right now, we can only ask
malloc to allocate memory by page size, but one could potentially have
different memory regions backed by pages of similar sizes (for example,
locked 1G pages, to completely avoid TLB misses, alongside regular 1G 
pages),
and it would be good to have that kind of mechanism to distinguish between
different memory types available to a DPDK application. One could, for 
example,
tag memory by "purpose" (i.e. "fast", "slow"), or in other ways.
  * Secondary process implementation, in particular when it comes to allocating/
freeing new memory. Current plan is to make use of RPC mechanism proposed by
Jianfeng [2] to communicate between primary and secondary processes, however
other suggestions are welcome.
  * Support for non-hugepage memory. This work is planned down the line. Aside
from obvious concerns about physical addresses, 4K pages are small and will
eat up enormous amounts of memseg list space, so my proposal would be to
allocate 4K pages in bigger blocks (say, 2MB).
  * 32-bit support. Current implementation lacks it, and i don't see a trivial
way to make it work if we are to preallocate huge chunks of VA space in
advance. We could limit it to 1G per page size, but even that, on multiple
sockets, won't work that well, and we can't know in advance what kind of
memory user will try to allocate. Drop it? Leave it in legacy mode only?
  * Preallocation. Right now, malloc will free any and all memory that 

Re: [dpdk-dev] [PATCH v4] net/i40e: fix port segmentation fault when restart

2018-01-13 Thread Zhang, Helin


> -Original Message-
> From: Zhao1, Wei
> Sent: Friday, January 12, 2018 2:59 PM
> To: dev@dpdk.org
> Cc: Zhang, Helin; Zhao1, Wei
> Subject: [PATCH v4] net/i40e: fix port segmentation fault when restart
> 
> This patch will go into the process of clear all queue region related
> configuration when dev stop even if there is no queue region command before,
> so this is a bug, it may cause error. So add code to check if there is queue
> configuration exist when flush queue region config and remove this process
> when device stop. Queue region clear only do when device initialization or
> PMD get flush command.
> 
> Fixes: 7cbecc2f742 ("net/i40e: support queue region set and flush")
> 
> Signed-off-by: Wei Zhao 
> Acked-by: Qi Zhang 
Applied to dpdk-next-net-intel, with minor commit log changes. Thanks!
BTW, I replied v3 email with a mistake, though v4 just modifed the commit log a 
bit.
Sorry for any confusing!

/Helin
> 
> ---
> 
> v2:
> -fix patch check warning.
> 
> v3:
> -add more log message.
> 
> v4:
> -add more log message.
> ---
>  drivers/net/i40e/i40e_ethdev.c  |  3 ---  drivers/net/i40e/rte_pmd_i40e.c | 
> 27
> ++-
>  2 files changed, 14 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
> index d80671a..5d1b916 100644
> --- a/drivers/net/i40e/i40e_ethdev.c
> +++ b/drivers/net/i40e/i40e_ethdev.c
> @@ -2144,9 +2144,6 @@ i40e_dev_stop(struct rte_eth_dev *dev)
>   /* reset hierarchy commit */
>   pf->tm_conf.committed = false;
> 
> - /* Remove all the queue region configuration */
> - i40e_flush_queue_region_all_conf(dev, hw, pf, 0);
> -
>   hw->adapter_stopped = 1;
>  }
> 
> diff --git a/drivers/net/i40e/rte_pmd_i40e.c
> b/drivers/net/i40e/rte_pmd_i40e.c index 55ae2fe..2cb22d4 100644
> --- a/drivers/net/i40e/rte_pmd_i40e.c
> +++ b/drivers/net/i40e/rte_pmd_i40e.c
> @@ -2816,22 +2816,23 @@ i40e_flush_queue_region_all_conf(struct
> rte_eth_dev *dev,
>   return 0;
>   }
> 
> - info->queue_region_number = 1;
> - info->region[0].queue_num = main_vsi->nb_used_qps;
> - info->region[0].queue_start_index = 0;
> + if (info->queue_region_number) {
> + info->queue_region_number = 1;
> + info->region[0].queue_num = main_vsi->nb_used_qps;
> + info->region[0].queue_start_index = 0;
> 
> - ret = i40e_vsi_update_queue_region_mapping(hw, pf);
> - if (ret != I40E_SUCCESS)
> - PMD_DRV_LOG(INFO, "Failed to flush queue region mapping.");
> -
> - ret = i40e_dcb_init_configure(dev, TRUE);
> - if (ret != I40E_SUCCESS) {
> - PMD_DRV_LOG(INFO, "Failed to flush dcb.");
> - pf->flags &= ~I40E_FLAG_DCB;
> - }
> + ret = i40e_vsi_update_queue_region_mapping(hw, pf);
> + if (ret != I40E_SUCCESS)
> + PMD_DRV_LOG(INFO, "Failed to flush queue region
> mapping.");
> 
> - i40e_init_queue_region_conf(dev);
> + ret = i40e_dcb_init_configure(dev, TRUE);
> + if (ret != I40E_SUCCESS) {
> + PMD_DRV_LOG(INFO, "Failed to flush dcb.");
> + pf->flags &= ~I40E_FLAG_DCB;
> + }
> 
> + i40e_init_queue_region_conf(dev);
> + }
>   return 0;
>  }
> 
> --
> 2.9.3



Re: [dpdk-dev] net/e1000: add pci ids for 82579LM

2018-01-13 Thread Zhang, Helin


> -Original Message-
> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Wenzhuo Lu
> Sent: Thursday, January 11, 2018 3:12 PM
> To: markus.th...@tu-ilmenau.de; dev@dpdk.org
> Subject: Re: [dpdk-dev] net/e1000: add pci ids for 82579LM
> 
> > Date: Fri, 13 Oct 2017 18:32:18 +0200
> >
> > This patch adds the PCI IDs for Intel 82579LM.
> >
> >Signed-off-by: Markus Theil 
> Thanks for the patch.
> 
> Acked-by: Wenzhuo Lu 
Applied to dpdk-next-net-intel with minor commit log changes, thanks!

/Helin


Re: [dpdk-dev] [PATCHv4 5/5] doc: Add ABI __experimental tag documentation

2018-01-13 Thread Thomas Monjalon
13/01/2018 01:28, Neil Horman:
> On Fri, Jan 12, 2018 at 03:55:10PM +, Ferruh Yigit wrote:
> > After this point agree to using EXPERIMENTAL tag in the version map as 
> > standard,
> > but it will be hard to maintain "API is experimental for first release" 
> > without
> > help of any automated tool.
> > 
> I completely agree, in fact I would say it is impossible to do without 
> tooling,
> with or without this change.  I think we need to do 1 of 2 things:
> 
> 1) Add some code to checkpatch.pl to put up a warning if any new apis are 
> added
> without marking them as experimental
> 
> 2) Change the documentation to be a suggestion rather than a requirement.
> 
> I'll look into doing (1), but I'm wondering if (2) is the more flexible way to
> go. I'm hesitant to enforce the initial marking of new APIs as experimental.
> Thoughts?

There will be always cases where we are sure that the experimental step
is not needed.
Even if it is required and checked by a tool, we can ignore it, right?
However, there is no big benefit of bypassing the experimental step.

I am for making mandatory the new API as experimental.
We will handle the exceptions case by case if any.


Re: [dpdk-dev] [PATCH 1/2] net/ixgbe: fix mailbox interrupt handler

2018-01-13 Thread Zhang, Helin


> -Original Message-
> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Qi Zhang
> Sent: Thursday, December 28, 2017 4:22 AM
> To: Lu, Wenzhuo
> Cc: dev@dpdk.org; Dai, Wei; Wang, Liang-min; Zhang, Qi Z; sta...@dpdk.org
> Subject: [dpdk-dev] [PATCH 1/2] net/ixgbe: fix mailbox interrupt handler
> 
> Mailbox interrupt handler only take care of PF reset notification, for other
> message ixgbe_read_mbx should not be called since it get chance to break the
> foreground VF to PF communication.
> This can be simply repeated by
> testpmd>rx_vlan rm all 0
> 
> Fixes: 77234603fba0 ("net/ixgbe: support VF mailbox interrupt for link
> up/down")
> Cc: sta...@dpdk.org
> 
> Signed-off-by: Qi Zhang 
Applied to dpdk-next-net-intel, with minor commit log changes. Thanks!

/Helin


Re: [dpdk-dev] [PATCH 2/2] net/e1000: fix mailbox interrupt handler

2018-01-13 Thread Zhang, Helin


> -Original Message-
> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Qi Zhang
> Sent: Thursday, December 28, 2017 4:23 AM
> To: Lu, Wenzhuo
> Cc: dev@dpdk.org; Dai, Wei; Wang, Liang-min; Zhang, Qi Z; sta...@dpdk.org
> Subject: [dpdk-dev] [PATCH 2/2] net/e1000: fix mailbox interrupt handler
> 
> Mailbox interrupt handler only take care of the PF reset notification, for 
> other
> message mbx->ops.read should not be called since it get chance to break the
> foreground VF to PF communication.
> 
> Fixes: 316f4f1adc2e ("net/igb: support VF mailbox interrupt for link up/down")
> Cc: sta...@dpdk.org
> 
> Signed-off-by: Qi Zhang 
Applied to dpdk-next-net-intel, with minor commit log changes. Thanks!

/Helin


Re: [dpdk-dev] DPDK-pktgen not giving expected throughput

2018-01-13 Thread Shailja Pandey

Hi,

Output of command ‘lspci | grep Ether’ is-

04:00.0 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit 
X540-AT2 (rev 01)
04:00.1 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit 
X540-AT2 (rev 01)
05:00.0 Ethernet controller: Intel Corporation Ethernet Controller XL710 for 
40GbE QSFP+ (rev 02)
05:00.1 Ethernet controller: Intel Corporation Ethernet Controller XL710 for 
40GbE QSFP+ (rev 02)

I tried running pktgen with this command
./app/app/x86_64-native-linuxapp-gcc/pktgen -c 0x -n 4 -- -T -P -m 
“[2,4,6:8,10,12].0, [14,16,18:20,22,24].1”

but I am unable to see the ports while using comma separated cores like 2,4,6. 
However when I tried with multiple CPUs, the result is still the same.

After allocating multiple cores, I also tried the range commands to distribute 
the packets on different cores(RSS).

 
/On Jan 11, 2018, at 12:45 AM, Shailja Pandey > wrote: />//>/I have also tried with multiple cpus using the command line- />//>/./app/app/x86_64-native-linuxapp-gcc/pktgen -c 0x -n 4 -- -T -P -m 
"[0:4].0,[5:8].1,[9-12].2,[13-16].3” /

What is port 0 and 1, I guess that port 2 and 3 are the 2x40 card. I guess I 
need the ‘lspci | grep Ether’ output too.

Lets assume port 2 and 3 are the 2 ports of 40G and the NIC is on PCI bus0 
attached to NUMA node 0.

May need to blacklist the first two port 0 and 1 to remove them from being used.

./app/app/x86_64-native-linuxapp-gcc/pktgen -c 0x -n 4 -- -T -P -m 
“[2,4,6:8,10,12].0, [14,16,18:20,22,24].1”

Now you have 6 cores per port 3 on TX and 3 on RX on each port. You maybe be 
able to get away with 4 per port with the speed of the machine you have.

Next we have to use the range command to make sure the packets are transmitted 
with a varying set of 5 tuples.

Do a help command an look at the range commands, there is an example in the 
test directory.

To see the range information use ‘page range’ and then you can modify each port.

After you are happy with the configuration use the ‘save ’ command to save the 
configuration. Then you can use ‘load ’ or add it to the command line with the -f 
 option.




//>/On Thursday 11 January 2018 10:20 AM, Shailja Pandey wrote: />>/The command line for Pktgen is- />>//>>/./app/app/x86_64-native-linuxapp-gcc/pktgen -c 0xfff -n 4 -- -T -P -m 
"[2].0,[4].1,[8].2,[10].3" />>//>>/Configuration of the machine is- />>//>>/Architecture: x86_64 />>/CPU op-mode(s): 32-bit, 64-bit />>/Byte Order: Little Endian />>/CPU(s): 32 />>/On-line CPU(s) list: 0-31 />>/Thread(s) per core: 2 />>/Core(s) per socket: 8 />>/Socket(s): 2 />>/NUMA node(s): 2 />>/Vendor ID: GenuineIntel />>/CPU family: 6 />>/Model: 63 />>/Model name: Intel(R) Xeon(R) CPU E5-2640 v3 @ 2.60GHz />>/Stepping: 2 />>/CPU MHz: 1204.226 />>/CPU max MHz: 3400. />>/CPU min MHz: 1200. />>/BogoMIPS: 5195.08 />>/Virtualization: VT-x />>/L1d cache: 32K />>/L1i cache: 32K />>/L2 cache: 256K />>/L3 cache: 20480K />>/NUMA node0 CPU(s): 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 />>/NUMA node1 CPU(s): 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 />>//>>/Linux XeonE5 4.4.0-93-generic />>//>>/Thanks! />>//>>//>>//>>//>>//>>/ > />>/On Jan 10, 2018, at 7:36 AM, Shailja Pandey >/ > wrote: />>//>>/ > />>//>>//>>/ > />>/Hi, />>//>>/ > />>//>>//>>/ > />>/We are performing experiments on Dell Poweredge R430 server, which is 
based on Haswell architecture based xeon-2640 v3 processor. We have 
attached XL 710 NIC(2x40 GbE) to the machine and expect 59 Mpps packet 
generation per port. But pktgen is able to generate only 21 Mpps per 
port and we are not sure about the problem. />>//>>/ > />>//>>//>>/ > />>/We are using DPDK version 16.07 and DPDK-pktgen version 3.1.0. Is there 
any way to generate packets from pktgen at 118 Mpps ? />>//>>//>>/What is the command line for Pktgen? />>//>>/Using a single CPU you should be able to generate about 21Mpps, which 
tells me the configuration is not correct. />>//>>/Pktgen needs N number of TX and RX cores to generate more traffic, plus 
the packets need to be spread across the cores using RSS. This means 
pktgen needs to be setup to generate packets with the 5 tuple difference 
to get all of the cores to receive the packets. />>//>>/I need to know the command line and the configuration of the machine to 
help. />>/ > />>//>>//>>/ > />>/-- />>//>>/ > />>//>>//>>/ > />>/Thanks, />>//>>/ > />>/Shailja />>//>>/ > />>//>>//>>//>>/Regards, />>/Keith />>//>>/-- />>//>>/Thanks, />>/Shailja />>//>//>/-- />//>/Thanks, />/Shailja />//

Regards,
Keith

--

Thanks,
Shailja



[dpdk-dev] compilation error on Suse 11 - LPM init of anon union

2018-01-13 Thread Thomas Monjalon
Hi,

There is a new compilation error since this commit in LPM:
http://dpdk.org/commit/b2e1c99
The brace has been removed because unnecessary with anonymous union.

This union is declared with RTE_STD_C11 for compatibility
with old compilers:
/** C extension macro for environments lacking C11 features. */
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L
#define RTE_STD_C11 __extension__   
 
#else
#define RTE_STD_C11
#endif

Unfortunately, it does not work on Suse 11 SP2 with GCC 4.5.1:
lib/librte_lpm/rte_lpm.c: In function ‘add_depth_big_v20’:
lib/librte_lpm/rte_lpm.c:886:4: error:
unknown field ‘group_idx’ specified in initializer

Curiously, the error is exactly the same with ICC 16.0.2:
http://dpdk.org/ml/archives/test-report/2018-January/038443.html
Is it really using different compilers in those 2 tests?

Someone to check the value of __STDC_VERSION__ with those compilers?
gcc -dM -E -xc /dev/null | grep STDC_VERSION

Thanks for the help


Re: [dpdk-dev] DPDK-pktgen not giving expected throughput

2018-01-13 Thread Wiles, Keith


> On Jan 13, 2018, at 12:51 PM, Shailja Pandey  wrote:
> 
> Hi,
> 
> Output of command ‘lspci | grep Ether’ is-
> 
> 04:00.0 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit 
> X540-AT2 (rev 01)
> 04:00.1 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit 
> X540-AT2 (rev 01)
> 05:00.0 Ethernet controller: Intel Corporation Ethernet Controller XL710 for 
> 40GbE QSFP+ (rev 02)
> 05:00.1 Ethernet controller: Intel Corporation Ethernet Controller XL710 for 
> 40GbE QSFP+ (rev 02)
> 
> I tried running pktgen with this command 
> ./app/app/x86_64-native-linuxapp-gcc/pktgen -c 0x -n 4 -- -T -P -m 
> “[2,4,6:8,10,12].0, [14,16,18:20,22,24].1”

Need to blacklist 04:00.0 and 04:00.1 using -b 04:00.0 -b 04:00.1 just before 
the  '--'

> 
> but I am unable to see the ports while using comma separated cores like 
> 2,4,6. However when I tried with multiple CPUs, the result is still the same.
> 
> After allocating multiple cores, I also tried the range commands to 
> distribute the packets on different cores(RSS).
> 
>  
> >
>  On Jan 11, 2018, at 12:45 AM, Shailja Pandey  > wrote:
> 
> >
>  
> 
> >
>  I have also tried with multiple cpus using the command line-
> 
> >
>  
> 
> >
>  ./app/app/x86_64-native-linuxapp-gcc/pktgen -c 0x -n 4 -- -T -P -m 
> "[0:4].0,[5:8].1,[9-12].2,[13-16].3” 
> 
> 
> What is port 0 and 1, I guess that port 2 and 3 are the 2x40 card. I guess I 
> need the ‘lspci | grep Ether’ output too.
> 
> Lets assume port 2 and 3 are the 2 ports of 40G and the NIC is on PCI bus0 
> attached to NUMA node 0.
> 
> May need to blacklist the first two port 0 and 1 to remove them from being 
> used.
> 
> ./app/app/x86_64-native-linuxapp-gcc/pktgen -c 0x -n 4 -- -T -P -m 
> “[2,4,6:8,10,12].0, [14,16,18:20,22,24].1”
> 
> Now you have 6 cores per port 3 on TX and 3 on RX on each port. You maybe be 
> able to get away with 4 per port with the speed of the machine you have.
> 
> Next we have to use the range command to make sure the packets are 
> transmitted with a varying set of 5 tuples.
> 
> Do a help command an look at the range commands, there is an example in the 
> test directory.
> 
> To see the range information use ‘page range’ and then you can modify each 
> port.
> 
> After you are happy with the configuration use the ‘save ’ command 
> to save the configuration. Then you can use ‘load ’ or add it to 
> the command line with the -f  option.
> 
> 
> 
> 
> >
>  
> 
> >
>  On Thursday 11 January 2018 10:20 AM, Shailja Pandey wrote:
> 
> >>
>  The command line for Pktgen is-
> 
> >>
>  
> 
> >>
>  ./app/app/x86_64-native-linuxapp-gcc/pktgen -c 0xfff -n 4 -- -T -P -m 
> "[2].0,[4].1,[8].2,[10].3"
> 
> >>
>  
> 
> >>
>  Configuration of the machine is-
> 
> >>
>  
> 
> >>
>  Architecture:  x86_64
> 
> >>
>  CPU op-mode(s):32-bit, 64-bit
> 
> >>
>  Byte Order:Little Endian
> 
> >>
>  CPU(s):32
> 
> >>
>  On-line CPU(s) list:   0-31
> 
> >>
>  Thread(s) per core:2
> 
> >>
>  Core(s) per socket:8
> 
> >>
>  Socket(s): 2
> 
> >>
>  NUMA node(s):  2
> 
> >>
>  Vendor ID: GenuineIntel
> 
> >>
>  CPU family:6
> 
> >>
>  Model: 63
> 
> >>
>  Model name:Intel(R) Xeon(R) CPU E5-2640 v3 @ 2.60GHz
> 
> >>
>  Stepping:  2
> 
> >>
>  CPU MHz:   1204.226
> 
> >>
>  CPU max MHz:   3400.
> 
> >>
>  CPU min MHz:   1200.
> 
> >>
>  BogoMIPS:  5195.08
> 
> >>
>  Virtualization:VT-x
> 
> >>
>  L1d cache: 32K
> 
> >>
>  L1i cache: 32K
> 
> >>
>  L2 cache:  256K
> 
> >>
>  L3 cache:  20480K
> 
> >>
>  NUMA node0 CPU(s): 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
> 
> >>
>  NUMA node1 CPU(s): 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
> 
> >>
>  
> 
> >>
>  Linux XeonE5 4.4.0-93-generic
> 
> >>
>  
> 
> >>
>  Thanks!
> 
> >>
>  
> 
> >>
>  
> 
> >>
>  
> 
> >>
>  
> 
> >>
>  
> 
> >>
>  >
> 
> >>
>   On Jan 10, 2018, at 7:36 AM, Shailja Pandey  
> >>
>  > wrote:
> 
> >>
>  
> 
> >>
>  >
> 
> >>
>   
> 
> >>
>  
> 
> >>
>  >
> 
> >>
>   Hi,
> 
> >>
>  
> 
> >>
>  >
> 
> >>
>   
> 
> >>
>  
> 
> >>
>  >
> 
> >>
>   We are performing experiments on Dell Poweredge R430 server, which is based 
> on Haswell architecture based xeon-2640 v3 processor. We have attached XL 710 
> NIC(2x40 GbE) to the machine and expect 59 Mpps packet generation per port. 
> But pktgen is able to generate only 21 Mpps per port and we are not sure 
> about the problem.
> 
> >>
>  
> 
> >>
>  >
> 
> >>
>   
> 
> >>
>  
> 
> >>
>  >
> 
> >>
>   We are using DPDK version 16.07 and DPDK-pktgen version 3.1.0. Is there any 
> way to generate packets from pktgen at 118 Mpps ?
> 
> >>
>  
> 
> >>
>  
> 
> >>
>  What is the command line for Pktgen?
> 
> >>
>  
> 
> >>
>  Using a single CPU you should be able to generate about 21Mpps, which tells 
> me the configuration 

Re: [dpdk-dev] [PATCH] eal/common: better likely() and unlikely()

2018-01-13 Thread Aleksey Baulin
This is an interesting question. Perhaps, even a philosophical one. :-)

'likely(pointer)' is a perfectly legal statement in C language, as well as
a concise one as
compared to a more explicit (and redundant/superfluous) 'likely(pointer !=
NULL)'. If you
_require_ this kind of explicitness in cases like this in the code style,
then I have no
argument here. However, I don't see that anywhere.

There're other cases of explicitness, with the most widespread being a
series of logical and
compare operations in one statement. For instance, 'if (a > b && a < c)'.
Explicitness would
require writing it like this: 'if ((a > b) && (a < c))'. I've seen cases on
this list where that was
frowned upon as it's totally unnecessary due to C operator precedence
rules, even though
those statements, I think, looked better to their authors (actually, they
do to me). Granted,
it didn't lead to compiler errors, which is the case with the current
implementation of 'likely()'.

So, my answer to the question is yes, it's to avoid making pointer
comparison explicit. I would
add though, that it is to avoid making a perfectly legal C statement an
illegal one, as with the
way the current macro is constructed, compiler emits an error when DPDK is
built. I write in C
for many years with the most time spent in kernels, Linux and not, and I
find it unnatural to
always add a redundant '!= NULL' just to satisfy the current macro
implementation. I would
have to accept that though if it's a requirement clearly stated somewhere
like a code style.

As for an extra instruction, I believe that everything in DPDK distribution
is compiled with
optimization. So the execution speed in not a concern here. Perhaps there
are cases where
it's compiled without optimization, like debugging, but then I believe it's
a non-issue.

Hope my explanations shed some more light on this patch. :-)


On Fri, Jan 12, 2018 at 6:35 PM, Thomas Monjalon 
wrote:

> 21/11/2017 08:05, Aleksey Baulin:
> > On Mon, Nov 20, 2017 at 4:36 PM, Wiles, Keith 
> wrote:
> > > > On Nov 19, 2017, at 4:16 PM, Aleksey Baulin <
> aleksey.bau...@gmail.com>
> > > wrote:
> > > > -#define unlikely(x)  __builtin_expect((x),0)
> > > > +#define unlikely(x)  __builtin_expect(!!(x), 0)
> > >
> > > I have not looked at the generated code, but does this add some extra
> > > instruction now to do the !!(x) ?
> >
> > Sorry for late response. Jim had given the correct answer already.
> > You won't get an extra instruction with compiler optimization turned on.
>
> So this patch is adding an instruction in not optimized binary.
> I don't understand the benefit.
> Is it just to avoid to make pointer comparison explicit?
> likely(pointer != NULL) looks better than likely(pointer).
>

-- 
Aleksey Baulin


Re: [dpdk-dev] [PATCH] eal/common: better likely() and unlikely()

2018-01-13 Thread Thomas Monjalon
Hi,

I moved your top-post below and did some comments inline.
More opinions are welcome.

13/01/2018 23:05, Aleksey Baulin:
> On Fri, Jan 12, 2018 at 6:35 PM, Thomas Monjalon 
> wrote:
> > 21/11/2017 08:05, Aleksey Baulin:
> > > On Mon, Nov 20, 2017 at 4:36 PM, Wiles, Keith 
> > wrote:
> > > > > On Nov 19, 2017, at 4:16 PM, Aleksey Baulin <
> > aleksey.bau...@gmail.com>
> > > > wrote:
> > > > > -#define unlikely(x)  __builtin_expect((x),0)
> > > > > +#define unlikely(x)  __builtin_expect(!!(x), 0)
> > > >
> > > > I have not looked at the generated code, but does this add some extra
> > > > instruction now to do the !!(x) ?
> > >
> > > Sorry for late response. Jim had given the correct answer already.
> > > You won't get an extra instruction with compiler optimization turned on.
> >
> > So this patch is adding an instruction in not optimized binary.
> > I don't understand the benefit.
> > Is it just to avoid to make pointer comparison explicit?
> > likely(pointer != NULL) looks better than likely(pointer).
> 
> This is an interesting question. Perhaps, even a philosophical one. :-)
> 
> 'likely(pointer)' is a perfectly legal statement in C language, as well as
> a concise one as
> compared to a more explicit (and redundant/superfluous) 'likely(pointer !=
> NULL)'. If you
> _require_ this kind of explicitness in cases like this in the code style,
> then I have no
> argument here. However, I don't see that anywhere.

It is stated here:
http://dpdk.org/doc/guides/contributing/coding_style.html#null-pointers

> There're other cases of explicitness, with the most widespread being a
> series of logical and
> compare operations in one statement. For instance, 'if (a > b && a < c)'.
> Explicitness would
> require writing it like this: 'if ((a > b) && (a < c))'. I've seen cases on
> this list where that was
> frowned upon as it's totally unnecessary due to C operator precedence
> rules, even though
> those statements, I think, looked better to their authors (actually, they
> do to me). Granted,
> it didn't lead to compiler errors, which is the case with the current
> implementation of 'likely()'.
> 
> So, my answer to the question is yes, it's to avoid making pointer
> comparison explicit. I would
> add though, that it is to avoid making a perfectly legal C statement an
> illegal one, as with the
> way the current macro is constructed, compiler emits an error when DPDK is
> built. I write in C
> for many years with the most time spent in kernels, Linux and not, and I
> find it unnatural to
> always add a redundant '!= NULL' just to satisfy the current macro
> implementation. I would
> have to accept that though if it's a requirement clearly stated somewhere
> like a code style.
> 
> As for an extra instruction, I believe that everything in DPDK distribution
> is compiled with
> optimization. So the execution speed in not a concern here. Perhaps there
> are cases where
> it's compiled without optimization, like debugging, but then I believe it's
> a non-issue.

Yes you're right about optimization.
But can we be 100% sure that it is always well optimized?

> Hope my explanations shed some more light on this patch. :-)

If we can be sure that there is no cost on optimized code,
I am not against this patch.
It may be especially useful when not complying to the DPDK
coding rules, like in applications.


Re: [dpdk-dev] [PATCH] eal/common: better likely() and unlikely()

2018-01-13 Thread Aleksey Baulin
Please see my comments inline.

On Sun, Jan 14, 2018 at 1:24 AM, Thomas Monjalon 
wrote:

> Hi,
>
> I moved your top-post below and did some comments inline.
> More opinions are welcome.
>
> 13/01/2018 23:05, Aleksey Baulin:
> > On Fri, Jan 12, 2018 at 6:35 PM, Thomas Monjalon 
> > wrote:
> > > 21/11/2017 08:05, Aleksey Baulin:
> > > > On Mon, Nov 20, 2017 at 4:36 PM, Wiles, Keith  >
> > > wrote:
> > > > > > On Nov 19, 2017, at 4:16 PM, Aleksey Baulin <
> > > aleksey.bau...@gmail.com>
> > > > > wrote:
> > > > > > -#define unlikely(x)  __builtin_expect((x),0)
> > > > > > +#define unlikely(x)  __builtin_expect(!!(x), 0)
> > > > >
> > > > > I have not looked at the generated code, but does this add some
> extra
> > > > > instruction now to do the !!(x) ?
> > > >
> > > > Sorry for late response. Jim had given the correct answer already.
> > > > You won't get an extra instruction with compiler optimization turned
> on.
> > >
> > > So this patch is adding an instruction in not optimized binary.
> > > I don't understand the benefit.
> > > Is it just to avoid to make pointer comparison explicit?
> > > likely(pointer != NULL) looks better than likely(pointer).
> >
> > This is an interesting question. Perhaps, even a philosophical one. :-)
> >
> > 'likely(pointer)' is a perfectly legal statement in C language, as well
> as
> > a concise one as
> > compared to a more explicit (and redundant/superfluous) 'likely(pointer
> !=
> > NULL)'. If you
> > _require_ this kind of explicitness in cases like this in the code style,
> > then I have no
> > argument here. However, I don't see that anywhere.
>
> It is stated here:
> http://dpdk.org/doc/guides/contributing/coding_style.
> html#null-pointers


​Oh, thanks for pointing that out! I am sincerely ashamed for missing it.​
I lose that argument as I certainly do submit to the coding style. My only
excuse is that I am actually developing an app and not the DPDK core.


> > There're other cases of explicitness, with the most widespread being a
> > series of logical and
> > compare operations in one statement. For instance, 'if (a > b && a < c)'.
> > Explicitness would
> > require writing it like this: 'if ((a > b) && (a < c))'. I've seen cases
> on
> > this list where that was
> > frowned upon as it's totally unnecessary due to C operator precedence
> > rules, even though
> > those statements, I think, looked better to their authors (actually, they
> > do to me). Granted,
> > it didn't lead to compiler errors, which is the case with the current
> > implementation of 'likely()'.
> >
> > So, my answer to the question is yes, it's to avoid making pointer
> > comparison explicit. I would
> > add though, that it is to avoid making a perfectly legal C statement an
> > illegal one, as with the
> > way the current macro is constructed, compiler emits an error when DPDK
> is
> > built. I write in C
> > for many years with the most time spent in kernels, Linux and not, and I
> > find it unnatural to
> > always add a redundant '!= NULL' just to satisfy the current macro
> > implementation. I would
> > have to accept that though if it's a requirement clearly stated somewhere
> > like a code style.
> >
> > As for an extra instruction, I believe that everything in DPDK
> distribution
> > is compiled with
> > optimization. So the execution speed in not a concern here. Perhaps there
> > are cases where
> > it's compiled without optimization, like debugging, but then I believe
> it's
> > a non-issue.
>
> Yes you're right about optimization.
> But can we be 100% sure that it is always well optimized?
>

​I believe we can. I hope we get other opinions as well.​

> Hope my explanations shed some more light on this patch. :-)
>
> If we can be sure that there is no cost on optimized code,
> I am not against this patch.
> It may be especially useful when not complying to the DPDK
> coding rules, like in applications.
>

​Yes, that's exactly my case. Thanks.​

-- 
Aleksey Baulin


Re: [dpdk-dev] [PATCH v2] vfio: noiommu check error handling

2018-01-13 Thread Thomas Monjalon
13/01/2018 13:15, Burakov, Anatoly:
> On 11-Jan-18 11:45 PM, Thomas Monjalon wrote:
> > 07/11/2017 10:50, Jonas Pfefferle1:
> >>> Is there something urgent for 17.11?
> >>> Or can it be refined in 18.02?
> >>
> >> Nothing urgent. We can refine this for 18.02.
> >>
> >>> Anatoly, any thought?
> > 
> > Anatoly, Jonas, how do you want to proceed with this patch?
> > 
> 
> I don't see anything to be refined here, it's a simple bug fix - code 
> assumes noiommu mode support is always available, when it might not be 
> the case on older kernels.

As a bug fix, the title must start with "fix" and a tag "Fixes:"
must be added to help with backport.
At the same time, the explanation of the bug must be added in
the commit log please.

Thanks


Re: [dpdk-dev] [RFC v2 3/5] ether: Add flow timeout support

2018-01-13 Thread Zhang, Qi Z
Hi Alex & Keith

Base on my further understanding about OVS requirement and the new 
device's capability.
I realize there is no strong point to have the timeout APIs from this 
patch, I'd like to withdraw it.
Thanks for all your comments that help me to think it over.

Regards
Qi

> -Original Message-
> From: Wiles, Keith
> Sent: Friday, December 22, 2017 10:06 PM
> To: Zhang, Qi Z 
> Cc: Alex Rosenbaum ;
> adrien.mazarg...@6wind.com; DPDK ; Doherty, Declan
> 
> Subject: Re: [dpdk-dev] [RFC v2 3/5] ether: Add flow timeout support
> 
> 
> 
> > On Dec 22, 2017, at 3:03 AM, Zhang, Qi Z  wrote:
> >
> > Alex:
> >
> >> -Original Message-
> >> From: Alex Rosenbaum [mailto:rosenbauma...@gmail.com]
> >> Sent: Thursday, December 21, 2017 9:59 PM
> >> To: Zhang, Qi Z 
> >> Cc: adrien.mazarg...@6wind.com; DPDK ; Doherty, Declan
> >> 
> >> Subject: Re: [dpdk-dev] [RFC v2 3/5] ether: Add flow timeout support
> >>
> >> On Thu, Dec 21, 2017 at 4:35 AM, Qi Zhang  wrote:
> >>> Add new APIs to support flow timeout, application is able to 1.
> >>> Setup the time duration of a flow, the flow is expected to be
> >>> deleted automatically when timeout.
> >>
> >> Can you explain how the application (OVS) is expected to use this API?
> >> It will help to better understand the motivation here...
> >
> > I think the purpose of the APIs is to expose the hardware feature that
> > support flow auto delete with a timeout.
> > As I know, for OVS, every flow in flow table will have time duration A
> > flow be offloaded to hardware is still required to be deleted in
> > specific time, I think these APIs help OVS to take advantage HW
> > feature and simplify the flow aging management
> >
> >>
> >> Are you trying to move the aging timer from application code into the PMD?
> >> or can your HW remove/disable/inactivate a flow at certain time
> >> semantics without software context?
> >
> > Yes, it for hardware feature.
> 
> We also need to support a software timeout feature here and not just a
> hardware one. The reason is to make the APIs consistent across all hardware. 
> If
> you are going to include hardware timeout then we need to add software
> supported timeout at the same time IMO.
> 
> >
> >>
> >> I would prefer to have the aging timer logic in a centralized
> >> location, leek the application itself or some DPDK library. instead
> >> of having each PMD implement its own software timers.
> >>
> >>
> >>> 3. Register a callback function when a flow is deleted due to timeout.
> >>
> >> Is the application 'struct rte_flow*' handle really deleted? or the
> >> flow was removed from HW, just in-active at this time?
> >
> > Here the flow is deleted, same thing happen as rte_flow_destroy and we
> > need to call rte_flow_create to re-enable the flow.
> > I will add more explanation to avoid confusion in next release.
> 
> Sorry, I little late into this thread, but we can not have 1000 callbacks for 
> each
> timeout and we need make sure we bunch up a number of timeouts at a time to
> make the feature more performant IMO. Maybe that discussed or address in
> the code.
> 
> >
> >>
> >> Can a flow be re-activated? or does this require a call to
> >> rte_flow_destory() and ret_flow_create()?
> >>
> >> Alex
> >
> > Thanks
> > Qi
> 
> Regards,
> Keith



Re: [dpdk-dev] [PATCH v2 1/4] net/ixgbevf: unregister irq handler when other interrupts not allowed.

2018-01-13 Thread Tonghao Zhang
On Fri, Jan 12, 2018 at 4:39 PM, Dai, Wei  wrote:
> Hi, Tonghao & Beilei
>
> The issue reported by Tonghao is caused by shortage of igb_uio.
> If you want to use Rx queue interrupt in your DPDK application,
> I suggest use VFIO-PCI to bind NIC port instead of igb_uio.
The performance of igb_uio is better than vfio-pic. So igb_uio is
a better solution. And the PF binded with igb_uio can use the rx queue
interrupt,
because it unregister the irq handler in eal-intr-thread via
rte_intr_callback_unregister.
Then the PF will also not handler the irq (mailbox) from VF, right ?

To support the feature of rx queue interrupt , the PF and VF binded
with igb_uio may unregister the other  irq.

> Currently igb_uio only support single event fd. This fd is triggered by
> both miscellaneous and RX queue interrupt in ixgbe VF.
> But same event fd is added to epoll fd of eal-intr-thread
> and also epoll fd of normal thread for Rx task, this cause the issue.
Yes, it's right

> If VFIO-PCI is used, different event fd for misc and Rx queue are created.
> By the way, your patch also lead to missing of PF to VF reset interrupt.


> So, I am sorry that I can't agree it now.
>
>
>> -Original Message-
>> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of
>> xiangxia.m@gmail.com
>> Sent: Friday, January 12, 2018 12:40 AM
>> To: Xing, Beilei ; dev@dpdk.org
>> Cc: Tonghao Zhang 
>> Subject: [dpdk-dev] [PATCH v2 1/4] net/ixgbevf: unregister irq handler when
>> other interrupts not allowed.
>>
>> From: Tonghao Zhang 
>>
>> When bind the ixgbe VF (e.g 82599 card) to igb_uio and enable the
>> rx-interrupt, there will be more than one epoll_wait on intr_handle.fd.
>> One is in "eal-intr-thread" thread, and the others are in the thread which 
>> call
>> the "rte_epoll_wait". The problem is that sometimes "eal-intr-thread" thread
>> will process the rx interrupt, and then rte_epoll_wait can't get the event
>> anymore, and the packets may be lost.
>>
>> The patch unregister the status interrupt handler in "eal-intr-thread"
>> thread and the ixgbe pf is in the same case.
>>
>> Signed-off-by: Tonghao Zhang 
>> Acked-by: Beilei Xing 
>> ---
>>  drivers/net/ixgbe/ixgbe_ethdev.c | 15 +++
>>  1 file changed, 15 insertions(+)
>>
>> diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c
>> b/drivers/net/ixgbe/ixgbe_ethdev.c
>> index ff19a56..0e056a2 100644
>> --- a/drivers/net/ixgbe/ixgbe_ethdev.c
>> +++ b/drivers/net/ixgbe/ixgbe_ethdev.c
>> @@ -5078,6 +5078,15 @@ static int
>> ixgbevf_dev_xstats_get_names(__rte_unused struct rte_eth_dev *dev,
>>   }
>>   ixgbevf_configure_msix(dev);
>>
>> + if (!rte_intr_allow_others(intr_handle)) {
>> + rte_intr_callback_unregister(intr_handle,
>> +  ixgbevf_dev_interrupt_handler,
>> +  dev);
>> + if (dev->data->dev_conf.intr_conf.lsc != 0)
>> + PMD_INIT_LOG(INFO, "lsc won't enable because of"
>> +  " no intr multiplex");
>> + }
>> +
>>   /* When a VF port is bound to VFIO-PCI, only miscellaneous interrupt
>>* is mapped to VFIO vector 0 in eth_ixgbevf_dev_init( ).
>>* If previous VFIO interrupt mapping setting in eth_ixgbevf_dev_init( 
>> )
>> @@ -5120,6 +5129,12 @@ static int
>> ixgbevf_dev_xstats_get_names(__rte_unused struct rte_eth_dev *dev,
>>
>>   ixgbe_dev_clear_queues(dev);
>>
>> + if (!rte_intr_allow_others(intr_handle))
>> + /* resume to the default handler */
>> + rte_intr_callback_register(intr_handle,
>> +ixgbevf_dev_interrupt_handler,
>> +(void *)dev);
>> +
>>   /* Clean datapath event and queue/vec mapping */
>>   rte_intr_efd_disable(intr_handle);
>>   if (intr_handle->intr_vec != NULL) {
>> --
>> 1.8.3.1
>


Re: [dpdk-dev] [PATCH v2] net/i40e: fix packet type parser issue

2018-01-13 Thread Zhang, Qi Z


> -Original Message-
> From: Xing, Beilei
> Sent: Friday, January 12, 2018 4:42 PM
> To: Zhang, Qi Z 
> Cc: dev@dpdk.org; sta...@dpdk.org
> Subject: [PATCH v2] net/i40e: fix packet type parser issue
> 
> Ptype mapping table will fail to update when loading PPP profile, fix the 
> issue
> via modifying metadata and adding check.
> This patch also adds parser for IPV4FRAG and IPV6FRAG.
> 
> Fixes: ab2e350c4f4b ("net/i40e: improve packet type parser")
> Cc: sta...@dpdk.org
> 
> Signed-off-by: Beilei Xing 
> ---
> 
> v2 changes:
>  - Add parser for IPV4FRAG and IPV6FRAG.
> 
>  drivers/net/i40e/i40e_ethdev.c  | 44
> ++---
>  drivers/net/i40e/rte_pmd_i40e.c |  6 --
>  2 files changed, 32 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
> index d80671a..69704e3 100644
> --- a/drivers/net/i40e/i40e_ethdev.c
> +++ b/drivers/net/i40e/i40e_ethdev.c
> @@ -11283,43 +11283,55 @@ i40e_update_customized_ptype(struct
> rte_eth_dev *dev, uint8_t *pkg,
>   continue;
>   memset(name, 0, sizeof(name));
>   strcpy(name, proto[n].name);
> - if (!strncmp(name, "PPPOE", 5))
> + if (!strncmp(name, "PPPoE", 5))
>   ptype_mapping[i].sw_ptype |=
>   RTE_PTYPE_L2_ETHER_PPPOE;
> - else if (!strncmp(name, "OIPV4", 5)) {
> + else if (!strncmp(name, "IPV4FRAG", 8) &&
> +  !in_tunnel) {
>   ptype_mapping[i].sw_ptype |=
>   RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
> - in_tunnel = true;
> - } else if (!strncmp(name, "IPV4", 4) &&
> -!in_tunnel)
>   ptype_mapping[i].sw_ptype |=
> - RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
> - else if (!strncmp(name, "IPV4FRAG", 8) &&
> -  in_tunnel) {
> + RTE_PTYPE_L4_FRAG;
> + } else if (!strncmp(name, "IPV4FRAG", 8) &&
> +in_tunnel) {
>   ptype_mapping[i].sw_ptype |=
>   RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN;
>   ptype_mapping[i].sw_ptype |=
>   RTE_PTYPE_INNER_L4_FRAG;
>   } else if (!strncmp(name, "IPV4", 4) &&
> +!in_tunnel)
> + ptype_mapping[i].sw_ptype |=
> + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
> + else if (!strncmp(name, "OIPV4", 5)) {
> + ptype_mapping[i].sw_ptype |=
> + RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
> + in_tunnel = true;
> + } else if (!strncmp(name, "IPV4", 4) &&
>  in_tunnel)
Since you are doing some reordering here, it's better to move IPV4 && in_tunnel 
case above OIPV4, to match IPV4 && !in_tunnel case.

Regards
Qi

>   ptype_mapping[i].sw_ptype |=
>   RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN;
> - else if (!strncmp(name, "OIPV6", 5)) {
> + else if (!strncmp(name, "IPV6FRAG", 8) &&
> +  !in_tunnel) {
>   ptype_mapping[i].sw_ptype |=
>   RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
> - in_tunnel = true;
> - } else if (!strncmp(name, "IPV6", 4) &&
> -!in_tunnel)
>   ptype_mapping[i].sw_ptype |=
> - RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
> - else if (!strncmp(name, "IPV6FRAG", 8) &&
> -  in_tunnel) {
> + RTE_PTYPE_L4_FRAG;
> + } else if (!strncmp(name, "IPV6FRAG", 8) &&
> +in_tunnel) {
>   ptype_mapping[i].sw_ptype |=
>   RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN;
>   ptype_mapping[i].sw_

Re: [dpdk-dev] [PATCH 4/5] net/ixgbevf: add check for rte_intr_enable.

2018-01-13 Thread Tonghao Zhang
On Fri, Jan 12, 2018 at 8:10 PM, Dai, Wei  wrote:
> Hi, Tonghao
> Thanks for your patch.
> It looks that same change can be applied to ixgbe_dev_interrupt_action( )
> and ixgbe_dev_interrupt_delayed_handler( ).
Yes, but the irq (e.g mailbox irq) is not  same as rx interrupt
handled frequently.
and it will not affect the performance.

 > Anyway, you can test all these changes with example/l3fwd-power.
> -Wei
I tested them with  example/l3fwd-power and our apps. It is ok.
I will send v3 to you.

>
>> -Original Message-
>> From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Xing, Beilei
>> Sent: Thursday, January 11, 2018 3:06 PM
>> To: Tonghao Zhang ; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH 4/5] net/ixgbevf: add check for
>> rte_intr_enable.
>>
>>
>>
>> > -Original Message-
>> > From: dev [mailto:dev-boun...@dpdk.org] On Behalf Of Tonghao Zhang
>> > Sent: Friday, January 5, 2018 10:11 PM
>> > To: dev@dpdk.org
>> > Cc: Tonghao Zhang 
>> > Subject: [dpdk-dev] [PATCH 4/5] net/ixgbevf: add check for
>> rte_intr_enable.
>>
>> The patch is not only for ixgbevf, but also for ixgbe, right?
>> so how about changing the title with net/ixgbe started?
>>
>> > When we bind the ixgbevf to vfio and call the
>> > rte_eth_dev_rx_intr_enable and rte_eth_dev_rx_intr_disable frequently,
>> > the interrupt setting
>> > (msi_set_mask_bit) will take more CPU as show below. rte_intr_enable
>> > call the ioctl to map the fd to interrupts frequently.
>> >
>> > perf top:
>> > 5.45%  [kernel]   [k] msi_set_mask_bit
>> >
>> > It is unnecessary to call the rte_intr_enable in
>> > ixgbe_dev_rx_queue_intr_enable. because the fds has been mapped to
>> > interrupt and not unmapped in ixgbe_dev_rx_queue_intr_disable.
>> >
>> > This patch add checks for using VFIO.  With the patch,
>> > msi_set_mask_bit is not listed in perl any more. Any suggestion will be
>> welcome.
>> >
>> > Signed-off-by: Tonghao Zhang 
>> > ---
>> >  drivers/net/ixgbe/ixgbe_ethdev.c | 9 +++--
>> >  1 file changed, 7 insertions(+), 2 deletions(-)
>> >
>> > diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c
>> > b/drivers/net/ixgbe/ixgbe_ethdev.c
>> > index e929235..79e4097 100644
>> > --- a/drivers/net/ixgbe/ixgbe_ethdev.c
>> > +++ b/drivers/net/ixgbe/ixgbe_ethdev.c
>> > @@ -5610,7 +5610,9 @@ static void ixgbevf_set_vfta_all(struct
>> > rte_eth_dev *dev, bool on)
>> > RTE_SET_USED(queue_id);
>> > IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, intr->mask);
>> >
>> > -   rte_intr_enable(intr_handle);
>> > +   if (intr_handle->type == RTE_INTR_HANDLE_UIO ||
>> > +   intr_handle->type == RTE_INTR_HANDLE_UIO_INTX)
>> > +   rte_intr_enable(intr_handle);
>>
>> For igb_uio, did you check if it's necessary to call rte_intr_enable every 
>> time?
>> Since rte interrupt is not disabled during ixgbevf_dev_rx_queue_intr_disable.
>>
>> >
>> > return 0;
>> >  }
>> > @@ -5659,7 +5661,10 @@ static void ixgbevf_set_vfta_all(struct
>> > rte_eth_dev *dev, bool on)
>> > mask &= (1 << (queue_id - 32));
>> > IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
>> > }
>> > -   rte_intr_enable(intr_handle);
>> > +
>> > +   if (intr_handle->type == RTE_INTR_HANDLE_UIO ||
>> > +   intr_handle->type == RTE_INTR_HANDLE_UIO_INTX)
>> > +   rte_intr_enable(intr_handle);
>>
>> The same comment as above.
>>
>> >
>> > return 0;
>> >  }
>> > --
>> > 1.8.3.1
>