[dpdk-dev] [PATCH] app/testpmd: flowgen support ip and udp fields

2021-08-08 Thread Zhihong Wang
This patch aims to:
 1. Add flexibility by supporting configurable ranges of IP & UDP src/dst fields
 2. Improve multi-core performance by using per-core variables

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 137 +++--
 1 file changed, 86 insertions(+), 51 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 3bf6e1ce97..5b389165bc 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -40,41 +40,37 @@
 
 #include "testpmd.h"
 
-/* hardcoded configuration (for now) */
-static unsigned cfg_n_flows= 1024;
-static uint32_t cfg_ip_src = RTE_IPV4(10, 254, 0, 0);
-static uint32_t cfg_ip_dst = RTE_IPV4(10, 253, 0, 0);
-static uint16_t cfg_udp_src= 1000;
-static uint16_t cfg_udp_dst= 1001;
+/*
+ * Hardcoded range for flow generation.
+ *
+ * Total number of flows =
+ * cfg_n_ip_src * cfg_n_ip_dst * cfg_n_udp_src * cfg_n_udp_dst
+ */
+static uint32_t cfg_n_ip_src = 100;
+static uint32_t cfg_n_ip_dst = 100;
+static uint32_t cfg_n_udp_src = 10;
+static uint32_t cfg_n_udp_dst = 10;
+
+/* Base ip and port for flow generation. */
+static uint32_t cfg_ip_src_base = RTE_IPV4(10, 254, 0, 0);
+static uint32_t cfg_ip_dst_base = RTE_IPV4(10, 253, 0, 0);
+static uint16_t cfg_udp_src_base = 1000;
+static uint16_t cfg_udp_dst_base = 1001;
 static struct rte_ether_addr cfg_ether_src =
{{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x00 }};
 static struct rte_ether_addr cfg_ether_dst =
{{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x01 }};
 
+RTE_DEFINE_PER_LCORE(uint32_t, _next_ip_src);
+RTE_DEFINE_PER_LCORE(uint32_t, _next_ip_dst);
+RTE_DEFINE_PER_LCORE(uint32_t, _next_udp_src);
+RTE_DEFINE_PER_LCORE(uint32_t, _next_udp_dst);
+
 #define IP_DEFTTL  64   /* from RFC 1340. */
 
 /* Use this type to inform GCC that ip_sum violates aliasing rules. */
 typedef unaligned_uint16_t alias_int16_t __attribute__((__may_alias__));
 
-static inline uint16_t
-ip_sum(const alias_int16_t *hdr, int hdr_len)
-{
-   uint32_t sum = 0;
-
-   while (hdr_len > 1)
-   {
-   sum += *hdr++;
-   if (sum & 0x80000000)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-   hdr_len -= 2;
-   }
-
-   while (sum >> 16)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-
-   return ~sum;
-}
-
 /*
  * Multi-flow generation mode.
  *
@@ -85,7 +81,7 @@ ip_sum(const alias_int16_t *hdr, int hdr_len)
 static void
 pkt_burst_flow_gen(struct fwd_stream *fs)
 {
-   unsigned pkt_size = tx_pkt_length - 4;  /* Adjust FCS */
+   uint32_t pkt_size = tx_pkt_length - 4; /* Adjust FCS */
struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
struct rte_mempool *mbp;
struct rte_mbuf  *pkt = NULL;
@@ -102,15 +98,18 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
uint32_t retry;
uint64_t tx_offloads;
uint64_t start_tsc = 0;
-   static int next_flow = 0;
+   uint32_t next_ip_src = RTE_PER_LCORE(_next_ip_src);
+   uint32_t next_ip_dst = RTE_PER_LCORE(_next_ip_dst);
+   uint32_t next_udp_src = RTE_PER_LCORE(_next_udp_src);
+   uint32_t next_udp_dst = RTE_PER_LCORE(_next_udp_dst);
 
get_start_cycles(&start_tsc);
 
/* Receive a burst of packets and discard them. */
nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 nb_pkt_per_burst);
+   inc_rx_burst_stats(fs, nb_rx);
fs->rx_packets += nb_rx;
-
for (i = 0; i < nb_rx; i++)
rte_pktmbuf_free(pkts_burst[i]);
 
@@ -144,7 +143,8 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
	rte_ether_addr_copy(&cfg_ether_dst, &eth_hdr->d_addr);
	rte_ether_addr_copy(&cfg_ether_src, &eth_hdr->s_addr);
-   eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+   eth_hdr->ether_type =
+   rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
 
/* Initialize IP header. */
ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
@@ -155,22 +155,30 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
ip_hdr->time_to_live= IP_DEFTTL;
ip_hdr->next_proto_id   = IPPROTO_UDP;
ip_hdr->packet_id   = 0;
-   ip_hdr->src_addr= rte_cpu_to_be_32(cfg_ip_src);
-   ip_hdr->dst_addr= rte_cpu_to_be_32(cfg_ip_dst +
-  next_flow);
-	ip_hdr->total_length	= RTE_CPU_TO_BE_16(pkt_size -
-						   sizeof(*eth_hdr));
-   ip_hdr->hdr_checksum= ip_sum

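The v1 hunk above is cut off in this archive. As a rough sketch of how the four per-lcore counters it introduces could walk the configured cfg_n_* ranges, the snippet below advances them as nested odometers, udp_dst fastest and ip_src slowest. The counter and range names follow the patch; the function name and the wrap-around order are assumptions for illustration, not the missing hunk.

/*
 * Sketch only: advance the per-lcore flow counters; assumes the
 * RTE_DEFINE_PER_LCORE() counters and cfg_n_* ranges declared above.
 */
static inline void
next_flow_tuple(void)
{
	if (++RTE_PER_LCORE(_next_udp_dst) < cfg_n_udp_dst)
		return;
	RTE_PER_LCORE(_next_udp_dst) = 0;

	if (++RTE_PER_LCORE(_next_udp_src) < cfg_n_udp_src)
		return;
	RTE_PER_LCORE(_next_udp_src) = 0;

	if (++RTE_PER_LCORE(_next_ip_dst) < cfg_n_ip_dst)
		return;
	RTE_PER_LCORE(_next_ip_dst) = 0;

	if (++RTE_PER_LCORE(_next_ip_src) < cfg_n_ip_src)
		return;
	RTE_PER_LCORE(_next_ip_src) = 0;
}
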
[dpdk-dev] [PATCH v2] app/testpmd: flowgen support ip and udp fields

2021-08-08 Thread Zhihong Wang
This patch aims to:
 1. Add flexibility by supporting configurable ranges of IP & UDP src/dst fields
 2. Improve multi-core performance by using per-core variables

v2: fix assigning ip header cksum

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 137 +++--
 1 file changed, 86 insertions(+), 51 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 3bf6e1ce97..333c3b2cd2 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -40,41 +40,37 @@
 
 #include "testpmd.h"
 
-/* hardcoded configuration (for now) */
-static unsigned cfg_n_flows= 1024;
-static uint32_t cfg_ip_src = RTE_IPV4(10, 254, 0, 0);
-static uint32_t cfg_ip_dst = RTE_IPV4(10, 253, 0, 0);
-static uint16_t cfg_udp_src= 1000;
-static uint16_t cfg_udp_dst= 1001;
+/*
+ * Hardcoded range for flow generation.
+ *
+ * Total number of flows =
+ * cfg_n_ip_src * cfg_n_ip_dst * cfg_n_udp_src * cfg_n_udp_dst
+ */
+static uint32_t cfg_n_ip_src = 100;
+static uint32_t cfg_n_ip_dst = 100;
+static uint32_t cfg_n_udp_src = 10;
+static uint32_t cfg_n_udp_dst = 10;
+
+/* Base ip and port for flow generation. */
+static uint32_t cfg_ip_src_base = RTE_IPV4(10, 254, 0, 0);
+static uint32_t cfg_ip_dst_base = RTE_IPV4(10, 253, 0, 0);
+static uint16_t cfg_udp_src_base = 1000;
+static uint16_t cfg_udp_dst_base = 1001;
 static struct rte_ether_addr cfg_ether_src =
{{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x00 }};
 static struct rte_ether_addr cfg_ether_dst =
{{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x01 }};
 
+RTE_DEFINE_PER_LCORE(uint32_t, _next_ip_src);
+RTE_DEFINE_PER_LCORE(uint32_t, _next_ip_dst);
+RTE_DEFINE_PER_LCORE(uint32_t, _next_udp_src);
+RTE_DEFINE_PER_LCORE(uint32_t, _next_udp_dst);
+
 #define IP_DEFTTL  64   /* from RFC 1340. */
 
 /* Use this type to inform GCC that ip_sum violates aliasing rules. */
 typedef unaligned_uint16_t alias_int16_t __attribute__((__may_alias__));
 
-static inline uint16_t
-ip_sum(const alias_int16_t *hdr, int hdr_len)
-{
-   uint32_t sum = 0;
-
-   while (hdr_len > 1)
-   {
-   sum += *hdr++;
-   if (sum & 0x80000000)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-   hdr_len -= 2;
-   }
-
-   while (sum >> 16)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-
-   return ~sum;
-}
-
 /*
  * Multi-flow generation mode.
  *
@@ -85,7 +81,7 @@ ip_sum(const alias_int16_t *hdr, int hdr_len)
 static void
 pkt_burst_flow_gen(struct fwd_stream *fs)
 {
-   unsigned pkt_size = tx_pkt_length - 4;  /* Adjust FCS */
+   uint32_t pkt_size = tx_pkt_length - 4; /* Adjust FCS */
struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
struct rte_mempool *mbp;
struct rte_mbuf  *pkt = NULL;
@@ -102,15 +98,18 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
uint32_t retry;
uint64_t tx_offloads;
uint64_t start_tsc = 0;
-   static int next_flow = 0;
+   uint32_t next_ip_src = RTE_PER_LCORE(_next_ip_src);
+   uint32_t next_ip_dst = RTE_PER_LCORE(_next_ip_dst);
+   uint32_t next_udp_src = RTE_PER_LCORE(_next_udp_src);
+   uint32_t next_udp_dst = RTE_PER_LCORE(_next_udp_dst);
 
get_start_cycles(&start_tsc);
 
/* Receive a burst of packets and discard them. */
nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 nb_pkt_per_burst);
+   inc_rx_burst_stats(fs, nb_rx);
fs->rx_packets += nb_rx;
-
for (i = 0; i < nb_rx; i++)
rte_pktmbuf_free(pkts_burst[i]);
 
@@ -144,7 +143,8 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
	rte_ether_addr_copy(&cfg_ether_dst, &eth_hdr->d_addr);
	rte_ether_addr_copy(&cfg_ether_src, &eth_hdr->s_addr);
-   eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+   eth_hdr->ether_type =
+   rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
 
/* Initialize IP header. */
ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
@@ -155,22 +155,30 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
ip_hdr->time_to_live= IP_DEFTTL;
ip_hdr->next_proto_id   = IPPROTO_UDP;
ip_hdr->packet_id   = 0;
-   ip_hdr->src_addr= rte_cpu_to_be_32(cfg_ip_src);
-   ip_hdr->dst_addr= rte_cpu_to_be_32(cfg_ip_dst +
-  next_flow);
-   ip_hdr->total_length= RTE_CPU_TO_BE_16(pkt_size -
-  
sizeo

[dpdk-dev] [PATCH v3 0/4] app/testpmd: flowgen fixes and improvements

2021-08-12 Thread Zhihong Wang
This series fixes a tx retry defect and improves multi-core performance
by using a per-core variable for flow indexing.

v3: split changes and keep original flow generation logic
v2: fix assigning ip header cksum

Zhihong Wang (4):
  app/testpmd: fix tx retry in flowgen
  app/testpmd: use rte_ipv4_cksum in flowgen
  app/testpmd: record rx_burst and fwd_dropped in flowgen
  app/testpmd: use per-core variable in flowgen

 app/test-pmd/flowgen.c | 47 ++-
 1 file changed, 14 insertions(+), 33 deletions(-)

-- 
2.11.0



[dpdk-dev] [PATCH v3 1/4] app/testpmd: fix tx retry in flowgen

2021-08-12 Thread Zhihong Wang
Fix the tx packet count used in the tx retry logic: bound the retry by
nb_pkt, not nb_rx.

Fixes: bf56fce1fb4 ("app/testpmd: add retry option")
Cc: sta...@dpdk.org

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 3bf6e1ce97..f2e6255c36 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -192,12 +192,12 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
/*
 * Retry if necessary
 */
-   if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
+   if (unlikely(nb_tx < nb_pkt) && fs->retry_enabled) {
retry = 0;
-   while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+   while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
rte_delay_us(burst_tx_delay_time);
nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
-   &pkts_burst[nb_tx], nb_rx - nb_tx);
+   &pkts_burst[nb_tx], nb_pkt - nb_tx);
}
}
fs->tx_packets += nb_tx;
-- 
2.11.0

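The reasoning behind the fix: flowgen transmits the nb_pkt packets it just built, independent of how many packets were received, so the retry loop must be bounded by nb_pkt rather than nb_rx. A self-contained sketch of the corrected retry shape follows; tx_burst_with_retry is a hypothetical helper for illustration, not part of the patch.

#include <rte_ethdev.h>
#include <rte_cycles.h>

/* Hypothetical helper illustrating the corrected retry bound (nb_pkt). */
static uint16_t
tx_burst_with_retry(uint16_t port, uint16_t queue, struct rte_mbuf **pkts,
		    uint16_t nb_pkt, uint32_t max_retry, uint32_t delay_us)
{
	uint16_t nb_tx = rte_eth_tx_burst(port, queue, pkts, nb_pkt);
	uint32_t retry = 0;

	/* Keep retrying the unsent tail until everything built is sent. */
	while (nb_tx < nb_pkt && retry++ < max_retry) {
		rte_delay_us(delay_us);
		nb_tx += rte_eth_tx_burst(port, queue, &pkts[nb_tx],
					  nb_pkt - nb_tx);
	}
	return nb_tx; /* caller frees pkts[nb_tx..nb_pkt-1] and counts drops */
}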


[dpdk-dev] [PATCH v3 2/4] app/testpmd: use rte_ipv4_cksum in flowgen

2021-08-12 Thread Zhihong Wang
Use the rte_ipv4_cksum() API to replace the local ip_sum() implementation.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 25 +
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index f2e6255c36..96d0cc79df 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -53,28 +53,6 @@ static struct rte_ether_addr cfg_ether_dst =
 
 #define IP_DEFTTL  64   /* from RFC 1340. */
 
-/* Use this type to inform GCC that ip_sum violates aliasing rules. */
-typedef unaligned_uint16_t alias_int16_t __attribute__((__may_alias__));
-
-static inline uint16_t
-ip_sum(const alias_int16_t *hdr, int hdr_len)
-{
-   uint32_t sum = 0;
-
-   while (hdr_len > 1)
-   {
-   sum += *hdr++;
-   if (sum & 0x80000000)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-   hdr_len -= 2;
-   }
-
-   while (sum >> 16)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-
-   return ~sum;
-}
-
 /*
  * Multi-flow generation mode.
  *
@@ -160,8 +138,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
   next_flow);
	ip_hdr->total_length	= RTE_CPU_TO_BE_16(pkt_size -
						   sizeof(*eth_hdr));
-	ip_hdr->hdr_checksum	= ip_sum((const alias_int16_t *)ip_hdr,
-					 sizeof(*ip_hdr));
+   ip_hdr->hdr_checksum= rte_ipv4_cksum(ip_hdr);
 
/* Initialize UDP header. */
udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
-- 
2.11.0

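One subtlety when switching to rte_ipv4_cksum(): it folds the one's-complement sum over the whole IPv4 header, so the hdr_checksum field must hold zero while the sum is taken (standard IPv4 checksum behaviour). A minimal, illustrative fill routine is sketched below; the function name and field values are placeholders, not the testpmd hunk.

#include <stdint.h>
#include <netinet/in.h>
#include <rte_ip.h>
#include <rte_byteorder.h>

/* Illustrative only: populate a minimal IPv4 header and checksum it. */
static void
fill_ipv4_hdr(struct rte_ipv4_hdr *ip_hdr, rte_be32_t src, rte_be32_t dst,
	      uint16_t total_len)
{
	ip_hdr->version_ihl	= 0x45;	/* IPv4, 5 x 32-bit words */
	ip_hdr->type_of_service	= 0;
	ip_hdr->fragment_offset	= 0;
	ip_hdr->time_to_live	= 64;
	ip_hdr->next_proto_id	= IPPROTO_UDP;
	ip_hdr->packet_id	= 0;
	ip_hdr->total_length	= rte_cpu_to_be_16(total_len);
	ip_hdr->src_addr	= src;
	ip_hdr->dst_addr	= dst;
	ip_hdr->hdr_checksum	= 0;	/* must be 0 while the sum is computed */
	ip_hdr->hdr_checksum	= rte_ipv4_cksum(ip_hdr);
}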


[dpdk-dev] [PATCH v3 3/4] app/testpmd: record rx_burst and fwd_dropped in flowgen

2021-08-12 Thread Zhihong Wang
Call inc_rx_burst_stats() for the rx operation, and record fwd_dropped.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 96d0cc79df..229794ee9c 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -87,6 +87,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
/* Receive a burst of packets and discard them. */
nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 nb_pkt_per_burst);
+   inc_rx_burst_stats(fs, nb_rx);
fs->rx_packets += nb_rx;
 
for (i = 0; i < nb_rx; i++)
@@ -186,6 +187,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
while (next_flow < 0)
next_flow += cfg_n_flows;
 
+   fs->fwd_dropped += nb_pkt - nb_tx;
do {
rte_pktmbuf_free(pkts_burst[nb_tx]);
} while (++nb_tx < nb_pkt);
-- 
2.11.0



[dpdk-dev] [PATCH v3 4/4] app/testpmd: use per-core variable in flowgen

2021-08-12 Thread Zhihong Wang
Use a per-core variable for flow indexing to avoid cache contention in
multi-core scenarios.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 229794ee9c..fc9dae4ab3 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -51,6 +51,8 @@ static struct rte_ether_addr cfg_ether_src =
 static struct rte_ether_addr cfg_ether_dst =
{{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x01 }};
 
+RTE_DEFINE_PER_LCORE(int, _next_flow);
+
 #define IP_DEFTTL  64   /* from RFC 1340. */
 
 /*
@@ -80,7 +82,6 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
uint32_t retry;
uint64_t tx_offloads;
uint64_t start_tsc = 0;
-   static int next_flow = 0;
 
get_start_cycles(&start_tsc);
 
@@ -136,7 +137,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
ip_hdr->packet_id   = 0;
ip_hdr->src_addr= rte_cpu_to_be_32(cfg_ip_src);
ip_hdr->dst_addr= rte_cpu_to_be_32(cfg_ip_dst +
-  next_flow);
+   RTE_PER_LCORE(_next_flow));
	ip_hdr->total_length	= RTE_CPU_TO_BE_16(pkt_size -
						   sizeof(*eth_hdr));
ip_hdr->hdr_checksum= rte_ipv4_cksum(ip_hdr);
@@ -163,7 +164,8 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
}
pkts_burst[nb_pkt] = pkt;
 
-   next_flow = (next_flow + 1) % cfg_n_flows;
+   RTE_PER_LCORE(_next_flow) = (RTE_PER_LCORE(_next_flow) + 1) %
+   cfg_n_flows;
}
 
nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
@@ -183,9 +185,9 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
inc_tx_burst_stats(fs, nb_tx);
if (unlikely(nb_tx < nb_pkt)) {
/* Back out the flow counter. */
-   next_flow -= (nb_pkt - nb_tx);
-   while (next_flow < 0)
-   next_flow += cfg_n_flows;
+   RTE_PER_LCORE(_next_flow) -= (nb_pkt - nb_tx);
+   while (RTE_PER_LCORE(_next_flow) < 0)
+   RTE_PER_LCORE(_next_flow) += cfg_n_flows;
 
fs->fwd_dropped += nb_pkt - nb_tx;
do {
-- 
2.11.0



[dpdk-dev] [PATCH v4 0/4] app/testpmd: flowgen fixes and improvements

2021-08-12 Thread Zhihong Wang
This series fixes a tx retry defect and improves multi-core performance
by using a per-core variable for flow indexing.

v4: use loop local variable to improve performance
v3: split changes and keep original flow generation logic
v2: fix assigning ip header cksum

Zhihong Wang (4):
  app/testpmd: fix tx retry in flowgen
  app/testpmd: use rte_ipv4_cksum in flowgen
  app/testpmd: record rx_burst and fwd_dropped in flowgen
  app/testpmd: use per-core variable in flowgen

 app/test-pmd/flowgen.c | 47 ++-
 1 file changed, 14 insertions(+), 33 deletions(-)

-- 
2.11.0



[dpdk-dev] [PATCH v4 1/4] app/testpmd: fix tx retry in flowgen

2021-08-12 Thread Zhihong Wang
Fix the tx packet count used in the tx retry logic: bound the retry by
nb_pkt, not nb_rx.

Fixes: bf56fce1fb4 ("app/testpmd: add retry option")
Cc: sta...@dpdk.org

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 3bf6e1ce97..f2e6255c36 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -192,12 +192,12 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
/*
 * Retry if necessary
 */
-   if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
+   if (unlikely(nb_tx < nb_pkt) && fs->retry_enabled) {
retry = 0;
-   while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+   while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
rte_delay_us(burst_tx_delay_time);
nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
-   &pkts_burst[nb_tx], nb_rx - nb_tx);
+   &pkts_burst[nb_tx], nb_pkt - nb_tx);
}
}
fs->tx_packets += nb_tx;
-- 
2.11.0



[dpdk-dev] [PATCH v4 2/4] app/testpmd: use rte_ipv4_cksum in flowgen

2021-08-12 Thread Zhihong Wang
Use the rte_ipv4_cksum() API to replace the local ip_sum() implementation.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 25 +
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index f2e6255c36..96d0cc79df 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -53,28 +53,6 @@ static struct rte_ether_addr cfg_ether_dst =
 
 #define IP_DEFTTL  64   /* from RFC 1340. */
 
-/* Use this type to inform GCC that ip_sum violates aliasing rules. */
-typedef unaligned_uint16_t alias_int16_t __attribute__((__may_alias__));
-
-static inline uint16_t
-ip_sum(const alias_int16_t *hdr, int hdr_len)
-{
-   uint32_t sum = 0;
-
-   while (hdr_len > 1)
-   {
-   sum += *hdr++;
-   if (sum & 0x80000000)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-   hdr_len -= 2;
-   }
-
-   while (sum >> 16)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-
-   return ~sum;
-}
-
 /*
  * Multi-flow generation mode.
  *
@@ -160,8 +138,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
   next_flow);
	ip_hdr->total_length	= RTE_CPU_TO_BE_16(pkt_size -
						   sizeof(*eth_hdr));
-	ip_hdr->hdr_checksum	= ip_sum((const alias_int16_t *)ip_hdr,
-					 sizeof(*ip_hdr));
+   ip_hdr->hdr_checksum= rte_ipv4_cksum(ip_hdr);
 
/* Initialize UDP header. */
udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
-- 
2.11.0



[dpdk-dev] [PATCH v4 3/4] app/testpmd: record rx_burst and fwd_dropped in flowgen

2021-08-12 Thread Zhihong Wang
Call inc_rx_burst_stats() for the rx operation, and record fwd_dropped.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 96d0cc79df..229794ee9c 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -87,6 +87,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
/* Receive a burst of packets and discard them. */
nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 nb_pkt_per_burst);
+   inc_rx_burst_stats(fs, nb_rx);
fs->rx_packets += nb_rx;
 
for (i = 0; i < nb_rx; i++)
@@ -186,6 +187,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
while (next_flow < 0)
next_flow += cfg_n_flows;
 
+   fs->fwd_dropped += nb_pkt - nb_tx;
do {
rte_pktmbuf_free(pkts_burst[nb_tx]);
} while (++nb_tx < nb_pkt);
-- 
2.11.0



[dpdk-dev] [PATCH v4 4/4] app/testpmd: use per-core variable in flowgen

2021-08-12 Thread Zhihong Wang
Use a per-core variable for flow indexing to avoid cache contention in
multi-core scenarios.

v4: use loop local variable to improve performance

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/flowgen.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 229794ee9c..b541485304 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -53,6 +53,8 @@ static struct rte_ether_addr cfg_ether_dst =
 
 #define IP_DEFTTL  64   /* from RFC 1340. */
 
+RTE_DEFINE_PER_LCORE(int, _next_flow);
+
 /*
  * Multi-flow generation mode.
  *
@@ -80,7 +82,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
uint32_t retry;
uint64_t tx_offloads;
uint64_t start_tsc = 0;
-   static int next_flow = 0;
+   int next_flow = RTE_PER_LCORE(_next_flow);
 
get_start_cycles(&start_tsc);
 
@@ -193,6 +195,8 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
} while (++nb_tx < nb_pkt);
}
 
+   RTE_PER_LCORE(_next_flow) = next_flow;
+
get_end_cycles(fs, start_tsc);
 }
 
-- 
2.11.0



[dpdk-dev] [PATCH v5 0/4] app/testpmd: flowgen fixes and improvements

2021-08-13 Thread Zhihong Wang
This series fixes a tx retry defect and improves multi-core performance
by using a per-core variable for flow indexing.

v5: replace modulo operation to improve performance
v4: use loop local variable to improve performance
v3: split changes and keep original flow generation logic
v2: fix assigning ip header cksum

Zhihong Wang (4):
  app/testpmd: fix tx retry in flowgen
  app/testpmd: use rte_ipv4_cksum in flowgen
  app/testpmd: record rx_burst and fwd_dropped in flowgen
  app/testpmd: use per-core variable in flowgen

 app/test-pmd/flowgen.c | 47 ++-
 1 file changed, 14 insertions(+), 33 deletions(-)

-- 
2.11.0



[dpdk-dev] [PATCH v5 1/4] app/testpmd: fix tx retry in flowgen

2021-08-13 Thread Zhihong Wang
Fix the tx packet count used in the tx retry logic: bound the retry by
nb_pkt, not nb_rx.

Fixes: bf56fce1fb45 ("app/testpmd: add retry option")
Cc: sta...@dpdk.org

Signed-off-by: Zhihong Wang 
Acked-by: Xiaoyun Li 
---
 app/test-pmd/flowgen.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 3bf6e1ce97..f2e6255c36 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -192,12 +192,12 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
/*
 * Retry if necessary
 */
-   if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
+   if (unlikely(nb_tx < nb_pkt) && fs->retry_enabled) {
retry = 0;
-   while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
+   while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
rte_delay_us(burst_tx_delay_time);
nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
-   &pkts_burst[nb_tx], nb_rx - nb_tx);
+   &pkts_burst[nb_tx], nb_pkt - nb_tx);
}
}
fs->tx_packets += nb_tx;
-- 
2.11.0



[dpdk-dev] [PATCH v5 2/4] app/testpmd: use rte_ipv4_cksum in flowgen

2021-08-13 Thread Zhihong Wang
Use the rte_ipv4_cksum() API to replace the local ip_sum() implementation.

Signed-off-by: Zhihong Wang 
Acked-by: Xiaoyun Li 
---
 app/test-pmd/flowgen.c | 25 +
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index f2e6255c36..96d0cc79df 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -53,28 +53,6 @@ static struct rte_ether_addr cfg_ether_dst =
 
 #define IP_DEFTTL  64   /* from RFC 1340. */
 
-/* Use this type to inform GCC that ip_sum violates aliasing rules. */
-typedef unaligned_uint16_t alias_int16_t __attribute__((__may_alias__));
-
-static inline uint16_t
-ip_sum(const alias_int16_t *hdr, int hdr_len)
-{
-   uint32_t sum = 0;
-
-   while (hdr_len > 1)
-   {
-   sum += *hdr++;
-   if (sum & 0x80000000)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-   hdr_len -= 2;
-   }
-
-   while (sum >> 16)
-   sum = (sum & 0xFFFF) + (sum >> 16);
-
-   return ~sum;
-}
-
 /*
  * Multi-flow generation mode.
  *
@@ -160,8 +138,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
   next_flow);
	ip_hdr->total_length	= RTE_CPU_TO_BE_16(pkt_size -
						   sizeof(*eth_hdr));
-	ip_hdr->hdr_checksum	= ip_sum((const alias_int16_t *)ip_hdr,
-					 sizeof(*ip_hdr));
+   ip_hdr->hdr_checksum= rte_ipv4_cksum(ip_hdr);
 
/* Initialize UDP header. */
udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
-- 
2.11.0



[dpdk-dev] [PATCH v5 3/4] app/testpmd: record rx_burst and fwd_dropped in flowgen

2021-08-13 Thread Zhihong Wang
Call inc_rx_burst_stats() for the rx operation, and record fwd_dropped.

Signed-off-by: Zhihong Wang 
Acked-by: Xiaoyun Li 
---
 app/test-pmd/flowgen.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 96d0cc79df..229794ee9c 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -87,6 +87,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
/* Receive a burst of packets and discard them. */
nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
 nb_pkt_per_burst);
+   inc_rx_burst_stats(fs, nb_rx);
fs->rx_packets += nb_rx;
 
for (i = 0; i < nb_rx; i++)
@@ -186,6 +187,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
while (next_flow < 0)
next_flow += cfg_n_flows;
 
+   fs->fwd_dropped += nb_pkt - nb_tx;
do {
rte_pktmbuf_free(pkts_burst[nb_tx]);
} while (++nb_tx < nb_pkt);
-- 
2.11.0



[dpdk-dev] [PATCH v5 4/4] app/testpmd: use per-core variable in flowgen

2021-08-13 Thread Zhihong Wang
Use a per-core variable for flow indexing to avoid cache contention in
multi-core scenarios.

Signed-off-by: Zhihong Wang 
Acked-by: Xiaoyun Li 
---
v5: replace modulo operation to improve performance
v4: use loop local variable to improve performance

 app/test-pmd/flowgen.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 229794ee9c..9348618d0f 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -53,6 +53,8 @@ static struct rte_ether_addr cfg_ether_dst =
 
 #define IP_DEFTTL  64   /* from RFC 1340. */
 
+RTE_DEFINE_PER_LCORE(int, _next_flow);
+
 /*
  * Multi-flow generation mode.
  *
@@ -80,7 +82,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
uint32_t retry;
uint64_t tx_offloads;
uint64_t start_tsc = 0;
-   static int next_flow = 0;
+   int next_flow = RTE_PER_LCORE(_next_flow);
 
get_start_cycles(&start_tsc);
 
@@ -163,7 +165,8 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
}
pkts_burst[nb_pkt] = pkt;
 
-   next_flow = (next_flow + 1) % cfg_n_flows;
+   if (++next_flow >= (int)cfg_n_flows)
+   next_flow = 0;
}
 
nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
@@ -193,6 +196,8 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
} while (++nb_tx < nb_pkt);
}
 
+   RTE_PER_LCORE(_next_flow) = next_flow;
+
get_end_cycles(fs, start_tsc);
 }
 
-- 
2.11.0

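Taken together, the v4 and v5 changes boil down to a small pattern: read the per-lcore counter into a stack variable once per burst, advance it with a compare-and-reset instead of a modulo, and write it back once at the end. A condensed sketch of that pattern follows; the helper name advance_flow_counter is illustrative, not the full forwarding function.

#include <stdint.h>
#include <rte_per_lcore.h>

RTE_DEFINE_PER_LCORE(int, _next_flow);

/* Sketch only: per-lcore counter read once, advanced locally, written once. */
static void
advance_flow_counter(uint16_t nb_pkt, uint32_t n_flows)
{
	int next_flow = RTE_PER_LCORE(_next_flow);	/* one TLS read per burst */
	uint16_t i;

	for (i = 0; i < nb_pkt; i++) {
		/* ... build packet i for flow 'next_flow' ... */
		if (++next_flow >= (int)n_flows)	/* cheaper than '%' */
			next_flow = 0;
	}

	RTE_PER_LCORE(_next_flow) = next_flow;		/* one TLS write per burst */
}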


[PATCH] ring: fix overflow in memory size calculation

2021-12-13 Thread Zhihong Wang
Parameters count and esize are both unsigned int, and their product can
legally exceed the range of unsigned int, leading to a runtime access
violation.

Fixes: cc4b218790f6 ("ring: support configurable element size")
Cc: sta...@dpdk.org

Signed-off-by: Zhihong Wang 
---
 lib/ring/rte_ring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/ring/rte_ring.c b/lib/ring/rte_ring.c
index f17bd966be..d1b80597af 100644
--- a/lib/ring/rte_ring.c
+++ b/lib/ring/rte_ring.c
@@ -75,7 +75,7 @@ rte_ring_get_memsize_elem(unsigned int esize, unsigned int 
count)
return -EINVAL;
}
 
-   sz = sizeof(struct rte_ring) + count * esize;
+   sz = sizeof(struct rte_ring) + (ssize_t)count * esize;
sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
return sz;
 }
-- 
2.11.0

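To make the overflow concrete, the standalone example below uses values that both fit in unsigned int but whose 32-bit product wraps to zero, while the widened product does not. The numbers are illustrative, assuming a 64-bit ssize_t as on typical Linux targets.

#include <stdio.h>
#include <sys/types.h>

int main(void)
{
	unsigned int esize = 16;
	unsigned int count = 0x10000000u;

	unsigned int wrapped = count * esize;		/* 32-bit wrap: 0 */
	ssize_t widened = (ssize_t)count * esize;	/* 0x100000000 = 4 GiB */

	printf("wrapped=%u widened=%zd\n", wrapped, widened);
	return 0;
}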


[PATCH] eal/linux: register mp hotplug callback after memory init

2023-05-30 Thread Zhihong Wang
A secondary process would crash if it tried to handle mp requests before
memory init, since globals such as eth_dev_shared_data_lock are not yet
accessible to it at that point.
---
 lib/eal/linux/eal.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index ae323cd492..a74d564597 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -1058,12 +1058,6 @@ rte_eal_init(int argc, char **argv)
}
}
 
-   /* register multi-process action callbacks for hotplug */
-   if (eal_mp_dev_hotplug_init() < 0) {
-   rte_eal_init_alert("failed to register mp callback for 
hotplug");
-   return -1;
-   }
-
if (rte_bus_scan()) {
rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
@@ -1221,6 +1215,12 @@ rte_eal_init(int argc, char **argv)
return -1;
}
 
+   /* register multi-process action callbacks for hotplug after memory init */
+   if (eal_mp_dev_hotplug_init() < 0) {
+   rte_eal_init_alert("failed to register mp callback for 
hotplug");
+   return -1;
+   }
+
if (rte_eal_tailqs_init() < 0) {
rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
-- 
2.11.0



[PATCH v2] eal/linux: register mp hotplug callback after memory init

2023-06-08 Thread Zhihong Wang
A secondary process would crash if it tried to handle mp requests before
memory init, since globals such as eth_dev_shared_data_lock are not yet
accessible to it at that point.

v2: add signed-off-by

Signed-off-by: Zhihong Wang 
Acked-by: Anatoly Burakov 
---
 lib/eal/linux/eal.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index ae323cd492..a74d564597 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -1058,12 +1058,6 @@ rte_eal_init(int argc, char **argv)
}
}
 
-   /* register multi-process action callbacks for hotplug */
-   if (eal_mp_dev_hotplug_init() < 0) {
-   rte_eal_init_alert("failed to register mp callback for 
hotplug");
-   return -1;
-   }
-
if (rte_bus_scan()) {
rte_eal_init_alert("Cannot scan the buses for devices");
rte_errno = ENODEV;
@@ -1221,6 +1215,12 @@ rte_eal_init(int argc, char **argv)
return -1;
}
 
+   /* register multi-process action callbacks for hotplug after memory init */
+   if (eal_mp_dev_hotplug_init() < 0) {
+   rte_eal_init_alert("failed to register mp callback for 
hotplug");
+   return -1;
+   }
+
if (rte_eal_tailqs_init() < 0) {
rte_eal_init_alert("Cannot init tail queues for objects");
rte_errno = EFAULT;
-- 
2.11.0



[dpdk-dev] [PATCH] app/testpmd: configurable number of flows in flowgen

2021-08-19 Thread Zhihong Wang
Make the number of flows in flowgen configurable by setting the parameter
--flowgen-flows=N.

Signed-off-by: Zhihong Wang 
---
Depends-on: series-18277 ("app/testpmd: flowgen fixes and improvements")

 app/test-pmd/flowgen.c| 22 ++
 app/test-pmd/parameters.c | 10 ++
 app/test-pmd/testpmd.c|  1 +
 app/test-pmd/testpmd.h|  1 +
 doc/guides/testpmd_app_ug/run_app.rst |  5 +
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 9348618d0f..9910a4dc53 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -40,8 +40,6 @@
 
 #include "testpmd.h"
 
-/* hardcoded configuration (for now) */
-static unsigned cfg_n_flows= 1024;
 static uint32_t cfg_ip_src = RTE_IPV4(10, 254, 0, 0);
 static uint32_t cfg_ip_dst = RTE_IPV4(10, 253, 0, 0);
 static uint16_t cfg_udp_src= 1000;
@@ -76,6 +74,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
uint64_t ol_flags = 0;
uint16_t nb_rx;
uint16_t nb_tx;
+   uint16_t nb_dropped;
uint16_t nb_pkt;
uint16_t nb_clones = nb_pkt_flowgen_clones;
uint16_t i;
@@ -165,7 +164,7 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
}
pkts_burst[nb_pkt] = pkt;
 
-   if (++next_flow >= (int)cfg_n_flows)
+   if (++next_flow >= nb_flows_flowgen)
next_flow = 0;
}
 
@@ -184,13 +183,14 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
fs->tx_packets += nb_tx;
 
inc_tx_burst_stats(fs, nb_tx);
-   if (unlikely(nb_tx < nb_pkt)) {
+   nb_dropped = nb_pkt - nb_tx;
+   if (unlikely(nb_dropped > 0)) {
/* Back out the flow counter. */
-   next_flow -= (nb_pkt - nb_tx);
+   next_flow -= nb_dropped;
while (next_flow < 0)
-   next_flow += cfg_n_flows;
+   next_flow += nb_flows_flowgen;
 
-   fs->fwd_dropped += nb_pkt - nb_tx;
+   fs->fwd_dropped += nb_dropped;
do {
rte_pktmbuf_free(pkts_burst[nb_tx]);
} while (++nb_tx < nb_pkt);
@@ -201,9 +201,15 @@ pkt_burst_flow_gen(struct fwd_stream *fs)
get_end_cycles(fs, start_tsc);
 }
 
+static void
+flowgen_begin(portid_t pi)
+{
+   printf("nb flows from port %u: %d\n", pi, nb_flows_flowgen);
+}
+
 struct fwd_engine flow_gen_engine = {
.fwd_mode_name  = "flowgen",
-   .port_fwd_begin = NULL,
+   .port_fwd_begin = flowgen_begin,
.port_fwd_end   = NULL,
.packet_fwd = pkt_burst_flow_gen,
 };
diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 7c13210f04..825275e683 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -143,6 +143,7 @@ usage(char* progname)
   "N.\n");
printf("  --burst=N: set the number of packets per burst to N.\n");
printf("  --flowgen-clones=N: set the number of single packet clones to 
send in flowgen mode. Should be less than burst value.\n");
+   printf("  --flowgen-flows=N: set the number of flows in flowgen mode to 
N (1 <= N <= 2147483647).\n");
printf("  --mbcache=N: set the cache of mbuf memory pool to N.\n");
printf("  --rxpt=N: set prefetch threshold register of RX rings to 
N.\n");
printf("  --rxht=N: set the host threshold register of RX rings to 
N.\n");
@@ -586,6 +587,7 @@ launch_args_parse(int argc, char** argv)
{ "hairpin-mode",   1, 0, 0 },
{ "burst",  1, 0, 0 },
{ "flowgen-clones", 1, 0, 0 },
+   { "flowgen-flows",  1, 0, 0 },
{ "mbcache",1, 0, 0 },
{ "txpt",   1, 0, 0 },
{ "txht",   1, 0, 0 },
@@ -1122,6 +1124,14 @@ launch_args_parse(int argc, char** argv)
rte_exit(EXIT_FAILURE,
 "clones must be >= 0 and <= 
current burst\n");
}
+   if (!strcmp(lgopts[opt_idx].name, "flowgen-flows")) {
+   n = atoi(optarg);
+   if (n > 0)
+   nb_flows_flowgen = (int) n;
+   else
+   rte_exit(EXIT_FAILURE,
+"flows must be >= 1\n");
+   }
if (!strcmp(lgopts[opt_idx].name, &

[dpdk-dev] [PATCH] vhost: optimize vhost memcpy

2016-12-02 Thread Zhihong Wang
This patch optimizes Vhost performance for large packets when the
Mergeable Rx buffer feature is enabled. It introduces a dedicated
memcpy function for vhost enqueue/dequeue to replace rte_memcpy.

The reason is that rte_memcpy is for general cases: it handles unaligned
copies and makes stores aligned, and it even makes loads aligned for
micro-architectures like Ivy Bridge. However, alignment handling comes at
a price: it introduces extra load/store instructions.

Vhost memcpy is rather special: the copy is aligned and remote, and there
is a header write alongside it which is also remote. In this case the
memcpy instruction stream should be simplified to reduce extra loads and
stores, which lowers the probability of pipeline stalls caused by a full
load/store buffer, lets the actual memcpy instructions be issued sooner,
and lets the H/W prefetcher go to work as early as possible.

Performance gain is visible when packet size:

 1. Larger than 512 bytes on AVX/SSE platforms like Ivy Bridge

 2. Larger than 256 bytes on AVX2 platforms like Haswell

 3. Larger than 512 bytes on AVX512 platforms like Skylake

Up to 20% gain can be achieved by this patch for PVP traffic. The
test can also be conducted without NIC, by using loopback traffic
between Vhost and Virtio. For example, increase TXONLY_DEF_PACKET_LEN
to the requested packet size in testpmd.h, rebuild and start testpmd
in both host and guest, then "start" on one side and "start tx_first 32"
on the other.


Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/virtio_net.c | 72 +--
 1 file changed, 69 insertions(+), 3 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 595f67c..cd6f21a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -50,6 +50,72 @@
 #define MAX_PKT_BURST 32
 #define VHOST_LOG_PAGE 4096
 
+/**
+ * This function is used for vhost memcpy, to replace rte_memcpy.
+ * The reason is that rte_memcpy is for general cases, while vhost
+ * memcpy is a rather special case: the copy is aligned and remote,
+ * and there is a header write alongside it which is also remote. In
+ * this case the memcpy instruction stream should be simplified to
+ * reduce extra load/store, lowering the probability of pipeline
+ * stalls caused by a full load/store buffer, so that the actual
+ * memcpy instructions can issue and the H/W prefetcher can go to
+ * work as early as possible.
+ */
+static inline void __attribute__((always_inline))
+vhost_memcpy(void *dst, const void *src, size_t n)
+{
+   /* Copy size <= 16 bytes */
+   if (n < 16) {
+   if (n & 0x01) {
+   *(uint8_t *)dst = *(const uint8_t *)src;
+   src = (const uint8_t *)src + 1;
+   dst = (uint8_t *)dst + 1;
+   }
+   if (n & 0x02) {
+   *(uint16_t *)dst = *(const uint16_t *)src;
+   src = (const uint16_t *)src + 1;
+   dst = (uint16_t *)dst + 1;
+   }
+   if (n & 0x04) {
+   *(uint32_t *)dst = *(const uint32_t *)src;
+   src = (const uint32_t *)src + 1;
+   dst = (uint32_t *)dst + 1;
+   }
+   if (n & 0x08)
+   *(uint64_t *)dst = *(const uint64_t *)src;
+
+   return;
+   }
+
+   /* Copy 16 <= size <= 32 bytes */
+   if (n <= 32) {
+   rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+   rte_mov16((uint8_t *)dst - 16 + n,
+   (const uint8_t *)src - 16 + n);
+
+   return;
+   }
+
+   /* Copy 32 < size <= 64 bytes */
+   if (n <= 64) {
+   rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+   rte_mov32((uint8_t *)dst - 32 + n,
+   (const uint8_t *)src - 32 + n);
+
+   return;
+   }
+
+   /* Copy 64 bytes blocks */
+   for (; n >= 64; n -= 64) {
+   rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+   dst = (uint8_t *)dst + 64;
+   src = (const uint8_t *)src + 64;
+   }
+
+   /* Copy whatever left */
+   rte_mov64((uint8_t *)dst - 64 + n,
+   (const uint8_t *)src - 64 + n);
+}
+
 static inline void __attribute__((always_inline))
 vhost_log_page(uint8_t *log_base, uint64_t page)
 {
@@ -246,7 +312,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc 
*descs,
}
 
cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-   rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
+   vhost_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
cpy_len);
vhost_log_

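Independent of the truncated tail of the hunk above, the copy routine it introduces leans on one trick worth calling out: for sizes between one and two vector widths it issues one store from the front and one from the back of the buffer, letting them overlap in the middle so any length in the range is covered without a loop. An isolated illustration with plain memcpy follows (assumptions: 16-byte width, 16 <= n <= 32; the helper name is hypothetical).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustration of the overlapping-tail trick used for the 16..32 byte case:
 * the two 16-byte stores may overlap in the middle, which is harmless since
 * both write the same source bytes to the same destination offsets. */
static inline void
copy_16_to_32(uint8_t *dst, const uint8_t *src, size_t n)
{
	memcpy(dst, src, 16);			/* front 16 bytes */
	memcpy(dst + n - 16, src + n - 16, 16);	/* back 16 bytes */
}
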
[dpdk-dev] [PATCH v2] eal: optimize aligned rte_memcpy

2016-12-07 Thread Zhihong Wang
This patch optimizes rte_memcpy for well-aligned cases, where both the
dst and src addresses are aligned to the maximum MOV width. It introduces
a dedicated function called rte_memcpy_aligned to handle the aligned
cases with a simplified instruction stream. The existing rte_memcpy is
renamed to rte_memcpy_generic, and the selection between the two is done
at the entry of rte_memcpy.

The existing rte_memcpy is for generic cases: it handles unaligned copies
and makes stores aligned, and it even makes loads aligned for
micro-architectures like Ivy Bridge. However, alignment handling comes at
a price: it adds extra load/store instructions, which can sometimes cause
complications.

Take DPDK vhost memcpy with the Mergeable Rx Buffer feature as an
example: the copy is aligned and remote, and there is a header write
alongside it which is also remote. In this case the memcpy instruction
stream should be simplified to reduce extra loads and stores, which
lowers the probability of pipeline stalls caused by a full load/store
buffer, lets the actual memcpy instructions be issued sooner, and lets
the H/W prefetcher go to work as early as possible.

This patch is tested on Ivy Bridge, Haswell and Skylake, it provides
up to 20% gain for Virtio Vhost PVP traffic, with packet size ranging
from 64 to 1500 bytes.

The test can also be conducted without NIC, by setting loopback
traffic between Virtio and Vhost. For example, modify the macro
TXONLY_DEF_PACKET_LEN to the requested packet size in testpmd.h,
rebuild and start testpmd in both host and guest, then "start" on
one side and "start tx_first 32" on the other.


Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 81 +-
 1 file changed, 78 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index b3bfc23..b9785e8 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -69,6 +69,8 @@ rte_memcpy(void *dst, const void *src, size_t n) 
__attribute__((always_inline));
 
 #ifdef RTE_MACHINE_CPUFLAG_AVX512F
 
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * AVX512 implementation below
  */
@@ -189,7 +191,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 }
 
 static inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
@@ -308,6 +310,8 @@ COPY_BLOCK_128_BACK63:
 
 #elif defined RTE_MACHINE_CPUFLAG_AVX2
 
+#define ALIGNMENT_MASK 0x1F
+
 /**
  * AVX2 implementation below
  */
@@ -387,7 +391,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 }
 
 static inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
@@ -499,6 +503,8 @@ COPY_BLOCK_128_BACK31:
 
 #else /* RTE_MACHINE_CPUFLAG */
 
+#define ALIGNMENT_MASK 0x0F
+
 /**
  * SSE & AVX implementation below
  */
@@ -677,7 +683,7 @@ __extension__ ({
  \
 })
 
 static inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
uintptr_t dstu = (uintptr_t)dst;
@@ -821,6 +827,75 @@ COPY_BLOCK_64_BACK15:
 
 #endif /* RTE_MACHINE_CPUFLAG */
 
+static inline void *
+rte_memcpy_aligned(void *dst, const void *src, size_t n)
+{
+   void *ret = dst;
+
+   /* Copy size <= 16 bytes */
+   if (n < 16) {
+   if (n & 0x01) {
+   *(uint8_t *)dst = *(const uint8_t *)src;
+   src = (const uint8_t *)src + 1;
+   dst = (uint8_t *)dst + 1;
+   }
+   if (n & 0x02) {
+   *(uint16_t *)dst = *(const uint16_t *)src;
+   src = (const uint16_t *)src + 1;
+   dst = (uint16_t *)dst + 1;
+   }
+   if (n & 0x04) {
+   *(uint32_t *)dst = *(const uint32_t *)src;
+   src = (const uint32_t *)src + 1;
+   dst = (uint32_t *)dst + 1;
+   }
+   if (n & 0x08)
+   *(uint64_t *)dst = *(const uint64_t *)src;
+
+   return ret;
+   }
+
+   /* Copy 16 <= size <= 32 bytes */
+   if (n <= 32) {
+   rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+   rte_mov16((uint8_t *)dst - 16 + n,
+   (const uint8_t *)src - 16 + n);
+
+   return ret;
+   }
+
+   /* Copy 32 < size <= 64 bytes */
+   if (n <= 64) {
+   rte_mov32((uint8_t *)dst, (const u

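The diff is cut off here, but the commit message states that the choice between the two variants happens at the entry of rte_memcpy. Below is a sketch of the dispatch that description implies, reusing the per-ISA ALIGNMENT_MASK values defined earlier in the patch; this is an assumed shape, not the verbatim missing hunk.

/* Assumed shape of the entry point: take the aligned fast path only when
 * both pointers are aligned to the ISA's widest MOV (ALIGNMENT_MASK above). */
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
		return rte_memcpy_aligned(dst, src, n);
	else
		return rte_memcpy_generic(dst, src, n);
}
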
[dpdk-dev] [PATCH v5 0/6] vhost: optimize enqueue

2016-09-08 Thread Zhihong Wang
This patch set optimizes the vhost enqueue function.

It implements the vhost logic from scratch into a single function designed
for high performance and good maintainability, and improves CPU efficiency
significantly by optimizing cache access, which means:

 *  Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.

 *  Better scalability can be achieved, since each vhost core can support
    more connections because it takes fewer cycles to handle each single
    frontend.

This patch set contains:

 1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

 2. A baseline patch to rewrite the vhost logic.

 3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.

In the existing code there are 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
maintenance effort.

Also, there's a compatibility issue in the existing code which causes
Windows VMs to hang when the mrg_rxbuf feature is turned on.

---
Changes in v5:

 1. Rebase to the latest branch.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

 4. Add details in commit log.

---
Changes in v4:

 1. Fix a Windows VM compatibility issue.

 2. Free shadow used ring in the right place.

 3. Add failure check for shadow used ring malloc.

 4. Refactor the code for clearer logic.

 5. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (6):
  vhost: fix windows vm hang
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost.c  |  20 +-
 lib/librte_vhost/vhost.h  |   6 +-
 lib/librte_vhost/vhost_user.c |  31 ++-
 lib/librte_vhost/virtio_net.c | 561 +++---
 4 files changed, 242 insertions(+), 376 deletions(-)

-- 
2.7.4



[dpdk-dev] [PATCH v5 1/6] vhost: fix windows vm hang

2016-09-08 Thread Zhihong Wang
This patch fixes a Windows VM compatibility issue in DPDK 16.07 vhost code
which causes the guest to hang once any packets are enqueued when mrg_rxbuf
is turned on. It does so by setting the right id and len in the used ring.

As defined in virtio spec 0.95 and 1.0, in each used ring element, id means
the index of the start of the used descriptor chain, and len means the total
length of the descriptor chain which was written to. In the 16.07 code,
however, the index of the last descriptor is assigned to id, and the length
of the last descriptor is assigned to len.

How to test?

 1. Start testpmd in the host with a vhost port.

 2. Start a Windows VM image with qemu and connect to the vhost port.

 3. Start io forwarding with tx_first in host testpmd.

For 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: 
Signed-off-by: Zhihong Wang 
---
Changes in v5:

 1. Add details in commit log.

 lib/librte_vhost/virtio_net.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..0d6e7d9 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+   uint32_t desc_chain_head;
+   uint32_t desc_chain_len;
uint32_t mbuf_offset, mbuf_avail;
uint32_t desc_offset, desc_avail;
uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,

desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
desc_offset = dev->vhost_hlen;
+   desc_chain_head = buf_vec[vec_idx].desc_idx;
+   desc_chain_len = desc_offset;

mbuf_avail  = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
/* done with current desc buf, get the next one */
if (desc_avail == 0) {
desc_idx = buf_vec[vec_idx].desc_idx;
+   vec_idx++;

if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
/* Update used ring with desc information */
used_idx = cur_idx++ & (vq->size - 1);
-   vq->used->ring[used_idx].id  = desc_idx;
-   vq->used->ring[used_idx].len = desc_offset;
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used,
 ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
+   desc_chain_head = buf_vec[vec_idx].desc_idx;
+   desc_chain_len = 0;
}

-   vec_idx++;
desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
if (unlikely(!desc_addr))
return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
mbuf_offset += cpy_len;
desc_avail  -= cpy_len;
desc_offset += cpy_len;
+   desc_chain_len += cpy_len;
}

used_idx = cur_idx & (vq->size - 1);
-   vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-   vq->used->ring[used_idx].len = desc_offset;
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
-- 
2.7.4

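The semantics the fix restores can be stated in one line: for each received buffer, the used element carries the index of the head of the descriptor chain and the total number of bytes written across the whole chain. For example, for a hypothetical chain 5 -> 9 -> 12 that received 1500 bytes in total, the element would look like this (example values only):

#include <linux/virtio_ring.h>

/* Hypothetical values: chain head is descriptor 5, and 1500 bytes were
 * written across the whole 5 -> 9 -> 12 chain. */
struct vring_used_elem elem = {
	.id  = 5,	/* index of the first descriptor in the chain */
	.len = 1500,	/* total length written to the chain */
};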


[dpdk-dev] [PATCH v5 2/6] vhost: rewrite enqueue

2016-09-08 Thread Zhihong Wang
This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.

Signed-off-by: Zhihong Wang 
---
Changes in v5:

 1. Rebase to the latest branch.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

---
Changes in v4:

 1. Refactor the code for clearer logic.

 2. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete in the same patch.

 lib/librte_vhost/virtio_net.c | 514 --
 1 file changed, 138 insertions(+), 376 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..6f63968 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t 
qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }

-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
cksum));
break;
}
+   } else {
+   net_hdr->flags   = 0;
+   net_hdr->csum_start  = 0;
+   net_hdr->csum_offset = 0;
}

if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,197 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+   } else {
+   net_hdr->gso_type = 0;
+   net_hdr->hdr_len  = 0;
+   net_hdr->gso_size = 0;
}
 }

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-   struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint32_t desc_chain_head, uint32_t desc_chain_len)
 {
-   if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-   *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-   else
-   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+   uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
+   vq->last_used_idx++;
+   vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+   ring[used_idx]),
+   sizeof(vq->used->ring[used_idx]));
 }

 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint16_t avail_idx, struct rte_mbuf *mbuf,
+   uint32_t is_mrg_rxbuf)
 {
-   uint32_t desc_avail, desc_offset;
-   uint32_t mbuf_avail, mbuf_offset;
-   uint32_t cpy_len;
+   struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
-   struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+   uint32_t desc_chain_head;
+   uint32_t desc_chain_len;
+   uint32_t desc_current;
+   uint32_t desc_offset;
+   uint32_t mbuf_len;
+   uint32_t mbuf_avail;
+   uint32_t cpy_len;
+   uint32_t num_buffers = 0;

-   desc = &vq->desc[desc_idx];
+   /* start with the first mbuf of the packet */
+   mbuf_len = rte_pktmbuf_data_len(mbuf);
+   mbuf_avail = mbuf_len;
+
+   /* get the current desc */
+   desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+   desc_chain_head = desc_current;
+   desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
-   /*
-* Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-* performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-* otherwise stores offset on the stack instead of in a register.
-*/
-   if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-   return -1;
+   if (unlikely(!desc_addr))
+   goto error;

-  

[dpdk-dev] [PATCH v5 3/6] vhost: remove useless volatile

2016-09-08 Thread Zhihong Wang
This patch removes useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index c2dfc3c..9707dfc 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_tsize;

/* Last index used on the available ring */
-   volatile uint16_t   last_used_idx;
+   uint16_tlast_used_idx;
 #define VIRTIO_INVALID_EVENTFD (-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD   (-2)

-- 
2.7.4



[dpdk-dev] [PATCH v5 4/6] vhost: add desc prefetch

2016-09-08 Thread Zhihong Wang
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/virtio_net.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 6f63968..b38f18f 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -302,6 +302,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
+   /* prefetch the next desc */
+   if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+   rte_prefetch0(&vq->desc[vq->avail->ring[
+   (vq->last_used_idx + 1) &
+   (vq->size - 1)]]);
+
if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
is_mrg_rxbuf))
break;
-- 
2.7.4



[dpdk-dev] [PATCH v5 5/6] vhost: batch update used ring

2016-09-08 Thread Zhihong Wang
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang 
---
Changes in v4:

 1. Free shadow used ring in the right place.

 2. Add failure check for shadow used ring malloc.

 lib/librte_vhost/vhost.c  | 20 --
 lib/librte_vhost/vhost.h  |  4 +++
 lib/librte_vhost/vhost_user.c | 31 +
 lib/librte_vhost/virtio_net.c | 64 +++
 4 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+   struct vhost_virtqueue *vq_0;
+   struct vhost_virtqueue *vq_1;
uint32_t i;

-   for (i = 0; i < dev->virt_qp_nb; i++)
-   rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+   for (i = 0; i < dev->virt_qp_nb; i++) {
+   vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+   if (vq_0->shadow_used_ring) {
+   rte_free(vq_0->shadow_used_ring);
+   vq_0->shadow_used_ring = NULL;
+   }
+
+   vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+   if (vq_1->shadow_used_ring) {
+   rte_free(vq_1->shadow_used_ring);
+   vq_1->shadow_used_ring = NULL;
+   }
+
+   /* malloc together, free together */
+   rte_free(vq_0);
+   }

rte_free(dev);
 }
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
+
+   /* Shadow used ring for performance */
+   struct vring_used_elem  *shadow_used_ring;
+   uint32_tshadow_used_idx;
 } __rte_cache_aligned;

 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
 vhost_user_set_vring_num(struct virtio_net *dev,
 struct vhost_vring_state *state)
 {
-   dev->virtqueue[state->index]->size = state->num;
+   struct vhost_virtqueue *vq;
+
+   vq = dev->virtqueue[state->index];
+   vq->size = state->num;
+   if (!vq->shadow_used_ring) {
+   vq->shadow_used_ring = rte_malloc(NULL,
+   vq->size * sizeof(struct vring_used_elem),
+   RTE_CACHE_LINE_SIZE);
+   if (!vq->shadow_used_ring) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "Failed to allocate memory"
+   " for shadow used ring.\n");
+   return -1;
+   }
+   }

return 0;
 }
@@ -611,14 +625,21 @@ static int
 vhost_user_get_vring_base(struct virtio_net *dev,
  struct vhost_vring_state *state)
 {
+   struct vhost_virtqueue *vq;
+
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev->vid);
}

+   vq = dev->virtqueue[state->index];
/* Here we are safe to get the last used index */
-   state->num = dev->virtqueue[state->index]->last_used_idx;
+   state->num = vq->last_used_idx;
+   if (vq->shadow_used_ring) {
+   rte_free(vq->shadow_used_ring);
+   vq->shadow_used_ring = NULL;
+   }

RTE_LOG(INFO, VHOST_CONFIG,
"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 * sent and only sent in vhost_vring_stop.
 * TODO: cleanup the vring, it isn't usable since here.
 */
-   if (dev->virtqueue[state->index]->kickfd >= 0)
-   close(dev->virtqueue[state->index]->kickfd);
+   if (vq->kickfd >= 0)
+   close(vq->kickfd);

-   dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+   vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

return 0;
 }
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index b38f18f..e9f6353 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -134,17 +13

[dpdk-dev] [PATCH v5 6/6] vhost: optimize cache access

2016-09-08 Thread Zhihong Wang
This patch reorders the code to delay the virtio header write to optimize
cache access efficiency for cases where the mrg_rxbuf feature is turned on.
It reduces CPU pipeline stall cycles significantly.

Signed-off-by: Zhihong Wang 
---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 lib/librte_vhost/virtio_net.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e9f6353..0086bcb 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -197,6 +197,7 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint32_t mbuf_len;
uint32_t mbuf_avail;
uint32_t cpy_len;
+   uint32_t copy_virtio_hdr;
uint32_t num_buffers = 0;

/* start with the first mbuf of the packet */
@@ -211,12 +212,12 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

-   /* handle virtio header */
+   /*
+* handle virtio header, the actual write operation is delayed
+* for cache optimization, to reduce CPU pipeline stall cycles.
+*/
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
-   if (is_mrg_rxbuf)
-   virtio_hdr->num_buffers = 1;
-
+   copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
desc_offset = dev->vhost_hlen;
@@ -266,8 +267,15 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto error;
}

-   /* copy mbuf data */
+   /* copy virtio header and mbuf data */
cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+   if (copy_virtio_hdr) {
+   copy_virtio_hdr = 0;
+   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   if (is_mrg_rxbuf)
+   virtio_hdr->num_buffers = num_buffers + 1;
+   }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
-- 
2.7.4



[dpdk-dev] [PATCH v6 0/6] vhost: optimize enqueue

2016-09-19 Thread Zhihong Wang
This patch set optimizes the vhost enqueue function.

It reimplements the vhost enqueue logic from scratch as a single function
designed for high performance and good maintainability, and improves CPU
efficiency significantly by optimizing cache access, which means:

 *  Higher maximum throughput can be achieved for fast frontends like DPDK
virtio pmd.

 *  Better scalability can be achieved: each vhost core can support more
    connections because it takes fewer cycles to handle each single
    frontend.

This patch set contains:

 1. A Windows VM compatibility fix for vhost enqueue in 16.07 release.

 2. A baseline patch to rewrite the vhost logic.

 3. A series of optimization patches added upon the baseline.

The main optimization techniques are:

 1. Reorder code to reduce CPU pipeline stall cycles.

 2. Batch update the used ring for better efficiency.

 3. Prefetch descriptor to hide cache latency.

 4. Remove useless volatile attribute to allow compiler optimization.

Code reordering and batch used ring update bring most of the performance
improvements.
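
For reference, here is a minimal C sketch of the batched used ring update
idea (technique 2 above). It assumes the shadow_used_ring / shadow_used_idx
fields added to struct vhost_virtqueue later in this series; the function
names are illustrative and not the exact code in the patches.

/* Accumulate one completed descriptor chain in a host-local shadow array
 * instead of writing the guest-visible used ring immediately.
 */
static inline void
shadow_used_ring_add(struct vhost_virtqueue *vq,
		uint32_t desc_chain_head, uint32_t desc_chain_len)
{
	vq->shadow_used_ring[vq->shadow_used_idx].id  = desc_chain_head;
	vq->shadow_used_ring[vq->shadow_used_idx].len = desc_chain_len;
	vq->shadow_used_idx++;
}

/* Flush the shadow entries to the used ring once per burst, so the
 * remote stores to guest memory are batched together.
 */
static inline void
shadow_used_ring_flush(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	uint32_t i;
	uint32_t used_idx;

	for (i = 0; i < vq->shadow_used_idx; i++) {
		used_idx = (vq->last_used_idx + i) & (vq->size - 1);
		vq->used->ring[used_idx] = vq->shadow_used_ring[i];
		vhost_log_used_vring(dev, vq,
				offsetof(struct vring_used, ring[used_idx]),
				sizeof(vq->used->ring[used_idx]));
	}
	rte_smp_wmb();
	vq->last_used_idx += vq->shadow_used_idx;
	*(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
	vq->shadow_used_idx = 0;
}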

In the existing code there are 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

The performance of the existing code is not optimal, especially when the
mrg_rxbuf feature is turned on. Besides, having 2 callback paths increases
maintenance effort.

Also, there's a compatibility issue in the existing code which causes
Windows VMs to hang when the mrg_rxbuf feature is turned on.

---
Changes in v6:

 1. Merge duplicated code.

 2. Introduce a function for used ring write.

 3. Add necessary comments.

---
Changes in v5:

 1. Rebase to dpdk-next-virtio master.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

 4. Add details in commit log.

---
Changes in v4:

 1. Fix a Windows VM compatibility issue.

 2. Free shadow used ring in the right place.

 3. Add failure check for shadow used ring malloc.

 4. Refactor the code for clearer logic.

 5. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 3. Rewrite enqueue and delete the obsolete code in the same patch.

---
Changes in v2:

 1. Split the big function into several small ones.

 2. Use multiple patches to explain each optimization.

 3. Add comments.

Zhihong Wang (6):
  vhost: fix windows vm hang
  vhost: rewrite enqueue
  vhost: remove useless volatile
  vhost: add desc prefetch
  vhost: batch update used ring
  vhost: optimize cache access

 lib/librte_vhost/vhost.c  |  20 +-
 lib/librte_vhost/vhost.h  |   6 +-
 lib/librte_vhost/vhost_user.c |  31 ++-
 lib/librte_vhost/virtio_net.c | 541 ++
 4 files changed, 225 insertions(+), 373 deletions(-)

-- 
2.7.4



[dpdk-dev] [PATCH v6 1/6] vhost: fix windows vm hang

2016-09-19 Thread Zhihong Wang
This patch fixes a Windows VM compatibility issue in the DPDK 16.07 vhost
code, which causes the guest to hang once any packets are enqueued when
mrg_rxbuf is turned on, by setting the right id and len in the used ring.

As defined in virtio spec 0.95 and 1.0, in each used ring element, id means
the index of the start of the used descriptor chain, and len means the total
length of the descriptor chain which was written to. In the 16.07 code,
however, the index of the last descriptor is assigned to id, and the length
of the last descriptor is assigned to len.
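
For reference, this is the used ring element layout the spec describes
(comments added here for illustration, field names as in the virtio ring
definitions):

struct vring_used_elem {
	uint32_t id;   /* index of the HEAD of the used descriptor chain */
	uint32_t len;  /* TOTAL number of bytes written to that chain */
};

The fix below tracks desc_chain_head and desc_chain_len while walking the
chain and writes those values instead of the last descriptor's index and
length.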

How to test?

 1. Start testpmd in the host with a vhost port.

 2. Start a Windows VM image with qemu and connect to the vhost port.

 3. Start io forwarding with tx_first in host testpmd.

With the 16.07 code, the Windows VM will hang once any packets are enqueued.

Cc: 
Signed-off-by: Zhihong Wang 
---
Changes in v5:

 1. Add details in commit log.

 lib/librte_vhost/virtio_net.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..0d6e7d9 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -384,6 +384,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint16_t start_idx = vq->last_used_idx;
uint16_t cur_idx = start_idx;
uint64_t desc_addr;
+   uint32_t desc_chain_head;
+   uint32_t desc_chain_len;
uint32_t mbuf_offset, mbuf_avail;
uint32_t desc_offset, desc_avail;
uint32_t cpy_len;
@@ -412,6 +414,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct 
vhost_virtqueue *vq,

desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
desc_offset = dev->vhost_hlen;
+   desc_chain_head = buf_vec[vec_idx].desc_idx;
+   desc_chain_len = desc_offset;

mbuf_avail  = rte_pktmbuf_data_len(m);
mbuf_offset = 0;
@@ -419,19 +423,21 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
/* done with current desc buf, get the next one */
if (desc_avail == 0) {
desc_idx = buf_vec[vec_idx].desc_idx;
+   vec_idx++;

if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
/* Update used ring with desc information */
used_idx = cur_idx++ & (vq->size - 1);
-   vq->used->ring[used_idx].id  = desc_idx;
-   vq->used->ring[used_idx].len = desc_offset;
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used,
 ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
+   desc_chain_head = buf_vec[vec_idx].desc_idx;
+   desc_chain_len = 0;
}

-   vec_idx++;
desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
if (unlikely(!desc_addr))
return 0;
@@ -463,11 +469,12 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, 
struct vhost_virtqueue *vq,
mbuf_offset += cpy_len;
desc_avail  -= cpy_len;
desc_offset += cpy_len;
+   desc_chain_len += cpy_len;
}

used_idx = cur_idx & (vq->size - 1);
-   vq->used->ring[used_idx].id = buf_vec[vec_idx].desc_idx;
-   vq->used->ring[used_idx].len = desc_offset;
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
-- 
2.7.4



[dpdk-dev] [PATCH v6 2/6] vhost: rewrite enqueue

2016-09-19 Thread Zhihong Wang
This patch reimplements the vhost enqueue logic from scratch as a single
function to improve maintainability. This is the baseline version of the new
code; more optimizations will be added in the following patches of this
patch set.

In the existing code there are 2 callbacks for vhost enqueue:

 *  virtio_dev_merge_rx for mrg_rxbuf turned on cases.

 *  virtio_dev_rx for mrg_rxbuf turned off cases.

Having 2 callback paths increases maintenance effort. Also, the performance
of the existing code is not optimal, especially when the mrg_rxbuf feature
is turned on.

Signed-off-by: Zhihong Wang 
---
Changes in v6:

 1. Merge duplicated code.

 2. Add necessary comments.

---
Changes in v5:

 1. Rebase to dpdk-next-virtio master.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

---
Changes in v4:

 1. Refactor the code for clearer logic.

 2. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete code in the same patch.

 lib/librte_vhost/virtio_net.c | 508 +++---
 1 file changed, 134 insertions(+), 374 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..0ada32b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t 
qp_nb)
return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }

-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
cksum));
break;
}
+   } else {
+   net_hdr->flags   = 0;
+   net_hdr->csum_start  = 0;
+   net_hdr->csum_offset = 0;
}

if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,195 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct 
virtio_net_hdr *net_hdr)
net_hdr->gso_size = m_buf->tso_segsz;
net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
+ m_buf->l4_len;
+   } else {
+   net_hdr->gso_type = 0;
+   net_hdr->hdr_len  = 0;
+   net_hdr->gso_size = 0;
}
 }

-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-   struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint32_t desc_chain_head, uint32_t desc_chain_len)
 {
-   if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-   *(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-   else
-   *(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+   uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+   vq->used->ring[used_idx].id = desc_chain_head;
+   vq->used->ring[used_idx].len = desc_chain_len;
+   vq->last_used_idx++;
+   vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+   ring[used_idx]),
+   sizeof(vq->used->ring[used_idx]));
 }

 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
- struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+   uint16_t avail_idx, struct rte_mbuf *mbuf,
+   uint32_t is_mrg_rxbuf)
 {
-   uint32_t desc_avail, desc_offset;
-   uint32_t mbuf_avail, mbuf_offset;
-   uint32_t cpy_len;
+   struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
struct vring_desc *desc;
uint64_t desc_addr;
-   struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+   uint32_t desc_chain_head;
+   uint32_t desc_chain_len;
+   uint32_t desc_current;
+   uint32_t desc_offset;
+   uint32_t mbuf_len;
+   uint32_t mbuf_avail;
+   uint32_t cpy_len;
+   uint32_t num_buffers = 0;

-   desc = &vq->desc[desc_idx];
+   /* start with the first mbuf of the packet */
+   mbuf_len = rte_pktmbuf_data_len(mbuf);
+   mbuf_avail = mbuf_len;
+
+   /* get the current desc */
+   desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+   desc_chain_head = desc_current;
+   desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
-   /*
-* Checking of 'desc_addr' pla

[dpdk-dev] [PATCH v6 3/6] vhost: remove useless volatile

2016-09-19 Thread Zhihong Wang
This patch removes a useless volatile attribute to allow compiler
optimization.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index c2dfc3c..9707dfc 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -71,7 +71,7 @@ struct vhost_virtqueue {
uint32_tsize;

/* Last index used on the available ring */
-   volatile uint16_t   last_used_idx;
+   uint16_tlast_used_idx;
 #define VIRTIO_INVALID_EVENTFD (-1)
 #define VIRTIO_UNINITIALIZED_EVENTFD   (-2)

-- 
2.7.4



[dpdk-dev] [PATCH v6 4/6] vhost: add desc prefetch

2016-09-19 Thread Zhihong Wang
This patch adds descriptor prefetch to hide cache access latency.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/virtio_net.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0ada32b..f32a143 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -300,6 +300,12 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
/* start enqueuing packets 1 by 1 */
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
while (pkt_left && avail_idx != vq->last_used_idx) {
+   /* prefetch the next desc */
+   if (pkt_left > 1 && avail_idx != vq->last_used_idx + 1)
+   rte_prefetch0(&vq->desc[vq->avail->ring[
+   (vq->last_used_idx + 1) &
+   (vq->size - 1)]]);
+
if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
is_mrg_rxbuf))
break;
-- 
2.7.4



[dpdk-dev] [PATCH v6 5/6] vhost: batch update used ring

2016-09-19 Thread Zhihong Wang
This patch enables batch update of the used ring for better efficiency.

Signed-off-by: Zhihong Wang 
---
Changes in v6:

 1. Introduce a function for used ring write.

---
Changes in v4:

 1. Free shadow used ring in the right place.

 2. Add failure check for shadow used ring malloc.

 lib/librte_vhost/vhost.c  | 20 +++--
 lib/librte_vhost/vhost.h  |  4 
 lib/librte_vhost/vhost_user.c | 31 +-
 lib/librte_vhost/virtio_net.c | 52 ++-
 4 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..cb31cdd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -119,10 +119,26 @@ cleanup_device(struct virtio_net *dev, int destroy)
 static void
 free_device(struct virtio_net *dev)
 {
+   struct vhost_virtqueue *vq_0;
+   struct vhost_virtqueue *vq_1;
uint32_t i;

-   for (i = 0; i < dev->virt_qp_nb; i++)
-   rte_free(dev->virtqueue[i * VIRTIO_QNUM]);
+   for (i = 0; i < dev->virt_qp_nb; i++) {
+   vq_0 = dev->virtqueue[i * VIRTIO_QNUM];
+   if (vq_0->shadow_used_ring) {
+   rte_free(vq_0->shadow_used_ring);
+   vq_0->shadow_used_ring = NULL;
+   }
+
+   vq_1 = dev->virtqueue[i * VIRTIO_QNUM + 1];
+   if (vq_1->shadow_used_ring) {
+   rte_free(vq_1->shadow_used_ring);
+   vq_1->shadow_used_ring = NULL;
+   }
+
+   /* malloc together, free together */
+   rte_free(vq_0);
+   }

rte_free(dev);
 }
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 9707dfc..381dc27 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -85,6 +85,10 @@ struct vhost_virtqueue {

/* Physical address of used ring, for logging */
uint64_tlog_guest_addr;
+
+   /* Shadow used ring for performance */
+   struct vring_used_elem  *shadow_used_ring;
+   uint32_tshadow_used_idx;
 } __rte_cache_aligned;

 /* Old kernels have no such macro defined */
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index eee99e9..d7cf1ed 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -193,7 +193,21 @@ static int
 vhost_user_set_vring_num(struct virtio_net *dev,
 struct vhost_vring_state *state)
 {
-   dev->virtqueue[state->index]->size = state->num;
+   struct vhost_virtqueue *vq;
+
+   vq = dev->virtqueue[state->index];
+   vq->size = state->num;
+   if (!vq->shadow_used_ring) {
+   vq->shadow_used_ring = rte_malloc(NULL,
+   vq->size * sizeof(struct vring_used_elem),
+   RTE_CACHE_LINE_SIZE);
+   if (!vq->shadow_used_ring) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "Failed to allocate memory"
+   " for shadow used ring.\n");
+   return -1;
+   }
+   }

return 0;
 }
@@ -611,14 +625,21 @@ static int
 vhost_user_get_vring_base(struct virtio_net *dev,
  struct vhost_vring_state *state)
 {
+   struct vhost_virtqueue *vq;
+
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
dev->flags &= ~VIRTIO_DEV_RUNNING;
notify_ops->destroy_device(dev->vid);
}

+   vq = dev->virtqueue[state->index];
/* Here we are safe to get the last used index */
-   state->num = dev->virtqueue[state->index]->last_used_idx;
+   state->num = vq->last_used_idx;
+   if (vq->shadow_used_ring) {
+   rte_free(vq->shadow_used_ring);
+   vq->shadow_used_ring = NULL;
+   }

RTE_LOG(INFO, VHOST_CONFIG,
"vring base idx:%d file:%d\n", state->index, state->num);
@@ -627,10 +648,10 @@ vhost_user_get_vring_base(struct virtio_net *dev,
 * sent and only sent in vhost_vring_stop.
 * TODO: cleanup the vring, it isn't usable since here.
 */
-   if (dev->virtqueue[state->index]->kickfd >= 0)
-   close(dev->virtqueue[state->index]->kickfd);
+   if (vq->kickfd >= 0)
+   close(vq->kickfd);

-   dev->virtqueue[state->index]->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+   vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;

return 0;
 }
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index f32a143..8f2882b 100644
--- a/lib/librt

[dpdk-dev] [PATCH v6 6/6] vhost: optimize cache access

2016-09-19 Thread Zhihong Wang
This patch reorders the code to delay the virtio header write to improve
cache access efficiency for cases where the mrg_rxbuf feature is turned
on. CPU pipeline stall cycles can be significantly reduced.

The virtio header write and the mbuf data copy are both remote store
operations which take a long time to finish. It's a good idea to put them
together to remove bubbles in between, to let as many remote store
instructions as possible go into the store buffer at the same time to hide
latency, and to let the H/W prefetcher go to work as early as possible.

On a Haswell machine, about 100 cycles can be saved per packet by this
patch alone. Taking 64B packet traffic as an example, this means about 60%
efficiency improvement for the enqueue operation.

Signed-off-by: Zhihong Wang 
---
Changes in v3:

 1. Remove unnecessary memset which causes frontend stall on SNB & IVB.

 2. Rename variables to follow naming convention.

 lib/librte_vhost/virtio_net.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8f2882b..11a2c1a 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -185,6 +185,7 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
uint32_t mbuf_len;
uint32_t mbuf_avail;
uint32_t cpy_len;
+   uint32_t copy_virtio_hdr;
uint32_t num_buffers = 0;

/* start with the first mbuf of the packet */
@@ -199,12 +200,12 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
if (unlikely(!desc_addr))
goto error;

-   /* handle virtio header */
+   /*
+* handle virtio header, the actual write operation is delayed
+* for cache optimization, to reduce CPU pipeline stall cycles.
+*/
virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
-   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
-   if (is_mrg_rxbuf)
-   virtio_hdr->num_buffers = 1;
-
+   copy_virtio_hdr = 1;
vhost_log_write(dev, desc->addr, dev->vhost_hlen);
PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
desc_offset = dev->vhost_hlen;
@@ -249,8 +250,15 @@ enqueue_packet(struct virtio_net *dev, struct 
vhost_virtqueue *vq,
goto error;
}

-   /* copy mbuf data */
+   /* copy virtio header and mbuf data */
cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+   if (copy_virtio_hdr) {
+   copy_virtio_hdr = 0;
+   virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+   if (is_mrg_rxbuf)
+   virtio_hdr->num_buffers = num_buffers + 1;
+   }
+
rte_memcpy((void *)(uintptr_t)desc_addr,
rte_pktmbuf_mtod_offset(mbuf, void *,
mbuf_len - mbuf_avail),
-- 
2.7.4



[dpdk-dev] [PATCH] config: make AVX and AVX512 configurable

2017-04-26 Thread Zhihong Wang
Making AVX and AVX512 configurable is useful for performance and power
testing.

A similar kernel patch is at https://patchwork.kernel.org/patch/9618883/.

Signed-off-by: Zhihong Wang 
---
 config/common_base | 6 ++
 mk/rte.cpuflags.mk | 6 ++
 2 files changed, 12 insertions(+)

diff --git a/config/common_base b/config/common_base
index 0b4297c..9ca94a8 100644
--- a/config/common_base
+++ b/config/common_base
@@ -103,6 +103,12 @@ CONFIG_RTE_EAL_IGB_UIO=n
 CONFIG_RTE_EAL_VFIO=n
 CONFIG_RTE_MALLOC_DEBUG=n
 
+#
+# Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing
+#
+CONFIG_RTE_ENABLE_AVX=y
+CONFIG_RTE_ENABLE_AVX512=n
+
 # Default driver path (or "" to disable)
 CONFIG_RTE_EAL_PMD_PATH=""
 
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index e634abc..4288c14 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -70,8 +70,10 @@ CPUFLAGS += PCLMULQDQ
 endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__AVX__),)
+ifeq ($(CONFIG_RTE_ENABLE_AVX),y)
 CPUFLAGS += AVX
 endif
+endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__RDRND__),)
 CPUFLAGS += RDRAND
@@ -86,12 +88,16 @@ CPUFLAGS += F16C
 endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__AVX2__),)
+ifeq ($(CONFIG_RTE_ENABLE_AVX),y)
 CPUFLAGS += AVX2
 endif
+endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),)
+ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
 CPUFLAGS += AVX512F
 endif
+endif
 
 # IBM Power CPU flags
 ifneq ($(filter $(AUTO_CPUFLAGS),__PPC64__),)
-- 
2.7.4



[dpdk-dev] [PATCH v2] config: make AVX and AVX512 configurable

2017-04-27 Thread Zhihong Wang
Making AVX and AVX512 configurable is useful for performance and power
testing.

A similar kernel patch is at https://patchwork.kernel.org/patch/9618883/.

AVX512 support, like that in rte_memcpy, has been in DPDK since 16.04, but
it's still unproven in rich use cases on real hardware. Therefore it's marked
as experimental for now and will be enabled after enough field testing and
possible optimization.

Signed-off-by: Zhihong Wang 
---
Changes in v2:

 1. Add comments and explanation.

---
 config/common_base | 8 
 mk/rte.cpuflags.mk | 6 ++
 2 files changed, 14 insertions(+)

diff --git a/config/common_base b/config/common_base
index 0b4297c..93e9235 100644
--- a/config/common_base
+++ b/config/common_base
@@ -103,6 +103,14 @@ CONFIG_RTE_EAL_IGB_UIO=n
 CONFIG_RTE_EAL_VFIO=n
 CONFIG_RTE_MALLOC_DEBUG=n
 
+#
+# Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
+# AVX512 is marked as experimental for now, will enable it after enough
+# field test and possible optimization.
+#
+CONFIG_RTE_ENABLE_AVX=y
+CONFIG_RTE_ENABLE_AVX512=n
+
 # Default driver path (or "" to disable)
 CONFIG_RTE_EAL_PMD_PATH=""
 
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index e634abc..4288c14 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -70,8 +70,10 @@ CPUFLAGS += PCLMULQDQ
 endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__AVX__),)
+ifeq ($(CONFIG_RTE_ENABLE_AVX),y)
 CPUFLAGS += AVX
 endif
+endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__RDRND__),)
 CPUFLAGS += RDRAND
@@ -86,12 +88,16 @@ CPUFLAGS += F16C
 endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__AVX2__),)
+ifeq ($(CONFIG_RTE_ENABLE_AVX),y)
 CPUFLAGS += AVX2
 endif
+endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),)
+ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
 CPUFLAGS += AVX512F
 endif
+endif
 
 # IBM Power CPU flags
 ifneq ($(filter $(AUTO_CPUFLAGS),__PPC64__),)
-- 
2.7.4



[dpdk-dev] [PATCH] vhost: support rx_queue_count

2017-05-22 Thread Zhihong Wang
This patch implements the rx_queue_count op for the vhost PMD by adding
a helper function rte_vhost_rx_queue_count in the vhost lib.

The rx_queue_count op returns the vhost RX queue avail count and helps
to understand the queue fill level.
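
For illustration only (not part of this patch), a minimal sketch of how an
application could use the new helper; the function name and threshold below
are made up for the example:

#include <rte_vhost.h>

/* Return non-zero when the vhost RX queue holds at least 'threshold'
 * descriptors made available by the guest but not yet consumed.
 */
static int
vhost_rx_queue_is_loaded(int vid, uint16_t qid, uint32_t threshold)
{
	uint32_t avail = rte_vhost_rx_queue_count(vid, qid);

	return avail >= threshold;
}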

Signed-off-by: Zhihong Wang 
---
 drivers/net/vhost/rte_eth_vhost.c  | 13 +
 lib/librte_vhost/rte_vhost.h   | 12 
 lib/librte_vhost/rte_vhost_version.map |  7 +++
 lib/librte_vhost/vhost.c   | 23 +++
 4 files changed, 55 insertions(+)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 257bf6d..e3a3fe0 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -973,6 +973,18 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
return 0;
 }
 
+static uint32_t
+eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+   struct vhost_queue *vq;
+
+   vq = dev->data->rx_queues[rx_queue_id];
+   if (!vq)
+   return 0;
+
+   return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
+}
+
 static const struct eth_dev_ops ops = {
.dev_start = eth_dev_start,
.dev_stop = eth_dev_stop,
@@ -984,6 +996,7 @@ static const struct eth_dev_ops ops = {
.rx_queue_release = eth_queue_release,
.tx_queue_release = eth_queue_release,
.tx_done_cleanup = eth_tx_done_cleanup,
+   .rx_queue_count = eth_rx_queue_count,
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 605e47c..f64ed20 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -432,6 +432,18 @@ int rte_vhost_get_mem_table(int vid, struct 
rte_vhost_memory **mem);
 int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
  struct rte_vhost_vring *vring);
 
+/**
+ * Get vhost RX queue avail count.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param qid
+ *  virtio queue index in mq case
+ * @return
+ *  num of desc available
+ */
+uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_vhost/rte_vhost_version.map 
b/lib/librte_vhost/rte_vhost_version.map
index 0785873..1e70495 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -45,3 +45,10 @@ DPDK_17.05 {
rte_vhost_log_write;
 
 } DPDK_16.07;
+
+DPDK_17.08 {
+   global:
+
+   rte_vhost_rx_queue_count;
+
+} DPDK_17.05;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 0b19d2e..140d2ae 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -475,3 +475,26 @@ rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
vhost_log_used_vring(dev, vq, offset, len);
 }
+
+uint32_t
+rte_vhost_rx_queue_count(int vid, uint16_t qid)
+{
+   struct virtio_net *dev;
+   struct vhost_virtqueue *vq;
+
+   dev = get_device(vid);
+   if (!dev)
+   return 0;
+
+   if (unlikely(qid >= dev->nr_vring || (qid & 1) == 0)) {
+   RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+   dev->vid, __func__, qid);
+   return 0;
+   }
+
+   vq = dev->virtqueue[qid];
+   if (unlikely(vq->enabled == 0))
+   return 0;
+
+   return *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx;
+}
-- 
2.7.4



[dpdk-dev] [PATCH v2] vhost: support rx_queue_count

2017-05-23 Thread Zhihong Wang
This patch implements the rx_queue_count op for the vhost PMD by adding
a helper function rte_vhost_rx_queue_count in the vhost lib.

The rx_queue_count op returns the vhost RX queue avail count and helps to
understand the queue fill level.

Signed-off-by: Zhihong Wang 
---
Changes in v2:

 1. Fixed a typo in commit log.

 drivers/net/vhost/rte_eth_vhost.c  | 13 +
 lib/librte_vhost/rte_vhost.h   | 12 
 lib/librte_vhost/rte_vhost_version.map |  7 +++
 lib/librte_vhost/vhost.c   | 23 +++
 4 files changed, 55 insertions(+)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 257bf6d..e3a3fe0 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -973,6 +973,18 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
return 0;
 }
 
+static uint32_t
+eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+   struct vhost_queue *vq;
+
+   vq = dev->data->rx_queues[rx_queue_id];
+   if (!vq)
+   return 0;
+
+   return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
+}
+
 static const struct eth_dev_ops ops = {
.dev_start = eth_dev_start,
.dev_stop = eth_dev_stop,
@@ -984,6 +996,7 @@ static const struct eth_dev_ops ops = {
.rx_queue_release = eth_queue_release,
.tx_queue_release = eth_queue_release,
.tx_done_cleanup = eth_tx_done_cleanup,
+   .rx_queue_count = eth_rx_queue_count,
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 605e47c..f64ed20 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -432,6 +432,18 @@ int rte_vhost_get_mem_table(int vid, struct 
rte_vhost_memory **mem);
 int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
  struct rte_vhost_vring *vring);
 
+/**
+ * Get vhost RX queue avail count.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param qid
+ *  virtio queue index in mq case
+ * @return
+ *  num of desc available
+ */
+uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_vhost/rte_vhost_version.map 
b/lib/librte_vhost/rte_vhost_version.map
index 0785873..1e70495 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -45,3 +45,10 @@ DPDK_17.05 {
rte_vhost_log_write;
 
 } DPDK_16.07;
+
+DPDK_17.08 {
+   global:
+
+   rte_vhost_rx_queue_count;
+
+} DPDK_17.05;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 0b19d2e..140d2ae 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -475,3 +475,26 @@ rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
vhost_log_used_vring(dev, vq, offset, len);
 }
+
+uint32_t
+rte_vhost_rx_queue_count(int vid, uint16_t qid)
+{
+   struct virtio_net *dev;
+   struct vhost_virtqueue *vq;
+
+   dev = get_device(vid);
+   if (!dev)
+   return 0;
+
+   if (unlikely(qid >= dev->nr_vring || (qid & 1) == 0)) {
+   RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+   dev->vid, __func__, qid);
+   return 0;
+   }
+
+   vq = dev->virtqueue[qid];
+   if (unlikely(vq->enabled == 0))
+   return 0;
+
+   return *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx;
+}
-- 
2.7.4



[dpdk-dev] [PATCH v3] vhost: support rx_queue_count

2017-05-25 Thread Zhihong Wang
This patch implements the rx_queue_count op for the vhost PMD by adding
a helper function rte_vhost_rx_queue_count in the vhost lib.

The rx_queue_count op returns the vhost RX queue avail count and helps to
understand the queue fill level.

Signed-off-by: Zhihong Wang 
Acked-by: Ciara Loftus 
---
Changes in v3:

 1. Added pointer check for vq and vq->avail.
 
 2. Fixed coding style.

---
Changes in v2:

 1. Fixed a typo in commit log.

 drivers/net/vhost/rte_eth_vhost.c  | 13 +
 lib/librte_vhost/rte_vhost.h   | 12 
 lib/librte_vhost/rte_vhost_version.map |  7 +++
 lib/librte_vhost/vhost.c   | 26 ++
 4 files changed, 58 insertions(+)

diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 257bf6d..ebcfb28 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -973,6 +973,18 @@ eth_link_update(struct rte_eth_dev *dev __rte_unused,
return 0;
 }
 
+static uint32_t
+eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+   struct vhost_queue *vq;
+
+   vq = dev->data->rx_queues[rx_queue_id];
+   if (vq == NULL)
+   return 0;
+
+   return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
+}
+
 static const struct eth_dev_ops ops = {
.dev_start = eth_dev_start,
.dev_stop = eth_dev_stop,
@@ -984,6 +996,7 @@ static const struct eth_dev_ops ops = {
.rx_queue_release = eth_queue_release,
.tx_queue_release = eth_queue_release,
.tx_done_cleanup = eth_tx_done_cleanup,
+   .rx_queue_count = eth_rx_queue_count,
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 605e47c..f64ed20 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -432,6 +432,18 @@ int rte_vhost_get_mem_table(int vid, struct 
rte_vhost_memory **mem);
 int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
  struct rte_vhost_vring *vring);
 
+/**
+ * Get vhost RX queue avail count.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param qid
+ *  virtio queue index in mq case
+ * @return
+ *  num of desc available
+ */
+uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_vhost/rte_vhost_version.map 
b/lib/librte_vhost/rte_vhost_version.map
index 0785873..1e70495 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -45,3 +45,10 @@ DPDK_17.05 {
rte_vhost_log_write;
 
 } DPDK_16.07;
+
+DPDK_17.08 {
+   global:
+
+   rte_vhost_rx_queue_count;
+
+} DPDK_17.05;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 0b19d2e..1b8e6bd 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -475,3 +475,29 @@ rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 
vhost_log_used_vring(dev, vq, offset, len);
 }
+
+uint32_t
+rte_vhost_rx_queue_count(int vid, uint16_t qid)
+{
+   struct virtio_net *dev;
+   struct vhost_virtqueue *vq;
+
+   dev = get_device(vid);
+   if (dev == NULL)
+   return 0;
+
+   if (unlikely(qid >= dev->nr_vring || (qid & 1) == 0)) {
+   RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
+   dev->vid, __func__, qid);
+   return 0;
+   }
+
+   vq = dev->virtqueue[qid];
+   if (vq == NULL)
+   return 0;
+
+   if (unlikely(vq->enabled == 0 || vq->avail == NULL))
+   return 0;
+
+   return *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx;
+}
-- 
2.7.4



[dpdk-dev] [PATCH] Unlink existing unused sockets at start up

2015-12-16 Thread Zhihong Wang
This patch unlinks existing unused sockets (which cause new bindings to fail,
e.g. for the vHost PMD) to ensure smooth startup.
In a lot of cases DPDK applications are terminated abnormally without proper
resource release. Therefore, DPDK libs should be able to deal with an unclean
boot environment.
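
A minimal sketch of the recovery logic (illustrative only; the actual change
is in uds_socket() below): when bind() fails, probe the existing socket file
with connect(); if nothing answers, the file is a stale leftover and can be
unlinked and rebound.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

static int
rebind_if_stale(int sockfd, const char *path)
{
	struct sockaddr_un un;

	memset(&un, 0, sizeof(un));
	un.sun_family = AF_UNIX;
	snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);

	/* If connect() fails, no live server owns the socket file. */
	if (connect(sockfd, (struct sockaddr *)&un, sizeof(un)) == -1) {
		unlink(path);
		return bind(sockfd, (struct sockaddr *)&un, sizeof(un));
	}

	/* A live server is listening: leave its socket file alone. */
	return -1;
}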

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost_user/vhost-net-user.c | 28 
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c 
b/lib/librte_vhost/vhost_user/vhost-net-user.c
index 8b7a448..eac0721 100644
--- a/lib/librte_vhost/vhost_user/vhost-net-user.c
+++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
@@ -120,18 +120,38 @@ uds_socket(const char *path)
sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
if (sockfd < 0)
return -1;
-   RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
+   RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd: %d\n", sockfd);

memset(&un, 0, sizeof(un));
un.sun_family = AF_UNIX;
snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
if (ret == -1) {
-   RTE_LOG(ERR, VHOST_CONFIG, "fail to bind fd:%d, remove file:%s 
and try again.\n",
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "bind fd: %d to file: %s failed, checking socket...\n",
sockfd, path);
-   goto err;
+   ret = connect(sockfd, (struct sockaddr *)&un, sizeof(un));
+   if (ret == -1) {
+   RTE_LOG(INFO, VHOST_CONFIG,
+   "socket: %s is inactive, rebinding after 
unlink...\n", path);
+   unlink(path);
+   ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
+   if (ret == -1) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "bind fd: %d to file: %s failed even 
after unlink\n",
+   sockfd, path);
+   goto err;
+   }
+   } else {
+   RTE_LOG(INFO, VHOST_CONFIG,
+   "socket: %s is alive, remove it and try 
again\n", path);
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "bind fd: %d to file: %s failed\n", sockfd, 
path);
+   goto err;
+   }
}
-   RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+   RTE_LOG(INFO, VHOST_CONFIG,
+   "bind fd: %d to file: %s successful\n", sockfd, path);

ret = listen(sockfd, MAX_VIRTIO_BACKLOG);
if (ret == -1)
-- 
2.5.0



[dpdk-dev] [PATCH 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-23 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, making
sure all ports are properly stopped and closed.
For virtual ports, the stop and close functions may deal with resource
cleanup, such as unlinking socket files.
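
A minimal sketch of the common pattern used in all three applications
(illustrative; each patch below adapts it to its own port bookkeeping):

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <rte_ethdev.h>

static void
sigint_handler(int signum)
{
	uint8_t portid, nb_ports;

	printf("\nSignal %d received, preparing to exit...\n", signum);
	nb_ports = rte_eth_dev_count();
	for (portid = 0; portid < nb_ports; portid++) {
		rte_eth_dev_stop(portid);   /* lets the PMD release resources */
		rte_eth_dev_close(portid);  /* e.g. unlink vhost socket files */
	}
	printf("Bye...\n");
	exit(0);
}

/* Installed early in main():
 *	signal(SIGINT, sigint_handler);
 *	signal(SIGTERM, sigint_handler);
 */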

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/testpmd.c | 23 +++
 examples/l2fwd/main.c  | 25 +
 examples/l3fwd/main.c  | 25 +
 3 files changed, 73 insertions(+)

-- 
2.5.0



[dpdk-dev] [PATCH 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-23 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/testpmd.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..c259ba3 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1573,6 +1573,7 @@ pmd_test_exit(void)
FOREACH_PORT(pt_id, ports) {
printf("Stopping port %d...", pt_id);
fflush(stdout);
+   rte_eth_dev_stop(pt_id);
rte_eth_dev_close(pt_id);
printf("done\n");
}
@@ -1984,12 +1985,34 @@ init_port(void)
ports[pid].enabled = 1;
 }

+/* When we receive a INT signal, close all ports */
+static void
+sigint_handler(__rte_unused int signum)
+{
+   unsigned portid;
+
+   printf("Preparing to exit...\n");
+   FOREACH_PORT(portid, ports) {
+   if (port_id_is_invalid(portid, ENABLED_WARN))
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+   exit(0);
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, sigint_handler);
+   signal(SIGTERM, sigint_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
-- 
2.5.0



[dpdk-dev] [PATCH 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-23 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
---
 examples/l2fwd/main.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..0594037 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -534,6 +535,27 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+/* When we receive a INT signal, close all ports */
+static void
+sigint_handler(__rte_unused int signum)
+{
+   unsigned portid, nb_ports;
+
+   printf("Preparing to exit...\n");
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+   exit(0);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -546,6 +568,9 @@ main(int argc, char **argv)
unsigned lcore_id, rx_lcore_id;
unsigned nb_ports_in_mask = 0;

+   signal(SIGINT, sigint_handler);
+   signal(SIGTERM, sigint_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
-- 
2.5.0



[dpdk-dev] [PATCH 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-23 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
---
 examples/l3fwd/main.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..aae16d2 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -2559,6 +2560,27 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+/* When we receive a INT signal, close all ports */
+static void
+sigint_handler(__rte_unused int signum)
+{
+   unsigned portid, nb_ports;
+
+   printf("Preparing to exit...\n");
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+   exit(0);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2572,6 +2594,9 @@ main(int argc, char **argv)
uint32_t n_tx_queue, nb_lcores;
uint8_t portid, nb_rx_queue, queue, socketid;

+   signal(SIGINT, sigint_handler);
+   signal(SIGTERM, sigint_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
-- 
2.5.0



[dpdk-dev] [PATCH v2 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-24 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, making
sure all ports are properly stopped and closed.
For virtual ports, the stop and close functions may deal with resource
cleanup, such as unlinking socket files.

--
Changes in v2:

1. Make sure graceful exit for all running phases

2. Make sure program exits with the right status

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/cmdline.c |  19 ++---
 app/test-pmd/testpmd.c |  38 ++---
 app/test-pmd/testpmd.h |   1 +
 examples/l2fwd/main.c  |  60 +++
 examples/l3fwd/main.c  | 110 -
 5 files changed, 196 insertions(+), 32 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v2 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-24 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
---
 app/test-pmd/cmdline.c | 19 +--
 app/test-pmd/testpmd.c | 38 --
 app/test-pmd/testpmd.h |  1 +
 3 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..4ff1739 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -90,6 +90,8 @@

 #include "testpmd.h"

+static struct cmdline *testpmd_cl;
+
 static void cmd_reconfig_device_queue(portid_t id, uint8_t dev, uint8_t queue);

 #ifdef RTE_NIC_BYPASS
@@ -9778,17 +9780,22 @@ cmdline_parse_ctx_t main_ctx[] = {
 void
 prompt(void)
 {
-   struct cmdline *cl;
-
/* initialize non-constant commands */
cmd_set_fwd_mode_init();

-   cl = cmdline_stdin_new(main_ctx, "testpmd> ");
-   if (cl == NULL) {
+   testpmd_cl = cmdline_stdin_new(main_ctx, "testpmd> ");
+   if (testpmd_cl == NULL) {
return;
}
-   cmdline_interact(cl);
-   cmdline_stdin_exit(cl);
+   cmdline_interact(testpmd_cl);
+   cmdline_stdin_exit(testpmd_cl);
+}
+
+void
+prompt_exit(void)
+{
+   if (testpmd_cl != NULL)
+   cmdline_quit(testpmd_cl);
 }

 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..cb38d56 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1570,13 +1570,16 @@ pmd_test_exit(void)
if (test_done == 0)
stop_packet_forwarding();

-   FOREACH_PORT(pt_id, ports) {
-   printf("Stopping port %d...", pt_id);
-   fflush(stdout);
-   rte_eth_dev_close(pt_id);
-   printf("done\n");
+   if (ports != NULL) {
+   FOREACH_PORT(pt_id, ports) {
+   printf("Stopping port %d...", pt_id);
+   fflush(stdout);
+   rte_eth_dev_stop(pt_id);
+   rte_eth_dev_close(pt_id);
+   printf(" Done\n");
+   }
}
-   printf("bye...\n");
+   printf("Bye...\n");
 }

 typedef void (*cmd_func_t)(void);
@@ -1984,12 +1987,34 @@ init_port(void)
ports[pid].enabled = 1;
 }

+static void
+force_quit(void)
+{
+   pmd_test_exit();
+   prompt_exit();
+}
+
+static void
+sigint_handler(__rte_unused int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit();
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, sigint_handler);
+   signal(SIGTERM, sigint_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
@@ -2041,6 +2066,7 @@ main(int argc, char** argv)
start_packet_forwarding(0);
printf("Press enter to exit\n");
rc = read(0, &c, 1);
+   pmd_test_exit();
if (rc < 0)
return 1;
}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index ee7de98..7ffc17b 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -462,6 +462,7 @@ unsigned int parse_item_list(char* str, const char* 
item_name,
unsigned int *parsed_items, int check_unique_values);
 void launch_args_parse(int argc, char** argv);
 void prompt(void);
+void prompt_exit(void);
 void nic_stats_display(portid_t port_id);
 void nic_stats_clear(portid_t port_id);
 void nic_xstats_display(portid_t port_id);
-- 
2.5.0



[dpdk-dev] [PATCH v2 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-24 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
---
 examples/l2fwd/main.c | 60 +++
 1 file changed, 60 insertions(+)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..75899dd 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 

 #include 
 #include 
@@ -69,6 +71,9 @@
 #include 
 #include 

+static int force_quit = -1;
+static int signo_quit = -1;
+
 #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1

 #define NB_MBUF   8192
@@ -284,6 +289,8 @@ l2fwd_main_loop(void)
}

while (1) {
+   if (unlikely(force_quit != 0))
+   break;

cur_tsc = rte_rdtsc();

@@ -534,6 +541,45 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+}
+
+static void
+signal_handler(__rte_unused int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (force_quit < 0) {
+   printf("Forwarding not started yet...\n");
+   /* stop ports */
+   stop_ports();
+   printf("Bye...\n");
+   /* inform if there's a caller */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   } else {
+   printf("Forwarding started already...\n");
+   signo_quit = signum;
+   force_quit = 1;
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -546,6 +592,9 @@ main(int argc, char **argv)
unsigned lcore_id, rx_lcore_id;
unsigned nb_ports_in_mask = 0;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
@@ -697,11 +746,22 @@ main(int argc, char **argv)
check_all_ports_link_status(nb_ports, l2fwd_enabled_port_mask);

/* launch per-lcore init on every lcore */
+   force_quit = 0;
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
if (rte_eal_wait_lcore(lcore_id) < 0)
return -1;
}

+   printf("Stopping forwarding... Done\n");
+   /* stop ports */
+   stop_ports();
+   printf("Bye...\n");
+   /* inform if there's a caller */
+   if (force_quit != 0) {
+   signal(signo_quit, SIG_DFL);
+   kill(getpid(), signo_quit);
+   }
+
return 0;
 }
-- 
2.5.0



[dpdk-dev] [PATCH v2 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-24 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
---
 examples/l3fwd/main.c | 110 +-
 1 file changed, 90 insertions(+), 20 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..b9f3232 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 

 #include 
 #include 
@@ -75,6 +77,9 @@
 #include 
 #include 

+static int force_quit = -1;
+static int signo_quit = -1;
+
 #define APP_LOOKUP_EXACT_MATCH  0
 #define APP_LOOKUP_LPM  1
 #define DO_RFC_1812_CHECKS
@@ -1554,6 +1559,8 @@ main_loop(__attribute__((unused)) void *dummy)
}

while (1) {
+   if (unlikely(force_quit != 0))
+   break;

cur_tsc = rte_rdtsc();

@@ -2559,6 +2566,74 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static void
+start_ports(void)
+{
+   unsigned portid, nb_ports;
+   int ret;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start: err=%d, port=%d\n",
+   ret, portid);
+   /*
+* If enabled, put device in promiscuous mode.
+* This allows IO forwarding mode to forward packets
+* to itself through 2 cross-connected  ports of the
+* target machine.
+*/
+   if (promiscuous_on)
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0) {
+   continue;
+   }
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+}
+
+static void
+signal_handler(__rte_unused int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (force_quit < 0) {
+   printf("Forwarding not started yet...\n");
+   /* stop ports */
+   stop_ports();
+   printf("Bye...\n");
+   /* inform if there's a caller */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   } else {
+   printf("Forwarding started already...\n");
+   signo_quit = signum;
+   force_quit = 1;
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2572,6 +2647,9 @@ main(int argc, char **argv)
uint32_t n_tx_queue, nb_lcores;
uint8_t portid, nb_rx_queue, queue, socketid;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
@@ -2711,34 +2789,26 @@ main(int argc, char **argv)
printf("\n");

/* start ports */
-   for (portid = 0; portid < nb_ports; portid++) {
-   if ((enabled_port_mask & (1 << portid)) == 0) {
-   continue;
-   }
-   /* Start device */
-   ret = rte_eth_dev_start(portid);
-   if (ret < 0)
-   rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, 
port=%d\n",
-   ret, portid);
-
-   /*
-* If enabled, put device in promiscuous mode.
-* This allows IO forwarding mode to forward packets
-* to itself through 2 cross-connected  ports of the
-* target machine.
-*/
-   if (promiscuous_on)
-   rte_eth_promiscuous_enable(portid);
-   }
-
+   start_ports();
check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);

/* launch per-lcore init on every lcore */
+   force_quit = 0;
rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER);
RTE_LCORE_FOREA

[dpdk-dev] [PATCH v3 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-28 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, making
sure all ports are properly stopped and closed.
For virtual ports, the stop and close functions may deal with resource
cleanup, such as unlinking socket files.

--
Changes in v3:

1. Make sure port operations are correct with regard to port status

2. Small fixes to make the code clearer

--
Changes in v2:

1. Make sure graceful exit for all running phases

2. Make sure program exits with the right status

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/cmdline.c |  20 +---
 app/test-pmd/testpmd.c |  39 ---
 app/test-pmd/testpmd.h |   1 +
 examples/l2fwd/main.c  | 123 +-
 examples/l3fwd/main.c  | 129 -
 5 files changed, 255 insertions(+), 57 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v3 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-28 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 app/test-pmd/cmdline.c | 20 +---
 app/test-pmd/testpmd.c | 39 +--
 app/test-pmd/testpmd.h |  1 +
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..6d28c1b 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -90,6 +90,8 @@

 #include "testpmd.h"

+static struct cmdline *testpmd_cl;
+
 static void cmd_reconfig_device_queue(portid_t id, uint8_t dev, uint8_t queue);

 #ifdef RTE_NIC_BYPASS
@@ -9778,17 +9780,21 @@ cmdline_parse_ctx_t main_ctx[] = {
 void
 prompt(void)
 {
-   struct cmdline *cl;
-
/* initialize non-constant commands */
cmd_set_fwd_mode_init();

-   cl = cmdline_stdin_new(main_ctx, "testpmd> ");
-   if (cl == NULL) {
+   testpmd_cl = cmdline_stdin_new(main_ctx, "testpmd> ");
+   if (testpmd_cl == NULL)
return;
-   }
-   cmdline_interact(cl);
-   cmdline_stdin_exit(cl);
+   cmdline_interact(testpmd_cl);
+   cmdline_stdin_exit(testpmd_cl);
+}
+
+void
+prompt_exit(void)
+{
+   if (testpmd_cl != NULL)
+   cmdline_quit(testpmd_cl);
 }

 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..1319917 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1570,13 +1570,16 @@ pmd_test_exit(void)
if (test_done == 0)
stop_packet_forwarding();

-   FOREACH_PORT(pt_id, ports) {
-   printf("Stopping port %d...", pt_id);
-   fflush(stdout);
-   rte_eth_dev_close(pt_id);
-   printf("done\n");
+   if (ports != NULL) {
+   no_link_check = 1;
+   FOREACH_PORT(pt_id, ports) {
+   printf("\nShutting down port %d...\n", pt_id);
+   fflush(stdout);
+   stop_port(pt_id);
+   close_port(pt_id);
+   }
}
-   printf("bye...\n");
+   printf("\nBye...\n");
 }

 typedef void (*cmd_func_t)(void);
@@ -1984,12 +1987,35 @@ init_port(void)
ports[pid].enabled = 1;
 }

+static void
+force_quit(void)
+{
+   pmd_test_exit();
+   prompt_exit();
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit();
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
@@ -2041,6 +2067,7 @@ main(int argc, char** argv)
start_packet_forwarding(0);
printf("Press enter to exit\n");
rc = read(0, &c, 1);
+   pmd_test_exit();
if (rc < 0)
return 1;
}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index ee7de98..7ffc17b 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -462,6 +462,7 @@ unsigned int parse_item_list(char* str, const char* 
item_name,
unsigned int *parsed_items, int check_unique_values);
 void launch_args_parse(int argc, char** argv);
 void prompt(void);
+void prompt_exit(void);
 void nic_stats_display(portid_t port_id);
 void nic_stats_clear(portid_t port_id);
 void nic_xstats_display(portid_t port_id);
-- 
2.5.0



[dpdk-dev] [PATCH v3 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-28 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l2fwd/main.c | 123 +-
 1 file changed, 101 insertions(+), 22 deletions(-)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..ecd5d2b 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 

 #include 
 #include 
@@ -69,6 +72,10 @@
 #include 
 #include 

+static volatile bool port_started;
+static volatile bool force_quit;
+static volatile int signo_quit;
+
 #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1

 #define NB_MBUF   8192
@@ -283,8 +290,7 @@ l2fwd_main_loop(void)
portid);
}

-   while (1) {
-
+   while (!force_quit) {
cur_tsc = rte_rdtsc();

/*
@@ -491,8 +497,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -534,18 +544,85 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static uint8_t
+start_ports(void)
+{
+   unsigned portid, nb_ports, avail_ports;
+   int ret;
+
+   nb_ports = rte_eth_dev_count();
+   avail_ports = 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   avail_ports++;
+   port_started = true;
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start:err=%d, port=%u\n",
+ ret, (unsigned) portid);
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+
+   return avail_ports;
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   port_started = false;
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (port_started) {
+   printf("Ports started already...\n");
+   signo_quit = signum;
+   force_quit = true;
+   } else {
+   printf("Ports not started yet...\n");
+   printf("Bye...\n");
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
struct lcore_queue_conf *qconf;
struct rte_eth_dev_info dev_info;
int ret;
-   uint8_t nb_ports;
-   uint8_t nb_ports_available;
+   uint8_t nb_ports, avail_ports;
uint8_t portid, last_port;
unsigned lcore_id, rx_lcore_id;
unsigned nb_ports_in_mask = 0;

+   port_started = false;
+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
@@ -627,14 +704,11 @@ main(int argc, char **argv)
printf("Lcore %u: RX port %u\n", rx_lcore_id, (unsigned) 
portid);
}

-   nb_ports_available = nb_ports;
-
/* Initialise each port */
for (portid = 0; portid < nb_ports; portid++) {
/* skip ports that are not enabled */
if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
printf("Skipping disabled port %u\n", (unsigned) 
portid);
-   nb_ports_available--;
continue;
   

[dpdk-dev] [PATCH v3 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-28 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l3fwd/main.c | 129 +-
 1 file changed, 107 insertions(+), 22 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..c766cf5 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 

 #include 
 #include 
@@ -75,6 +78,10 @@
 #include 
 #include 

+static volatile bool port_started;
+static volatile bool force_quit;
+static volatile int signo_quit;
+
 #define APP_LOOKUP_EXACT_MATCH  0
 #define APP_LOOKUP_LPM  1
 #define DO_RFC_1812_CHECKS
@@ -1553,8 +1560,7 @@ main_loop(__attribute__((unused)) void *dummy)
portid, queueid);
}

-   while (1) {
-
+   while (!force_quit) {
cur_tsc = rte_rdtsc();

/*
@@ -2516,8 +2522,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -2559,6 +2569,76 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static uint8_t
+start_ports(void)
+{
+   unsigned portid, nb_ports, avail_ports;
+   int ret;
+
+   nb_ports = rte_eth_dev_count();
+   avail_ports = 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   avail_ports++;
+   port_started = true;
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start: err=%d, port=%d\n",
+   ret, portid);
+   /*
+* If enabled, put device in promiscuous mode.
+* This allows IO forwarding mode to forward packets
+* to itself through 2 cross-connected  ports of the
+* target machine.
+*/
+   if (promiscuous_on)
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+
+   return avail_ports;
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   port_started = false;
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (port_started) {
+   printf("Ports started already...\n");
+   signo_quit = signum;
+   force_quit = true;
+   } else {
+   printf("Ports not started yet...\n");
+   printf("Bye...\n");
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2571,6 +2651,12 @@ main(int argc, char **argv)
unsigned lcore_id;
uint32_t n_tx_queue, nb_lcores;
uint8_t portid, nb_rx_queue, queue, socketid;
+   uint8_t avail_ports;
+
+   port_started = false;
+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);

/* init EAL */
ret = rte_eal_init(argc, argv);
@@ -2711,34 +2797,33 @@ main(int argc, char **argv)
printf("\n");

/* start ports */
-   for (portid = 0; portid < nb_ports; portid++) {
-   if ((enabled_port_mask & (1 << portid)) == 0) {
-   continue;
-   }
-   /* Start d

[dpdk-dev] [PATCH v4 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-29 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, making
sure all ports are properly stopped and closed.
For virtual ports, the stop and close functions may also perform resource
cleanup, such as unlinking socket files.
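
For reference, the pattern shared by the three patches below condenses into a
short sketch. This is a minimal illustration only: the cleanup helper is a
placeholder standing in for pmd_test_exit()/prompt_exit() in testpmd and for
stopping and closing the enabled ports in the examples, and the handlers are
installed with signal() at the very start of main(), before EAL init.

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Placeholder for the real cleanup: stop and close every enabled port. */
static void
cleanup_ports(void)
{
}

static void
signal_handler(int signum)
{
	if (signum == SIGINT || signum == SIGTERM) {
		printf("\nSignal %d received, preparing to exit...\n", signum);
		cleanup_ports();
		/* Re-raise with the default disposition so the process
		 * terminates with the exit status expected for the signal. */
		signal(signum, SIG_DFL);
		kill(getpid(), signum);
	}
}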

--
Changes in v4:

1. Add port status control in l2fwd and l3fwd
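
Concretely, "port status control" means guarding the start/stop/quit
transitions with an atomic compare-and-set, so a signal arriving in the middle
of port setup cannot race with it. A condensed sketch of the guard added by
the l2fwd/l3fwd patches below, with state names taken from those diffs and
error handling trimmed:

#include <stdint.h>
#include <stdio.h>
#include <rte_atomic.h>

#define PORT_IDLE 0
#define PORT_INIT 1
#define PORT_WORK 2
#define PORT_STOP 3
#define PORT_QUIT 4

static volatile uint32_t port_status = PORT_IDLE;

static int
try_start_ports(void)
{
	/* Only one winner for IDLE -> INIT; if the signal handler already
	 * moved the state to QUIT, this fails and the start is skipped. */
	if (rte_atomic32_cmpset(&port_status, PORT_IDLE, PORT_INIT) == 0) {
		printf("Ports not idle...\n");
		return -1;
	}
	/* ... rte_eth_dev_start() for each enabled port ... */
	if (rte_atomic32_cmpset(&port_status, PORT_INIT, PORT_WORK) == 0)
		printf("Set port state failed!\n");
	return 0;
}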

--
Changes in v3:

1. Make sure correct port operations regarding status

2. Small fixes to make the code clearer

--
Changes in v2:

1. Make sure graceful exit for all running phases

2. Make sure program exits with the right status

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/cmdline.c |  20 +++---
 app/test-pmd/testpmd.c |  39 ++--
 app/test-pmd/testpmd.h |   1 +
 examples/l2fwd/main.c  | 161 ---
 examples/l3fwd/main.c  | 167 ++---
 5 files changed, 331 insertions(+), 57 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v4 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-29 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 app/test-pmd/cmdline.c | 20 +---
 app/test-pmd/testpmd.c | 39 +--
 app/test-pmd/testpmd.h |  1 +
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..6d28c1b 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -90,6 +90,8 @@

 #include "testpmd.h"

+static struct cmdline *testpmd_cl;
+
 static void cmd_reconfig_device_queue(portid_t id, uint8_t dev, uint8_t queue);

 #ifdef RTE_NIC_BYPASS
@@ -9778,17 +9780,21 @@ cmdline_parse_ctx_t main_ctx[] = {
 void
 prompt(void)
 {
-   struct cmdline *cl;
-
/* initialize non-constant commands */
cmd_set_fwd_mode_init();

-   cl = cmdline_stdin_new(main_ctx, "testpmd> ");
-   if (cl == NULL) {
+   testpmd_cl = cmdline_stdin_new(main_ctx, "testpmd> ");
+   if (testpmd_cl == NULL)
return;
-   }
-   cmdline_interact(cl);
-   cmdline_stdin_exit(cl);
+   cmdline_interact(testpmd_cl);
+   cmdline_stdin_exit(testpmd_cl);
+}
+
+void
+prompt_exit(void)
+{
+   if (testpmd_cl != NULL)
+   cmdline_quit(testpmd_cl);
 }

 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..1319917 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1570,13 +1570,16 @@ pmd_test_exit(void)
if (test_done == 0)
stop_packet_forwarding();

-   FOREACH_PORT(pt_id, ports) {
-   printf("Stopping port %d...", pt_id);
-   fflush(stdout);
-   rte_eth_dev_close(pt_id);
-   printf("done\n");
+   if (ports != NULL) {
+   no_link_check = 1;
+   FOREACH_PORT(pt_id, ports) {
+   printf("\nShutting down port %d...\n", pt_id);
+   fflush(stdout);
+   stop_port(pt_id);
+   close_port(pt_id);
+   }
}
-   printf("bye...\n");
+   printf("\nBye...\n");
 }

 typedef void (*cmd_func_t)(void);
@@ -1984,12 +1987,35 @@ init_port(void)
ports[pid].enabled = 1;
 }

+static void
+force_quit(void)
+{
+   pmd_test_exit();
+   prompt_exit();
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit();
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
@@ -2041,6 +2067,7 @@ main(int argc, char** argv)
start_packet_forwarding(0);
printf("Press enter to exit\n");
rc = read(0, &c, 1);
+   pmd_test_exit();
if (rc < 0)
return 1;
}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index ee7de98..7ffc17b 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -462,6 +462,7 @@ unsigned int parse_item_list(char* str, const char* 
item_name,
unsigned int *parsed_items, int check_unique_values);
 void launch_args_parse(int argc, char** argv);
 void prompt(void);
+void prompt_exit(void);
 void nic_stats_display(portid_t port_id);
 void nic_stats_clear(portid_t port_id);
 void nic_xstats_display(portid_t port_id);
-- 
2.5.0



[dpdk-dev] [PATCH v4 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-29 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l2fwd/main.c | 161 +++---
 1 file changed, 139 insertions(+), 22 deletions(-)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..9a6f80b 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 

 #include 
 #include 
@@ -69,6 +72,16 @@
 #include 
 #include 

+#define PORT_IDLE 0
+#define PORT_INIT 1
+#define PORT_WORK 2
+#define PORT_STOP 3
+#define PORT_QUIT 4
+
+static volatile uint32_t port_status;
+static volatile bool force_quit;
+static volatile int signo_quit;
+
 #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1

 #define NB_MBUF   8192
@@ -283,8 +296,7 @@ l2fwd_main_loop(void)
portid);
}

-   while (1) {
-
+   while (!force_quit) {
cur_tsc = rte_rdtsc();

/*
@@ -491,8 +503,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -534,18 +550,110 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static uint8_t
+start_ports(void)
+{
+   unsigned portid, nb_ports, avail_ports;
+   int ret;
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_IDLE, PORT_INIT) == 0) {
+   printf("Ports not idle...\n");
+   return 0;
+   }
+
+   nb_ports = rte_eth_dev_count();
+   avail_ports = 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   avail_ports++;
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start:err=%d, port=%u\n",
+   ret, (unsigned) portid);
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+
+   if (avail_ports) {
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_INIT, PORT_WORK) == 0)
+   printf("Set port state failed!\n");
+   } else {
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_INIT, PORT_IDLE) == 0)
+   printf("Set port state failed!\n");
+   }
+
+   return avail_ports;
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_WORK, PORT_STOP) == 0) {
+   printf("Ports not started...\n");
+   return;
+   }
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_STOP, PORT_IDLE) == 0)
+   printf("Set port state failed!\n");
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_IDLE, PORT_QUIT) == 0) {
+   printf("Ports started already...\n");
+   signo_quit = signum;
+   force_quit = true;
+   } else {
+   printf("Ports not started yet...\n");
+   printf("Bye...\n");
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+   }
+}
+

[dpdk-dev] [PATCH v4 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-29 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l3fwd/main.c | 167 +++---
 1 file changed, 145 insertions(+), 22 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..f73d2a4 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 

 #include 
 #include 
@@ -75,6 +78,16 @@
 #include 
 #include 

+#define PORT_IDLE 0
+#define PORT_INIT 1
+#define PORT_WORK 2
+#define PORT_STOP 3
+#define PORT_QUIT 4
+
+static volatile uint32_t port_status;
+static volatile bool force_quit;
+static volatile int signo_quit;
+
 #define APP_LOOKUP_EXACT_MATCH  0
 #define APP_LOOKUP_LPM  1
 #define DO_RFC_1812_CHECKS
@@ -1553,8 +1566,7 @@ main_loop(__attribute__((unused)) void *dummy)
portid, queueid);
}

-   while (1) {
-
+   while (!force_quit) {
cur_tsc = rte_rdtsc();

/*
@@ -2516,8 +2528,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -2559,6 +2575,101 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static uint8_t
+start_ports(void)
+{
+   unsigned portid, nb_ports, avail_ports;
+   int ret;
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_IDLE, PORT_INIT) == 0) {
+   printf("Ports not idle...\n");
+   return 0;
+   }
+
+   nb_ports = rte_eth_dev_count();
+   avail_ports = 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   avail_ports++;
+   printf("Starting port %d...", portid);
+   ret = rte_eth_dev_start(portid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_eth_dev_start: err=%d, port=%d\n",
+   ret, portid);
+   /*
+* If enabled, put device in promiscuous mode.
+* This allows IO forwarding mode to forward packets
+* to itself through 2 cross-connected  ports of the
+* target machine.
+*/
+   if (promiscuous_on)
+   rte_eth_promiscuous_enable(portid);
+   printf(" Done\n");
+   }
+
+   if (avail_ports) {
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_INIT, PORT_WORK) == 0)
+   printf("Set port state failed!\n");
+   } else {
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_INIT, PORT_IDLE) == 0)
+   printf("Set port state failed!\n");
+   }
+
+   return avail_ports;
+}
+
+static void
+stop_ports(void)
+{
+   unsigned portid, nb_ports;
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_WORK, PORT_STOP) == 0) {
+   printf("Ports not started...\n");
+   return;
+   }
+
+   nb_ports = rte_eth_dev_count();
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Stopping port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_STOP, PORT_IDLE) == 0)
+   printf("Set port state failed!\n");
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   if (rte_atomic32_cmpset(&port_status,
+   PORT_IDLE, PORT_QUIT) == 0) {
+   printf("Ports started already...\n");
+   signo_quit = signum;
+  

[dpdk-dev] [PATCH v5 0/3] Handle SIGINT and SIGTERM in DPDK examples

2015-12-30 Thread Zhihong Wang
This patch handles SIGINT and SIGTERM in testpmd, l2fwd, and l3fwd, making
sure all ports are properly stopped and closed.
For virtual ports, the stop and close functions may also perform resource
cleanup, such as unlinking socket files.

--
Changes in v5:

1. Get rid of over complicated logic in l2fwd and l3fwd
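
The simplified v5 approach reduces to a single volatile flag: the handler only
records the request, the forwarding loops poll it, and all port stop/close
work happens in main() after the lcores return. A minimal, self-contained
sketch of that shape; the usleep() loop stands in for the real per-lcore
forwarding loop:

#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static volatile bool force_quit;

static void
signal_handler(int signum)
{
	if (signum == SIGINT || signum == SIGTERM) {
		printf("\n\nSignal %d received, preparing to exit...\n", signum);
		force_quit = true;	/* polled by the forwarding loops */
	}
}

int
main(void)
{
	force_quit = false;
	signal(SIGINT, signal_handler);
	signal(SIGTERM, signal_handler);

	while (!force_quit)	/* stand-in for the l2fwd/l3fwd main loop */
		usleep(1000);

	/* In the real patches, rte_eth_dev_stop() and rte_eth_dev_close()
	 * for every enabled port happen here, in the main thread. */
	printf("Bye...\n");
	return 0;
}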

--
Changes in v4:

1. Add port status control in l2fwd and l3fwd

--
Changes in v3:

1. Make sure correct port operations regarding status

2. Small fixes to make the code clearer

--
Changes in v2:

1. Make sure graceful exit for all running phases

2. Make sure program exits with the right status

Zhihong Wang (3):
  app/test-pmd: Handle SIGINT and SIGTERM in testpmd
  examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd
  examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

 app/test-pmd/cmdline.c | 20 +---
 app/test-pmd/testpmd.c | 39 +--
 app/test-pmd/testpmd.h |  1 +
 examples/l2fwd/main.c  | 43 +++
 examples/l3fwd/main.c  | 46 ++
 5 files changed, 128 insertions(+), 21 deletions(-)

-- 
2.5.0



[dpdk-dev] [PATCH v5 1/3] app/test-pmd: Handle SIGINT and SIGTERM in testpmd

2015-12-30 Thread Zhihong Wang
Handle SIGINT and SIGTERM in testpmd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 app/test-pmd/cmdline.c | 20 +---
 app/test-pmd/testpmd.c | 39 +--
 app/test-pmd/testpmd.h |  1 +
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index 73298c9..6d28c1b 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -90,6 +90,8 @@

 #include "testpmd.h"

+static struct cmdline *testpmd_cl;
+
 static void cmd_reconfig_device_queue(portid_t id, uint8_t dev, uint8_t queue);

 #ifdef RTE_NIC_BYPASS
@@ -9778,17 +9780,21 @@ cmdline_parse_ctx_t main_ctx[] = {
 void
 prompt(void)
 {
-   struct cmdline *cl;
-
/* initialize non-constant commands */
cmd_set_fwd_mode_init();

-   cl = cmdline_stdin_new(main_ctx, "testpmd> ");
-   if (cl == NULL) {
+   testpmd_cl = cmdline_stdin_new(main_ctx, "testpmd> ");
+   if (testpmd_cl == NULL)
return;
-   }
-   cmdline_interact(cl);
-   cmdline_stdin_exit(cl);
+   cmdline_interact(testpmd_cl);
+   cmdline_stdin_exit(testpmd_cl);
+}
+
+void
+prompt_exit(void)
+{
+   if (testpmd_cl != NULL)
+   cmdline_quit(testpmd_cl);
 }

 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 98ae46d..1319917 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1570,13 +1570,16 @@ pmd_test_exit(void)
if (test_done == 0)
stop_packet_forwarding();

-   FOREACH_PORT(pt_id, ports) {
-   printf("Stopping port %d...", pt_id);
-   fflush(stdout);
-   rte_eth_dev_close(pt_id);
-   printf("done\n");
+   if (ports != NULL) {
+   no_link_check = 1;
+   FOREACH_PORT(pt_id, ports) {
+   printf("\nShutting down port %d...\n", pt_id);
+   fflush(stdout);
+   stop_port(pt_id);
+   close_port(pt_id);
+   }
}
-   printf("bye...\n");
+   printf("\nBye...\n");
 }

 typedef void (*cmd_func_t)(void);
@@ -1984,12 +1987,35 @@ init_port(void)
ports[pid].enabled = 1;
 }

+static void
+force_quit(void)
+{
+   pmd_test_exit();
+   prompt_exit();
+}
+
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit();
+   /* exit with the expected status */
+   signal(signum, SIG_DFL);
+   kill(getpid(), signum);
+   }
+}
+
 int
 main(int argc, char** argv)
 {
int  diag;
uint8_t port_id;

+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
diag = rte_eal_init(argc, argv);
if (diag < 0)
rte_panic("Cannot init EAL\n");
@@ -2041,6 +2067,7 @@ main(int argc, char** argv)
start_packet_forwarding(0);
printf("Press enter to exit\n");
rc = read(0, &c, 1);
+   pmd_test_exit();
if (rc < 0)
return 1;
}
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index ee7de98..7ffc17b 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -462,6 +462,7 @@ unsigned int parse_item_list(char* str, const char* 
item_name,
unsigned int *parsed_items, int check_unique_values);
 void launch_args_parse(int argc, char** argv);
 void prompt(void);
+void prompt_exit(void);
 void nic_stats_display(portid_t port_id);
 void nic_stats_clear(portid_t port_id);
 void nic_xstats_display(portid_t port_id);
-- 
2.5.0



[dpdk-dev] [PATCH v5 2/3] examples/l2fwd: Handle SIGINT and SIGTERM in l2fwd

2015-12-30 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l2fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l2fwd/main.c | 43 +++
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 720fd5a..f35d8a1 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -44,6 +44,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 

 #include 
 #include 
@@ -69,6 +71,8 @@
 #include 
 #include 

+static volatile bool force_quit;
+
 #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1

 #define NB_MBUF   8192
@@ -283,7 +287,7 @@ l2fwd_main_loop(void)
portid);
}

-   while (1) {
+   while (!force_quit) {

cur_tsc = rte_rdtsc();

@@ -491,8 +495,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -534,6 +542,16 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\n\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit = true;
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -553,6 +571,10 @@ main(int argc, char **argv)
argc -= ret;
argv += ret;

+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* parse application arguments (after the EAL ones) */
ret = l2fwd_parse_args(argc, argv);
if (ret < 0)
@@ -696,12 +718,25 @@ main(int argc, char **argv)

check_all_ports_link_status(nb_ports, l2fwd_enabled_port_mask);

+   ret = 0;
/* launch per-lcore init on every lcore */
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
-   if (rte_eal_wait_lcore(lcore_id) < 0)
-   return -1;
+   if (rte_eal_wait_lcore(lcore_id) < 0) {
+   ret = -1;
+   break;
+   }
}

-   return 0;
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Closing port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+
+   return ret;
 }
-- 
2.5.0



[dpdk-dev] [PATCH v5 3/3] examples/l3fwd: Handle SIGINT and SIGTERM in l3fwd

2015-12-30 Thread Zhihong Wang
Handle SIGINT and SIGTERM in l3fwd.

Signed-off-by: Zhihong Wang 
Acked-by: Michael Qiu 
---
 examples/l3fwd/main.c | 46 ++
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 5b0c2dd..21a5782 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -41,6 +41,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 

 #include 
 #include 
@@ -75,6 +77,8 @@
 #include 
 #include 

+static volatile bool force_quit;
+
 #define APP_LOOKUP_EXACT_MATCH  0
 #define APP_LOOKUP_LPM  1
 #define DO_RFC_1812_CHECKS
@@ -1553,7 +1557,7 @@ main_loop(__attribute__((unused)) void *dummy)
portid, queueid);
}

-   while (1) {
+   while (!force_quit) {

cur_tsc = rte_rdtsc();

@@ -1781,6 +1785,8 @@ main_loop(__attribute__((unused)) void *dummy)

}
}
+
+   return 0;
 }

 static int
@@ -2516,8 +2522,12 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
printf("\nChecking link status");
fflush(stdout);
for (count = 0; count <= MAX_CHECK_TIME; count++) {
+   if (force_quit)
+   return;
all_ports_up = 1;
for (portid = 0; portid < port_num; portid++) {
+   if (force_quit)
+   return;
if ((port_mask & (1 << portid)) == 0)
continue;
memset(&link, 0, sizeof(link));
@@ -2559,6 +2569,16 @@ check_all_ports_link_status(uint8_t port_num, uint32_t 
port_mask)
}
 }

+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM) {
+   printf("\n\nSignal %d received, preparing to exit...\n",
+   signum);
+   force_quit = true;
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2579,6 +2599,10 @@ main(int argc, char **argv)
argc -= ret;
argv += ret;

+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
/* pre-init dst MACs for all ports to 02:00:00:00:00:xx */
for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++) {
dest_eth_addr[portid] = ETHER_LOCAL_ADMIN_ADDR + 
((uint64_t)portid << 40);
@@ -2733,12 +2757,26 @@ main(int argc, char **argv)

check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask);

+   ret = 0;
/* launch per-lcore init on every lcore */
rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER);
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
-   if (rte_eal_wait_lcore(lcore_id) < 0)
-   return -1;
+   if (rte_eal_wait_lcore(lcore_id) < 0) {
+   ret = -1;
+   break;
+   }
}

-   return 0;
+   /* stop ports */
+   for (portid = 0; portid < nb_ports; portid++) {
+   if ((enabled_port_mask & (1 << portid)) == 0)
+   continue;
+   printf("Closing port %d...", portid);
+   rte_eth_dev_stop(portid);
+   rte_eth_dev_close(portid);
+   printf(" Done\n");
+   }
+   printf("Bye...\n");
+
+   return ret;
 }
-- 
2.5.0



[dpdk-dev] [PATCH] doc: add doc for vdpa

2018-05-25 Thread Zhihong Wang
Signed-off-by: Zhihong Wang 
---
 doc/guides/prog_guide/vhost_lib.rst | 59 +
 1 file changed, 59 insertions(+)

diff --git a/doc/guides/prog_guide/vhost_lib.rst 
b/doc/guides/prog_guide/vhost_lib.rst
index 92dcdb587..77af4d775 100644
--- a/doc/guides/prog_guide/vhost_lib.rst
+++ b/doc/guides/prog_guide/vhost_lib.rst
@@ -274,3 +274,62 @@ Vhost supported vSwitch reference
 
 For more vhost details and how to support vhost in vSwitch, please refer to
 the vhost example in the DPDK Sample Applications Guide.
+
+Vhost data path acceleration (vDPA)
+---
+
+vDPA supports selective datapath in the vhost-user lib by enabling virtio
+ring compatible devices to serve the virtio driver directly for datapath
+acceleration.
+
+``rte_vhost_driver_attach_vdpa_device`` is used to configure the vhost device
+with an accelerated backend.
+
+Vhost device capabilities are also made configurable to accommodate various
+devices. Such capabilities include supported features, protocol features,
+and queue number.
+
+Finally, a set of device ops is defined for device specific operations:
+
+* ``get_queue_num``
+
+  Called to get supported queue number of the device.
+
+* ``get_features``
+
+  Called to get supported features of the device.
+
+* ``get_protocol_features``
+
+  Called to get supported protocol features of the device.
+
+* ``dev_conf``
+
+  Called to configure the actual device when the virtio device becomes ready.
+
+* ``dev_close``
+
+  Called to close the actual device when the virtio device is stopped.
+
+* ``set_vring_state``
+
+  Called to change the state of the vring in the actual device when vring state
+  changes.
+
+* ``set_features``
+
+  Called to set the negotiated features to the device.
+
+* ``migration_done``
+
+  Called to allow the device to respond to RARP sending.
+
+* ``get_vfio_group_fd``
+
+  Called to get the VFIO group fd of the device.
+
+* ``get_vfio_device_fd``
+
+  Called to get the VFIO device fd of the device.
+
+* ``get_notify_area``
+
+  Called to get the notify area info of the queue.
-- 
2.13.6
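
To illustrate how a vDPA driver plugs into the callbacks listed in the
documentation above, here is a hypothetical skeleton. The callback names come
from the doc text; the function signatures, the ops struct layout, and the
header are assumptions for illustration only and should be checked against
rte_vdpa.h of the target DPDK release.

#include <rte_vdpa.h>	/* assumed header for the vDPA ops definitions */

/* Hypothetical callbacks; the signatures are assumptions, not the exact API. */
static int
my_dev_conf(int vid)
{
	/* Program the hardware datapath when the virtio device is ready. */
	(void)vid;
	return 0;
}

static int
my_dev_close(int vid)
{
	/* Tear down the hardware datapath when the virtio device stops. */
	(void)vid;
	return 0;
}

static struct rte_vdpa_dev_ops my_vdpa_dev_ops = {
	.dev_conf  = my_dev_conf,
	.dev_close = my_dev_close,
	/* .set_vring_state, .set_features, .migration_done,
	 * .get_vfio_group_fd, .get_vfio_device_fd, .get_notify_area,
	 * .get_queue_num, .get_features, .get_protocol_features
	 * would be filled in the same way. */
};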



[dpdk-dev] [PATCH] doc: update release notes for vhost interrupt mode

2018-05-25 Thread Zhihong Wang
Signed-off-by: Zhihong Wang 
---
 doc/guides/rel_notes/release_18_05.rst | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/doc/guides/rel_notes/release_18_05.rst 
b/doc/guides/rel_notes/release_18_05.rst
index 40eec3a49..eda6cf411 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -257,6 +257,12 @@ New Features
   on eth devices (right now only via SW RX/TX callbacks).
   It also adds dependency on libelf.
 
+* **Added support for vhost dequeue interrupt mode.**
+
+  Added support for vhost dequeue interrupt mode to release cpus to others when
+  no data to transmit. Applications could register an epoll event fd to 
associate
+  Rx queues with interrupt vectors.
+
 
 API Changes
 ---
-- 
2.13.6
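
As a rough usage sketch, not part of this patch: an application using the
vhost PMD could combine the generic ethdev Rx interrupt control API with
EAL's epoll wrapper to sleep until the backend kicks the queue. The port and
queue ids are placeholders, and the exact set of headers may differ between
DPDK releases.

#include <rte_ethdev.h>
#include <rte_interrupts.h>

static void
wait_for_rx(uint16_t port_id, uint16_t queue_id)
{
	struct rte_epoll_event event;

	/* Bind the queue's interrupt vector to this thread's epoll fd. */
	rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
				  RTE_INTR_EVENT_ADD, NULL);
	rte_eth_dev_rx_intr_enable(port_id, queue_id);

	/* Sleep until the vhost backend signals available descriptors. */
	rte_epoll_wait(RTE_EPOLL_PER_THREAD, &event, 1, -1);

	rte_eth_dev_rx_intr_disable(port_id, queue_id);
	/* ... resume rte_eth_rx_burst() polling here ... */
}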



[dpdk-dev] [PATCH] doc: update release notes for vdpa

2018-05-25 Thread Zhihong Wang
Signed-off-by: Zhihong Wang 
---
 doc/guides/rel_notes/release_18_05.rst | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/doc/guides/rel_notes/release_18_05.rst 
b/doc/guides/rel_notes/release_18_05.rst
index 40eec3a49..44180bb44 100644
--- a/doc/guides/rel_notes/release_18_05.rst
+++ b/doc/guides/rel_notes/release_18_05.rst
@@ -257,6 +257,12 @@ New Features
   on eth devices (right now only via SW RX/TX callbacks).
   It also adds dependency on libelf.
 
+* **Added vDPA in vhost-user lib.**
+
+  Added support for selective datapath in the vhost-user lib. vDPA stands for
+  vhost Data Path Acceleration. It allows virtio ring compatible devices to
+  serve the virtio driver directly for datapath acceleration.
+
 
 API Changes
 ---
-- 
2.13.6



[dpdk-dev] [PATCH v2 0/4] DPDK memcpy optimization

2015-01-29 Thread Zhihong Wang
This patch set optimizes memcpy for DPDK for both SSE and AVX platforms.
It also extends memcpy test coverage with unaligned cases and more test points.

Optimization techniques are summarized below:

1. Utilize full cache bandwidth

2. Enforce aligned stores

3. Apply load address alignment based on architecture features

4. Make load/store address available as early as possible

5. General optimization techniques like inlining, branch reduction, and prefetch pattern access
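
To make technique 2 (enforce aligned stores) concrete, here is a toy sketch
under the assumption of an SSE2-capable x86 target. It is only an
illustration of the idea and is not the code this series adds to
rte_memcpy.h: copy one unaligned 16-byte head, round the destination up to a
16-byte boundary, and let the main loop issue aligned stores while loads stay
unaligned.

#include <stdint.h>
#include <string.h>
#include <emmintrin.h>	/* SSE2 intrinsics */

static void
copy_aligned_stores(uint8_t *dst, const uint8_t *src, size_t n)
{
	size_t head;

	if (n < 16) {
		memcpy(dst, src, n);
		return;
	}
	/* Head: one unaligned 16-byte store. */
	_mm_storeu_si128((__m128i *)dst, _mm_loadu_si128((const __m128i *)src));
	/* Advance dst to the next 16-byte boundary. */
	head = 16 - ((uintptr_t)dst & 15);
	dst += head;
	src += head;
	n -= head;
	/* Main loop: aligned stores only. */
	while (n >= 16) {
		_mm_store_si128((__m128i *)dst,
				_mm_loadu_si128((const __m128i *)src));
		dst += 16;
		src += 16;
		n -= 16;
	}
	/* Tail: final unaligned 16 bytes, overlapping the last aligned store. */
	if (n)
		_mm_storeu_si128((__m128i *)(dst + n - 16),
				 _mm_loadu_si128((const __m128i *)(src + n - 16)));
}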

--
Changes in v2:

1. Reduced constant test cases in app/test/test_memcpy_perf.c for fast build

2. Modified macro definition for better code readability & safety

Zhihong Wang (4):
  app/test: Disabled VTA for memcpy test in app/test/Makefile
  app/test: Removed unnecessary test cases in app/test/test_memcpy.c
  app/test: Extended test coverage in app/test/test_memcpy_perf.c
  lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE
and AVX platforms

 app/test/Makefile  |   6 +
 app/test/test_memcpy.c |  52 +-
 app/test/test_memcpy_perf.c| 220 ---
 .../common/include/arch/x86/rte_memcpy.h   | 680 +++--
 4 files changed, 654 insertions(+), 304 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v2 3/4] app/test: Extended test coverage in app/test/test_memcpy_perf.c

2015-01-29 Thread Zhihong Wang
Main code changes:

1. Added more typical data points for a thorough performance test

2. Added unaligned test cases, since unaligned copies are common in DPDK usage

Signed-off-by: Zhihong Wang 
---
 app/test/test_memcpy_perf.c | 220 +++-
 1 file changed, 138 insertions(+), 82 deletions(-)

diff --git a/app/test/test_memcpy_perf.c b/app/test/test_memcpy_perf.c
index 7809610..754828e 100644
--- a/app/test/test_memcpy_perf.c
+++ b/app/test/test_memcpy_perf.c
@@ -54,9 +54,10 @@
 /* List of buffer sizes to test */
 #if TEST_VALUE_RANGE == 0
 static size_t buf_sizes[] = {
-   0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
-   256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
-   2048, 3072, 4096, 5120, 6144, 7168, 8192
+   1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+   129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
+   449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
+   2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
 };
 /* MUST be as large as largest packet size above */
 #define SMALL_BUFFER_SIZE   8192
@@ -78,7 +79,7 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE 100

 /* Data is aligned on this many bytes (power of 2) */
-#define ALIGNMENT_UNIT  16
+#define ALIGNMENT_UNIT  32

 /*
  * Pointers used in performance tests. The two large buffers are for uncached
@@ -94,19 +95,19 @@ init_buffers(void)
 {
unsigned i;

-   large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE, 
ALIGNMENT_UNIT);
+   large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + 
ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_read == NULL)
goto error_large_buf_read;

-   large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE, 
ALIGNMENT_UNIT);
+   large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + 
ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_write == NULL)
goto error_large_buf_write;

-   small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE, 
ALIGNMENT_UNIT);
+   small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + 
ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_read == NULL)
goto error_small_buf_read;

-   small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE, 
ALIGNMENT_UNIT);
+   small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + 
ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_write == NULL)
goto error_small_buf_write;

@@ -140,25 +141,25 @@ free_buffers(void)

 /*
  * Get a random offset into large array, with enough space needed to perform
- * max copy size. Offset is aligned.
+ * max copy size. Offset is aligned, uoffset is used for unalignment setting.
  */
 static inline size_t
-get_rand_offset(void)
+get_rand_offset(size_t uoffset)
 {
-   return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
-   ~(ALIGNMENT_UNIT - 1));
+   return (((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
+   ~(ALIGNMENT_UNIT - 1)) + uoffset);
 }

 /* Fill in source and destination addresses. */
 static inline void
-fill_addr_arrays(size_t *dst_addr, int is_dst_cached,
-   size_t *src_addr, int is_src_cached)
+fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset,
+size_t *src_addr, int is_src_cached, size_t 
src_uoffset)
 {
unsigned int i;

for (i = 0; i < TEST_BATCH_SIZE; i++) {
-   dst_addr[i] = (is_dst_cached) ? 0 : get_rand_offset();
-   src_addr[i] = (is_src_cached) ? 0 : get_rand_offset();
+   dst_addr[i] = (is_dst_cached) ? dst_uoffset : 
get_rand_offset(dst_uoffset);
+   src_addr[i] = (is_src_cached) ? src_uoffset : 
get_rand_offset(src_uoffset);
}
 }

@@ -169,16 +170,17 @@ fill_addr_arrays(size_t *dst_addr, int is_dst_cached,
  */
 static void
 do_uncached_write(uint8_t *dst, int is_dst_cached,
-   const uint8_t *src, int is_src_cached, size_t size)
+ const uint8_t *src, int is_src_cached, size_t 
size)
 {
unsigned i, j;
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];

for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
-   fill_addr_arrays(dst_addrs, is_dst_cached,
-src_addrs, is_src_cached);
-   for (j = 0; j < TEST_BATCH_SIZE; j++)
+   fill_addr_arrays(dst_addrs, is_dst_cached, 0,
+src_addrs, is_src_cached, 0);
+   for (j = 0; j < TEST_BATCH_SIZE; j++) {

[dpdk-dev] [PATCH v2 1/4] app/test: Disabled VTA for memcpy test in app/test/Makefile

2015-01-29 Thread Zhihong Wang
VTA (variable tracking assignments) is for debugging only; it increases
compile time and binary size, especially when there are a lot of inlines.
Disable it for the memcpy tests, since they contain a lot of inline calls.

Signed-off-by: Zhihong Wang 
---
 app/test/Makefile | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/app/test/Makefile b/app/test/Makefile
index 4311f96..94dbadf 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -143,6 +143,12 @@ CFLAGS_test_kni.o += -Wno-deprecated-declarations
 endif
 CFLAGS += -D_GNU_SOURCE

+# Disable VTA for memcpy test
+ifeq ($(CC), gcc)
+CFLAGS_test_memcpy.o += -fno-var-tracking-assignments
+CFLAGS_test_memcpy_perf.o += -fno-var-tracking-assignments
+endif
+
 # this application needs libraries first
 DEPDIRS-y += lib

-- 
1.9.3



[dpdk-dev] [PATCH v2 2/4] app/test: Removed unnecessary test cases in app/test/test_memcpy.c

2015-01-29 Thread Zhihong Wang
Removed unnecessary test cases for base move functions since the function 
"func_test" covers them all.

Signed-off-by: Zhihong Wang 
---
 app/test/test_memcpy.c | 52 +-
 1 file changed, 1 insertion(+), 51 deletions(-)

diff --git a/app/test/test_memcpy.c b/app/test/test_memcpy.c
index 56b8e1e..b2bb4e0 100644
--- a/app/test/test_memcpy.c
+++ b/app/test/test_memcpy.c
@@ -78,56 +78,9 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE 100

 /* Data is aligned on this many bytes (power of 2) */
-#define ALIGNMENT_UNIT  16
+#define ALIGNMENT_UNIT  32


-
-/* Structure with base memcpy func pointer, and number of bytes it copies */
-struct base_memcpy_func {
-   void (*func)(uint8_t *dst, const uint8_t *src);
-   unsigned size;
-};
-
-/* To create base_memcpy_func structure entries */
-#define BASE_FUNC(n) {rte_mov##n, n}
-
-/* Max number of bytes that can be copies with a "base" memcpy functions */
-#define MAX_BASE_FUNC_SIZE 256
-
-/*
- * Test the "base" memcpy functions, that a copy fixed number of bytes.
- */
-static int
-base_func_test(void)
-{
-   const struct base_memcpy_func base_memcpy_funcs[6] = {
-   BASE_FUNC(16),
-   BASE_FUNC(32),
-   BASE_FUNC(48),
-   BASE_FUNC(64),
-   BASE_FUNC(128),
-   BASE_FUNC(256),
-   };
-   unsigned i, j;
-   unsigned num_funcs = sizeof(base_memcpy_funcs) / 
sizeof(base_memcpy_funcs[0]);
-   uint8_t dst[MAX_BASE_FUNC_SIZE];
-   uint8_t src[MAX_BASE_FUNC_SIZE];
-
-   for (i = 0; i < num_funcs; i++) {
-   unsigned size = base_memcpy_funcs[i].size;
-   for (j = 0; j < size; j++) {
-   dst[j] = 0;
-   src[j] = (uint8_t) rte_rand();
-   }
-   base_memcpy_funcs[i].func(dst, src);
-   for (j = 0; j < size; j++)
-   if (dst[j] != src[j])
-   return -1;
-   }
-
-   return 0;
-}
-
 /*
  * Create two buffers, and initialise one with random values. These are copied
  * to the second buffer and then compared to see if the copy was successful.
@@ -218,9 +171,6 @@ test_memcpy(void)
ret = func_test();
if (ret != 0)
return -1;
-   ret = base_func_test();
-   if (ret != 0)
-   return -1;
return 0;
 }

-- 
1.9.3



[dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

2015-01-29 Thread Zhihong Wang
Main code changes:

1. Differentiate architectural features based on CPU flags

a. Implement separate move functions for SSE/AVX/AVX2 to make full use of
cache bandwidth

b. Implement a separate copy flow specifically optimized for the target
architecture

2. Rewrite the memcpy function "rte_memcpy"

a. Add store aligning

b. Add load aligning based on architectural features

c. Put block copy loop into inline move functions for better control of 
instruction order

d. Eliminate unnecessary MOVs

3. Rewrite the inline move functions

a. Add move functions for unaligned load cases

b. Change instruction order in copy loops for better pipeline utilization

c. Use intrinsics instead of assembly code

4. Remove slow glibc call for constant copies

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 680 +++--
 1 file changed, 509 insertions(+), 171 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index fb9eba8..7b2d382 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -34,166 +34,189 @@
 #ifndef _RTE_MEMCPY_X86_64_H_
 #define _RTE_MEMCPY_X86_64_H_

+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcpy().
+ */
+
+#include 
 #include 
 #include 
-#include 
+#include 

 #ifdef __cplusplus
 extern "C" {
 #endif

-#include "generic/rte_memcpy.h"
+/**
+ * Copy bytes from one location to another. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so it's address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ * @param n
+ *   Number of bytes to copy.
+ * @return
+ *   Pointer to the destination data.
+ */
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n) 
__attribute__((always_inline));

-#ifdef __INTEL_COMPILER
-#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
-#endif
+#ifdef RTE_MACHINE_CPUFLAG_AVX2

+/**
+ * AVX2 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   : [reg_a] "=x" (reg_a)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
+   __m128i xmm0;
+
+   xmm0 = _mm_loadu_si128((const __m128i *)src);
+   _mm_storeu_si128((__m128i *)dst, xmm0);
 }

+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a, reg_b;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
-}
+   __m256i ymm0;

-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-   __m128i reg_a, reg_b, reg_c;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu 32(%[src]), %[reg_c]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   "movdqu %[reg_c], 32(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b),
- [reg_c] "=x" (reg_c)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
+   ymm0 = _mm256_loadu_si256((const __m256i *)src);
+   _mm256_storeu_si256((__m256i *)dst, ymm0);
 }

+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a, reg_b, reg_c, reg_d;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu 32(%[src]), %[reg_c]\n\t"
-   "movdqu 4

[dpdk-dev] [PATCH] vhost: fix ANY_LAYOUT declaration

2018-01-18 Thread Zhihong Wang
The VIRTIO_F_ANY_LAYOUT feature indicates the device accepts arbitrary
descriptor layouts. The vhost-user lib already supports it, but the 
feature declaration is missing. This patch fixes the mismatch.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index b2bf0e8..57a9bea 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -170,6 +170,7 @@ struct vhost_msg {
 
 /* Features supported by this builtin vhost-user net driver. */
 #define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+   (1ULL << VIRTIO_F_ANY_LAYOUT) | \
(1ULL << VIRTIO_NET_F_CTRL_VQ) | \
(1ULL << VIRTIO_NET_F_CTRL_RX) | \
(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
-- 
2.7.5



[dpdk-dev] [PATCH v3 0/5] vhost: support selective datapath

2018-03-19 Thread Zhihong Wang
This patch set introduces support for selective datapath in DPDK vhost-user
lib. vDPA stands for vhost Data Path Acceleration. The idea is to let
various types of virtio-compatible devices transfer data directly with the
virtio driver to achieve acceleration.

The default datapath is the existing software implementation, more options
will be available when new engines are added.

Design details


An engine is a group of virtio-compatible devices. The definition of engine
is as follows:

struct rte_vdpa_eng_addr {
union {
uint8_t __dummy[64];
struct rte_pci_addr pci_addr;
};
};

struct rte_vdpa_eng_info {
char name[MAX_VDPA_NAME_LEN];
struct rte_vdpa_eng_addr *addr;
};

struct rte_vdpa_dev_ops {
vdpa_dev_conf_tdev_conf;
vdpa_dev_close_t   dev_close;
vdpa_vring_state_set_t vring_state_set;
vdpa_feature_set_t feature_set;
vdpa_migration_done_t  migration_done;
vdpa_get_vfio_group_fd_t  get_vfio_group_fd;
vdpa_get_vfio_device_fd_t get_vfio_device_fd;
vdpa_get_notify_area_tget_notify_area;
};

struct rte_vdpa_eng_ops {
vdpa_eng_init_t   eng_init;
vdpa_eng_uninit_t eng_uninit;
vdpa_info_query_t info_query;
};

struct rte_vdpa_eng_driver {
const char *name;
struct rte_vdpa_eng_ops eng_ops;
struct rte_vdpa_dev_ops dev_ops;
} __rte_cache_aligned;

struct rte_vdpa_engine {
struct rte_vdpa_eng_infoeng_info;
struct rte_vdpa_eng_driver *eng_drv;
} __rte_cache_aligned;

A set of engine ops is defined in rte_vdpa_eng_ops for engine init, uninit,
and attributes reporting. The attributes are defined as follows:

struct rte_vdpa_eng_attr {
uint64_t features;
uint64_t protocol_features;
uint32_t queue_num;
uint32_t dev_num;
};

A set of device ops is defined in rte_vdpa_dev_ops for each virtio device
in the engine to do device specific operations.

Changes to the current vhost-user lib are:


 1. Make vhost device capabilities configurable to accommodate various
engines. Such capabilities include supported features, protocol features,
and queue number. APIs are introduced to let the app configure these
capabilities.

 2. In addition to the existing vhost framework, a set of callbacks is
added for vhost to call the driver for device operations at the right
time:

 a. dev_conf: Called to configure the actual device when the virtio
device becomes ready.

 b. dev_close: Called to close the actual device when the virtio device
is stopped.

 c. vring_state_set: Called to change the state of the vring in the
actual device when vring state changes.

 d. feature_set: Called to set the negotiated features to the device.

 e. migration_done: Called to allow the device to respond to RARP
sending.

 f. get_vfio_group_fd: Called to get the VFIO group fd of the device.

 g. get_vfio_device_fd: Called to get the VFIO device fd of the device.

 h. get_notify_area: Called to get the notify area info of the queue.

 3. To make vhost aware of its own type, an engine id (eid) and a device
id (did) are added into the vhost data structure to identify the actual
device. APIs are introduced to let the app configure them. When the default
software datapath is used, eid and did are set to -1. When an alternative
datapath is used, eid and did are set by the app to specify which device to
use. Each vhost-user socket can have only one connection in this case.

Working process:


 1. Register driver during DPDK initialization.

 2. Register engine with driver name and address.

 3. Get engine attributes.

 4. For vhost device creation:

  a. Register vhost-user socket.

  b. Set eid and did of the vhost-user socket.

  c. Register vhost-user callbacks.

  d. Start to wait for connection.

 5. When a connection comes in and the virtio device data structure is
negotiated, the device will be configured with all the needed info.
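
A sketch of steps 4.a-4.d for one vhost-user socket, using the APIs
introduced in this series together with the existing socket APIs. The
engine/device ids and the callback table are placeholders supplied by the
application; a real app would fill new_device/destroy_device handlers in
my_notify_ops.

#include <rte_vhost.h>

static const struct vhost_device_ops my_notify_ops; /* app callbacks */

static int
setup_vdpa_socket(const char *path, int eid, int did)
{
	/* a. Register the vhost-user socket. */
	if (rte_vhost_driver_register(path, 0) < 0)
		return -1;

	/* b. Bind the socket to one engine/device (single connection). */
	if (rte_vhost_driver_set_vdpa_eid(path, eid) < 0 ||
	    rte_vhost_driver_set_vdpa_did(path, did) < 0)
		return -1;

	/* c. Register the application's vhost-user callbacks. */
	if (rte_vhost_driver_callback_register(path, &my_notify_ops) < 0)
		return -1;

	/* d. Start waiting for a connection. */
	return rte_vhost_driver_start(path);
}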

---
Changes in v3:

 1. Keep macro names the same as in the spec.

 2. Export new APIs where they're introduced.

---
Changes in v2:

 1. Ensure negotiated capabilities are supported in vhost-user lib.

 2. Add APIs for live migration.

 3. Configure the data path at the right time.

 4. Add VFIO related vDPA device ops.

 5. Rebase on dpdk-next-virtio.

Zhihong Wang (5):
  vhost: export vhost feature definitions
  vhost: support selective datapath
  vhost: add apis for datapath configuration
  vhost: adapt vhost lib for selective datapath
  vhost: add apis for live migration

 lib/librte_vhost/Makefile  |   4 +-
 lib/librte_vhost/rte_vdpa.h| 126 +++
 lib/librte_vhost/rte_vhost.h   | 178 +
 lib/librte_vhost/rte_vhost_version.map |  19 
 lib/librte_vhost/socket.c  | 141 +-

[dpdk-dev] [PATCH v3 3/5] vhost: add apis for datapath configuration

2018-03-19 Thread Zhihong Wang
This patch adds APIs for datapath configuration. The eid and did of the
vhost-user socket can be configured to identify the actual device.

When the default software datapath is used, eid and did are set to -1.
When an alternative datapath is used, eid and did are set by the app to
specify which device to use. Each vhost-user socket can have only one
connection in this case.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/rte_vhost.h   | 70 ++
 lib/librte_vhost/rte_vhost_version.map |  6 +++
 lib/librte_vhost/socket.c  | 65 +++
 lib/librte_vhost/vhost.c   | 50 
 lib/librte_vhost/vhost.h   | 10 +
 5 files changed, 201 insertions(+)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 671ea5053..7aa57ca87 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -200,6 +200,54 @@ int rte_vhost_driver_register(const char *path, uint64_t 
flags);
 int rte_vhost_driver_unregister(const char *path);
 
 /**
+ * Set the engine id, enforce single connection per socket
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param eid
+ *  Engine id
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_set_vdpa_eid(const char *path, int eid);
+
+/**
+ * Set the device id, enforce single connection per socket
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param did
+ *  Device id
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_set_vdpa_did(const char *path, int did);
+
+/**
+ * Get the engine id
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  Engine id, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_vdpa_eid(const char *path);
+
+/**
+ * Get the device id
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  Device id, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_vdpa_did(const char *path);
+
+/**
  * Set the feature bits the vhost-user driver supports.
  *
  * @param path
@@ -464,6 +512,28 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx);
  */
 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
 
+/**
+ * Get vdpa engine id for vhost device.
+ *
+ * @param vid
+ *  vhost device ID
+ * @return
+ *  engine id
+ */
+int __rte_experimental
+rte_vhost_get_vdpa_eid(int vid);
+
+/**
+ * Get vdpa device id for vhost device.
+ *
+ * @param vid
+ *  vhost device ID
+ * @return
+ *  device id
+ */
+int __rte_experimental
+rte_vhost_get_vdpa_did(int vid);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_vhost/rte_vhost_version.map 
b/lib/librte_vhost/rte_vhost_version.map
index 57a3edd01..c505596c5 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -66,4 +66,10 @@ EXPERIMENTAL {
rte_vdpa_find_engine_id;
rte_vdpa_info_query;
rte_vdpa_register_driver;
+   rte_vhost_driver_set_vdpa_eid;
+   rte_vhost_driver_set_vdpa_did;
+   rte_vhost_driver_get_vdpa_eid;
+   rte_vhost_driver_get_vdpa_did;
+   rte_vhost_get_vdpa_eid;
+   rte_vhost_get_vdpa_did;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index cfc31e179..8551eb58c 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -52,6 +52,13 @@ struct vhost_user_socket {
uint64_t supported_features;
uint64_t features;
 
+   /* engine and device id to identify a certain port on a specific
+* backend, both are set to -1 for sw. when used, one socket can
+* have 1 connection only.
+*/
+   int eid;
+   int did;
+
struct vhost_device_ops const *notify_ops;
 };
 
@@ -545,6 +552,64 @@ find_vhost_user_socket(const char *path)
 }
 
 int
+rte_vhost_driver_set_vdpa_eid(const char *path, int eid)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   vsocket->eid = eid;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_set_vdpa_did(const char *path, int did)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   vsocket->did = did;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_vdpa_eid(const char *path)
+{
+   struct vhost_user_socket *vsocket;
+   int eid = -1;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   eid = vsocket->eid;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return eid;
+}
+
+int
+rt

[dpdk-dev] [PATCH v3 4/5] vhost: adapt vhost lib for selective datapath

2018-03-19 Thread Zhihong Wang
This patch adapts vhost lib for selective datapath by calling device ops
at the corresponding stage.

Signed-off-by: Zhihong Wang 
---
Changes in v2:

 1. Ensure negotiated capabilities are supported in vhost-user lib.

 2. Configure the data path at the right time.

 lib/librte_vhost/rte_vhost.h   | 27 
 lib/librte_vhost/rte_vhost_version.map |  2 +
 lib/librte_vhost/socket.c  | 76 +-
 lib/librte_vhost/vhost.c   |  3 ++
 lib/librte_vhost/vhost.h   |  2 +
 lib/librte_vhost/vhost_user.c  | 56 ++---
 6 files changed, 158 insertions(+), 8 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 7aa57ca87..77c2a1a8b 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -303,6 +303,33 @@ int rte_vhost_driver_disable_features(const char *path, 
uint64_t features);
 int rte_vhost_driver_get_features(const char *path, uint64_t *features);
 
 /**
+ * Get the protocol feature bits before feature negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param protocol_features
+ *  A pointer to store the queried protocol feature bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_protocol_features(const char *path,
+   uint64_t *protocol_features);
+
+/**
+ * Get the queue number bits before feature negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param queue_num
+ *  A pointer to store the queried queue number bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
+
+/**
  * Get the feature bits after negotiation
  *
  * @param vid
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index c505596c5..8ef2a396c 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -72,4 +72,6 @@ EXPERIMENTAL {
rte_vhost_driver_get_vdpa_did;
rte_vhost_get_vdpa_eid;
rte_vhost_get_vdpa_did;
+   rte_vhost_driver_get_protocol_features;
+   rte_vhost_driver_get_queue_num;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 8551eb58c..14dce2a73 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -216,6 +216,9 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 
vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net);
 
+   vhost_set_vdpa_eid(vid, vsocket->eid);
+   vhost_set_vdpa_did(vid, vsocket->did);
+
if (vsocket->dequeue_zero_copy)
vhost_enable_dequeue_zero_copy(vid);
 
@@ -677,11 +680,80 @@ int
 rte_vhost_driver_get_features(const char *path, uint64_t *features)
 {
struct vhost_user_socket *vsocket;
+   struct rte_vdpa_eng_attr attr;
+   int eid = -1;
 
pthread_mutex_lock(&vhost_user.mutex);
vsocket = find_vhost_user_socket(path);
-   if (vsocket)
-   *features = vsocket->features;
+   if (vsocket) {
+   eid = vsocket->eid;
+   if (rte_vdpa_info_query(eid, &attr) < 0)
+   *features = vsocket->features;
+   else
+   *features = vsocket->features & attr.features;
+
+   }
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   if (!vsocket) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "socket file %s is not registered yet.\n", path);
+   return -1;
+   } else {
+   return 0;
+   }
+}
+
+int
+rte_vhost_driver_get_protocol_features(const char *path,
+   uint64_t *protocol_features)
+{
+   struct vhost_user_socket *vsocket;
+   struct rte_vdpa_eng_attr attr;
+   int eid = -1;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket) {
+   eid = vsocket->eid;
+   if (rte_vdpa_info_query(eid, &attr) < 0)
+   *protocol_features = VHOST_USER_PROTOCOL_FEATURES;
+   else
+   *protocol_features = VHOST_USER_PROTOCOL_FEATURES
+   & attr.protocol_features;
+
+   }
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   if (!vsocket) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "socket file %s is not registered yet.\n", path);
+   return -1;
+   } else {
+   return 0;
+   }
+}
+
+int
+rte_vhost_driver_get_queue_num(const char *path,
+   uint32_t *queue_num)
+{
+   struct vhost_user_socket *vsocket;
+   struct rte_vdpa_eng_attr attr;
+   int eid = -1;
+
+   pthread_mutex_lock(&a

[dpdk-dev] [PATCH v3 1/5] vhost: export vhost feature definitions

2018-03-19 Thread Zhihong Wang
This patch exports vhost-user protocol features to support device driver
development.

Signed-off-by: Zhihong Wang 
---
Changes in v3:

 1. Keep macro names the same as in the spec.

 lib/librte_vhost/rte_vhost.h  | 30 ++
 lib/librte_vhost/vhost.h  |  2 --
 lib/librte_vhost/vhost_user.h |  7 ---
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d33206997..671ea5053 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -29,6 +29,36 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 
+/** Protocol features. */
+#ifndef VHOST_USER_PROTOCOL_F_MQ
+#define VHOST_USER_PROTOCOL_F_MQ   0
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_RARP
+#define VHOST_USER_PROTOCOL_F_RARP 2
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK3
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_NET_MTU
+#define VHOST_USER_PROTOCOL_F_NET_MTU  4
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
+#define VHOST_USER_PROTOCOL_F_SLAVE_REQ5
+#endif
+
+/** Indicate whether protocol features negotiation is supported. */
+#ifndef VHOST_USER_F_PROTOCOL_FEATURES
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+#endif
+
 /**
  * Information relating to memory regions including offsets to
  * addresses in QEMUs memory file.
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 58aec2e0d..2e28e4026 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -174,8 +174,6 @@ struct vhost_msg {
  #define VIRTIO_F_VERSION_1 32
 #endif
 
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
 /* Features supported by this builtin vhost-user net driver. */
 #define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
(1ULL << VIRTIO_F_ANY_LAYOUT) | \
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 0fafbe6e0..97afabc47 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -14,13 +14,6 @@
 
 #define VHOST_MEMORY_MAX_NREGIONS 8
 
-#define VHOST_USER_PROTOCOL_F_MQ   0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1
-#define VHOST_USER_PROTOCOL_F_RARP 2
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK3
-#define VHOST_USER_PROTOCOL_F_NET_MTU 4
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
-
 #define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
                                        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
                                        (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
-- 
2.13.6



[dpdk-dev] [PATCH v3 2/5] vhost: support selective datapath

2018-03-19 Thread Zhihong Wang
This patch introduces support for selective datapath in DPDK vhost-user lib
to enable various types of virtio-compatible devices to do data transfer
with virtio driver directly to enable acceleration. The default datapath is
the existing software implementation, more options will be available when
new engines are registered.

An engine is a group of virtio-compatible devices under a single address.
The engine driver includes:

 1. A set of engine ops is defined in rte_vdpa_eng_ops to perform engine
init, uninit, and attributes reporting.

 2. A set of device ops is defined in rte_vdpa_dev_ops for virtio devices
in the engine to do device specific operations:

 a. dev_conf: Called to configure the actual device when the virtio
device becomes ready.

 b. dev_close: Called to close the actual device when the virtio device
is stopped.

 c. vring_state_set: Called to change the state of the vring in the
actual device when vring state changes.

 d. feature_set: Called to set the negotiated features to device.

 e. migration_done: Called to allow the device to respond to RARP
sending.

 f. get_vfio_group_fd: Called to get the VFIO group fd of the device.

 g. get_vfio_device_fd: Called to get the VFIO device fd of the device.

 h. get_notify_area: Called to get the notify area info of the queue.

Signed-off-by: Zhihong Wang 
---
Changes in v2:

 1. Add VFIO related vDPA device ops.

 lib/librte_vhost/Makefile  |   4 +-
 lib/librte_vhost/rte_vdpa.h| 126 +
 lib/librte_vhost/rte_vhost_version.map |   8 +++
 lib/librte_vhost/vdpa.c| 124 
 4 files changed, 260 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_vhost/rte_vdpa.h
 create mode 100644 lib/librte_vhost/vdpa.c

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index 5d6c6abae..37044ac03 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -22,9 +22,9 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
-   vhost_user.c virtio_net.c
+   vhost_user.c virtio_net.c vdpa.c
 
 # install includes
-SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h
new file mode 100644
index 0..23fb471be
--- /dev/null
+++ b/lib/librte_vhost/rte_vdpa.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_VDPA_H_
+#define _RTE_VDPA_H_
+
+/**
+ * @file
+ *
+ * Device specific vhost lib
+ */
+
+#include <rte_pci.h>
+#include "rte_vhost.h"
+
+#define MAX_VDPA_ENGINE_NUM 128
+#define MAX_VDPA_NAME_LEN 128
+
+struct rte_vdpa_eng_addr {
+   union {
+   uint8_t __dummy[64];
+   struct rte_pci_addr pci_addr;
+   };
+};
+
+struct rte_vdpa_eng_info {
+   struct rte_vdpa_eng_addr *addr;
+   char name[MAX_VDPA_NAME_LEN];
+};
+
+struct rte_vdpa_eng_attr {
+   uint64_t features;
+   uint64_t protocol_features;
+   uint32_t queue_num;
+   uint32_t dev_num;
+};
+
+/* register/remove engine */
+typedef int (*vdpa_eng_init_t)(int eid, struct rte_vdpa_eng_addr *addr);
+typedef int (*vdpa_eng_uninit_t)(int eid);
+
+/* query info of this engine */
+typedef int (*vdpa_info_query_t)(int eid,
+   struct rte_vdpa_eng_attr *attr);
+
+/* driver configure/close the port based on connection */
+typedef int (*vdpa_dev_conf_t)(int vid);
+typedef int (*vdpa_dev_close_t)(int vid);
+
+/* enable/disable this vring */
+typedef int (*vdpa_vring_state_set_t)(int vid, int vring, int state);
+
+/* set features when changed */
+typedef int (*vdpa_feature_set_t)(int vid);
+
+/* destination operations when migration done, e.g. send rarp */
+typedef int (*vdpa_migration_done_t)(int vid);
+
+/* get the vfio group fd */
+typedef int (*vdpa_get_vfio_group_fd_t)(int vid);
+
+/* get the vfio device fd */
+typedef int (*vdpa_get_vfio_device_fd_t)(int vid);
+
+/* get the notify area info of the queue */
+typedef int (*vdpa_get_notify_area_t)(int vid, int qid, uint64_t *offset,
+   uint64_t *size);
+/* device ops */
+struct rte_vdpa_dev_ops {
+   vdpa_dev_conf_t   dev_conf;
+   vdpa_dev_close_t  dev_close;
+   vdpa_vring_state_set_tvring_state_set;
+   vdpa_feature_set_tfeature_set;
+   vdpa_migration_done_t migration_done;
+   vdpa_get_vfio_group_fd_t  get_vfio_group_fd;
+   vdpa_get_vfio_device_fd_t get_vfio_device_fd;
+   vdpa_get_notify_area_tget_notify_area;
+};
+
+/* engine ops */
+struct rte_vd

[dpdk-dev] [PATCH v3 5/5] vhost: add apis for live migration

2018-03-19 Thread Zhihong Wang
This patch adds APIs to enable live migration for non-builtin data paths.

At src side, last_avail/used_idx from the device need to be set into the
virtio_net structure, and the log_base and log_size from the virtio_net
structure need to be set into the device.

At dst side, last_avail/used_idx need to be read from the virtio_net
structure and set into the device.
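
For reference, a minimal sketch of how a driver for a non-builtin datapath
could use these APIs; my_hw_set_log(), my_hw_get_vring() and my_hw_set_vring()
are hypothetical device specific helpers, not part of this patch:

#include <stdint.h>
#include <rte_vhost.h>

int my_hw_set_log(int vid, uint64_t base, uint64_t size);
int my_hw_get_vring(int vid, uint16_t qid, uint16_t *avail, uint16_t *used);
int my_hw_set_vring(int vid, uint16_t qid, uint16_t avail, uint16_t used);

/* Src side: program the dirty log into the device and save the ring
 * indexes read from the device into the virtio_net structure.
 */
static int
example_migration_src(int vid, uint16_t qid)
{
	uint64_t log_base, log_size;
	uint16_t avail_idx, used_idx;

	if (rte_vhost_get_log_base(vid, &log_base, &log_size) < 0)
		return -1;
	if (my_hw_set_log(vid, log_base, log_size) < 0)
		return -1;

	if (my_hw_get_vring(vid, qid, &avail_idx, &used_idx) < 0)
		return -1;
	return rte_vhost_set_vring_base(vid, qid, avail_idx, used_idx);
}

/* Dst side: read the ring indexes from the virtio_net structure and
 * restore them into the device.
 */
static int
example_migration_dst(int vid, uint16_t qid)
{
	uint16_t avail_idx, used_idx;

	if (rte_vhost_get_vring_base(vid, qid, &avail_idx, &used_idx) < 0)
		return -1;
	return my_hw_set_vring(vid, qid, avail_idx, used_idx);
}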

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/rte_vhost.h   | 51 +++
 lib/librte_vhost/rte_vhost_version.map |  3 ++
 lib/librte_vhost/vhost.c   | 63 ++
 3 files changed, 117 insertions(+)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 77c2a1a8b..0f9303949 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -540,6 +540,57 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx);
 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
 
 /**
+ * Get log base and log size of the vhost device
+ *
+ * @param vid
+ *  vhost device ID
+ * @param log_base
+ *  vhost log base
+ * @param log_size
+ *  vhost log size
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
+
+/**
+ * Get last_avail/used_idx of the vhost virtqueue
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  vhost queue index
+ * @param last_avail_idx
+ *  vhost last_avail_idx to get
+ * @param last_used_idx
+ *  vhost last_used_idx to get
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+   uint16_t *last_avail_idx, uint16_t *last_used_idx);
+
+/**
+ * Set last_avail/used_idx of the vhost virtqueue
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  vhost queue index
+ * @param last_avail_idx
+ *  last_avail_idx to set
+ * @param last_used_idx
+ *  last_used_idx to set
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+   uint16_t last_avail_idx, uint16_t last_used_idx);
+
+/**
  * Get vdpa engine id for vhost device.
  *
  * @param vid
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 8ef2a396c..88bf6cb54 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -74,4 +74,7 @@ EXPERIMENTAL {
rte_vhost_get_vdpa_did;
rte_vhost_driver_get_protocol_features;
rte_vhost_driver_get_queue_num;
+   rte_vhost_get_log_base;
+   rte_vhost_get_vring_base;
+   rte_vhost_set_vring_base;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index f8a5a1c42..c7332c557 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -667,3 +667,66 @@ int rte_vhost_get_vdpa_did(int vid)
 
return dev->did;
 }
+
+int rte_vhost_get_log_base(int vid, uint64_t *log_base,
+   uint64_t *log_size)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   *log_base = dev->log_base;
+   *log_size = dev->log_size;
+
+   return 0;
+}
+
+int rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+   uint16_t *last_avail_idx, uint16_t *last_used_idx)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   *last_avail_idx = dev->virtqueue[queue_id]->last_avail_idx;
+   *last_used_idx = dev->virtqueue[queue_id]->last_used_idx;
+
+   return 0;
+}
+
+int rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+   uint16_t last_avail_idx, uint16_t last_used_idx)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   dev->virtqueue[queue_id]->last_avail_idx = last_avail_idx;
+   dev->virtqueue[queue_id]->last_used_idx = last_used_idx;
+
+   return 0;
+}
-- 
2.13.6



[dpdk-dev] [PATCH v4 1/5] vhost: export vhost feature definitions

2018-03-30 Thread Zhihong Wang
This patch exports vhost-user protocol features to support device driver
development.

Signed-off-by: Zhihong Wang 
---
Changes in v3:

 1. Keep macro names the same as in the spec.

 lib/librte_vhost/rte_vhost.h  | 30 ++
 lib/librte_vhost/vhost.h  |  2 --
 lib/librte_vhost/vhost_user.h |  7 ---
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d33206997..671ea5053 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -29,6 +29,36 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 
+/** Protocol features. */
+#ifndef VHOST_USER_PROTOCOL_F_MQ
+#define VHOST_USER_PROTOCOL_F_MQ   0
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_RARP
+#define VHOST_USER_PROTOCOL_F_RARP 2
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK3
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_NET_MTU
+#define VHOST_USER_PROTOCOL_F_NET_MTU  4
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
+#define VHOST_USER_PROTOCOL_F_SLAVE_REQ5
+#endif
+
+/** Indicate whether protocol features negotiation is supported. */
+#ifndef VHOST_USER_F_PROTOCOL_FEATURES
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+#endif
+
 /**
  * Information relating to memory regions including offsets to
  * addresses in QEMUs memory file.
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 58aec2e0d..2e28e4026 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -174,8 +174,6 @@ struct vhost_msg {
  #define VIRTIO_F_VERSION_1 32
 #endif
 
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
 /* Features supported by this builtin vhost-user net driver. */
 #define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
(1ULL << VIRTIO_F_ANY_LAYOUT) | \
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 0fafbe6e0..97afabc47 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -14,13 +14,6 @@
 
 #define VHOST_MEMORY_MAX_NREGIONS 8
 
-#define VHOST_USER_PROTOCOL_F_MQ   0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1
-#define VHOST_USER_PROTOCOL_F_RARP 2
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK3
-#define VHOST_USER_PROTOCOL_F_NET_MTU 4
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
-
 #define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
                                        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
                                        (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
-- 
2.13.6



[dpdk-dev] [PATCH v4 2/5] vhost: support selective datapath

2018-03-30 Thread Zhihong Wang
This patch set introduces support for selective datapath in DPDK vhost-user
lib. vDPA stands for vhost Data Path Acceleration. The idea is to support
virtio ring compatible devices to serve virtio driver directly to enable
datapath acceleration.

A set of device ops is defined for device specific operations:

 a. queue_num_get: Called to get supported queue number of the device.

 b. feature_get: Called to get supported features of the device.

 c. protocol_feature_get: Called to get supported protocol features of
the device.

 d. dev_conf: Called to configure the actual device when the virtio
device becomes ready.

 e. dev_close: Called to close the actual device when the virtio device
is stopped.

 f. vring_state_set: Called to change the state of the vring in the
actual device when vring state changes.

 g. feature_set: Called to set the negotiated features to device.

 h. migration_done: Called to allow the device to respond to RARP
sending.

 i. get_vfio_group_fd: Called to get the VFIO group fd of the device.

 j. get_vfio_device_fd: Called to get the VFIO device fd of the device.

 k. get_notify_area: Called to get the notify area info of the queue.

Signed-off-by: Zhihong Wang 
---
Changes in v4:

 1. Remove the "engine" concept in the lib.

---
Changes in v2:

 1. Add VFIO related vDPA device ops.

 lib/librte_vhost/Makefile  |  4 +-
 lib/librte_vhost/rte_vdpa.h| 94 +
 lib/librte_vhost/rte_vhost_version.map |  6 +++
 lib/librte_vhost/vdpa.c| 96 ++
 4 files changed, 198 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_vhost/rte_vdpa.h
 create mode 100644 lib/librte_vhost/vdpa.c

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index 5d6c6abae..37044ac03 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -22,9 +22,9 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
-   vhost_user.c virtio_net.c
+   vhost_user.c virtio_net.c vdpa.c
 
 # install includes
-SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h
new file mode 100644
index 0..a4bbbd93d
--- /dev/null
+++ b/lib/librte_vhost/rte_vdpa.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_VDPA_H_
+#define _RTE_VDPA_H_
+
+/**
+ * @file
+ *
+ * Device specific vhost lib
+ */
+
+#include <rte_pci.h>
+#include "rte_vhost.h"
+
+#define MAX_VDPA_NAME_LEN 128
+
+enum vdpa_addr_type {
+   PCI_ADDR,
+   VDPA_ADDR_MAX
+};
+
+struct rte_vdpa_dev_addr {
+   enum vdpa_addr_type type;
+   union {
+   uint8_t __dummy[64];
+   struct rte_pci_addr pci_addr;
+   };
+};
+
+/* Get capabilities of this device */
+typedef int (*vdpa_dev_queue_num_get_t)(int did, uint32_t *queue_num);
+typedef int (*vdpa_dev_feature_get_t)(int did, uint64_t *features);
+
+/* Driver configure/close the device */
+typedef int (*vdpa_dev_conf_t)(int vid);
+typedef int (*vdpa_dev_close_t)(int vid);
+
+/* Enable/disable this vring */
+typedef int (*vdpa_vring_state_set_t)(int vid, int vring, int state);
+
+/* Set features when changed */
+typedef int (*vdpa_feature_set_t)(int vid);
+
+/* Destination operations when migration done */
+typedef int (*vdpa_migration_done_t)(int vid);
+
+/* Get the vfio group fd */
+typedef int (*vdpa_get_vfio_group_fd_t)(int vid);
+
+/* Get the vfio device fd */
+typedef int (*vdpa_get_vfio_device_fd_t)(int vid);
+
+/* Get the notify area info of the queue */
+typedef int (*vdpa_get_notify_area_t)(int vid, int qid, uint64_t *offset,
+   uint64_t *size);
+/* Device ops */
+struct rte_vdpa_dev_ops {
+   vdpa_dev_queue_num_get_t  queue_num_get;
+   vdpa_dev_feature_get_tfeature_get;
+   vdpa_dev_feature_get_tprotocol_feature_get;
+   vdpa_dev_conf_t   dev_conf;
+   vdpa_dev_close_t  dev_close;
+   vdpa_vring_state_set_tvring_state_set;
+   vdpa_feature_set_tfeature_set;
+   vdpa_migration_done_t migration_done;
+   vdpa_get_vfio_group_fd_t  get_vfio_group_fd;
+   vdpa_get_vfio_device_fd_t get_vfio_device_fd;
+   vdpa_get_notify_area_tget_notify_area;
+};
+
+struct rte_vdpa_device {
+   struct rte_vdpa_dev_addr addr;
+   struct rte_vdpa_dev_ops *ops;
+} __rte_cache_aligned;
+
+extern struct rte_vdpa_device *vdpa_devices[];
+extern uint32_t vdpa_device_num;
+
+/* Register a vdpa device, return did if successful, -

[dpdk-dev] [PATCH v4 0/5] vhost: support selective datapath

2018-03-30 Thread Zhihong Wang
This patch set introduces support for selective datapath in DPDK vhost-user
lib. vDPA stands for vhost Data Path Acceleration. The idea is to support
virtio ring compatible devices to serve virtio driver directly to enable
datapath acceleration.

Design details


A vDPA device is a virtio ring compatible backend. It's defined as follows:

struct rte_vdpa_dev_addr {
enum vdpa_addr_type type;
union {
uint8_t __dummy[64];
struct rte_pci_addr pci_addr;
};
};

struct rte_vdpa_dev_ops {
vdpa_dev_queue_num_get_t  queue_num_get;
vdpa_dev_feature_get_tfeature_get;
vdpa_dev_feature_get_tprotocol_feature_get;
vdpa_dev_conf_t   dev_conf;
vdpa_dev_close_t  dev_close;
vdpa_vring_state_set_tvring_state_set;
vdpa_feature_set_tfeature_set;
vdpa_migration_done_t migration_done;
vdpa_get_vfio_group_fd_t  get_vfio_group_fd;
vdpa_get_vfio_device_fd_t get_vfio_device_fd;
vdpa_get_notify_area_tget_notify_area;
};

struct rte_vdpa_device {
struct rte_vdpa_dev_addr addr;
struct rte_vdpa_dev_ops *ops;
} __rte_cache_aligned;


Changes to the current vhost-user lib are:


 1. Make vhost device capabilities configurable to accommodate various devices.
Such capabilities include supported features, protocol features, and queue
number. APIs are introduced to configure these capabilities.

 2. A set of device ops is defined for device specific operations:

 a. queue_num_get: Called to get supported queue number of the device.

 b. feature_get: Called to get supported features of the device.

 c. protocol_feature_get: Called to get supported protocol features of
the device.

 d. dev_conf: Called to configure the actual device when the virtio
device becomes ready.

 e. dev_close: Called to close the actual device when the virtio device
is stopped.

 f. vring_state_set: Called to change the state of the vring in the
actual device when vring state changes.

 g. feature_set: Called to set the negotiated features to device.

 h. migration_done: Called to allow the device to respond to RARP
sending.

 i. get_vfio_group_fd: Called to get the VFIO group fd of the device.

 j. get_vfio_device_fd: Called to get the VFIO device fd of the device.

 k. get_notify_area: Called to get the notify area info of the queue.

 3. To make vhost aware of its datapath type, a device id (did) is added
into the vhost data structure to identify the actual device. APIs are
introduced to configure it. When the default software datapath is used,
did is set to -1. When an alternative datapath is used, did should be set
to specify which device to use, and each vhost-user socket can have
only 1 connection in this case.

Working process:


 1. Register vDPA device with address and device ops, and get a did.

 2. For vhost device creation:

  a. Register vhost-user socket.

  b. Set did of the vhost-user socket.

  c. Register vhost-user callbacks.

  d. Start to wait for connection.

 3. When a connection comes and the virtio device data structure is negotiated,
the device will be configured with all needed info.

---
Changes in v4:

 1. Remove the "engine" concept in the lib.

---
Changes in v3:

 1. Keep macro names the same as in the spec.

 2. Export new APIs where they're introduced.

---
Changes in v2:

 1. Ensure negotiated capabilities are supported in vhost-user lib.

 2. Add APIs for live migration.

 3. Configure the data path at the right time.

 4. Add VFIO related vDPA device ops.

 5. Rebase on dpdk-next-virtio.

Zhihong Wang (5):
  vhost: export vhost feature definitions
  vhost: support selective datapath
  vhost: add apis for datapath configuration
  vhost: adapt vhost lib for selective datapath
  vhost: add apis for live migration

 lib/librte_vhost/Makefile  |   4 +-
 lib/librte_vhost/rte_vdpa.h|  94 ++
 lib/librte_vhost/rte_vhost.h   | 143 +
 lib/librte_vhost/rte_vhost_version.map |  14 
 lib/librte_vhost/socket.c  | 130 --
 lib/librte_vhost/vdpa.c|  96 ++
 lib/librte_vhost/vhost.c   |  91 +
 lib/librte_vhost/vhost.h   |  13 ++-
 lib/librte_vhost/vhost_user.c  |  54 +++--
 lib/librte_vhost/vhost_user.h  |   7 --
 10 files changed, 625 insertions(+), 21 deletions(-)
 create mode 100644 lib/librte_vhost/rte_vdpa.h
 create mode 100644 lib/librte_vhost/vdpa.c

-- 
2.13.6



[dpdk-dev] [PATCH v4 5/5] vhost: add apis for live migration

2018-03-30 Thread Zhihong Wang
This patch adds APIs to enable live migration for non-builtin data paths.

At src side, last_avail/used_idx from the device need to be set into the
virtio_net structure, and the log_base and log_size from the virtio_net
structure need to be set into the device.

At dst side, last_avail/used_idx need to be read from the virtio_net
structure and set into the device.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/rte_vhost.h   | 51 +++
 lib/librte_vhost/rte_vhost_version.map |  3 ++
 lib/librte_vhost/vhost.c   | 63 ++
 3 files changed, 117 insertions(+)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 3c3334d3e..ff98ebd59 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -516,6 +516,57 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx);
 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
 
 /**
+ * Get log base and log size of the vhost device
+ *
+ * @param vid
+ *  vhost device ID
+ * @param log_base
+ *  vhost log base
+ * @param log_size
+ *  vhost log size
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
+
+/**
+ * Get last_avail/used_idx of the vhost virtqueue
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  vhost queue index
+ * @param last_avail_idx
+ *  vhost last_avail_idx to get
+ * @param last_used_idx
+ *  vhost last_used_idx to get
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+   uint16_t *last_avail_idx, uint16_t *last_used_idx);
+
+/**
+ * Set last_avail/used_idx of the vhost virtqueue
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  vhost queue index
+ * @param last_avail_idx
+ *  last_avail_idx to set
+ * @param last_used_idx
+ *  last_used_idx to set
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+   uint16_t last_avail_idx, uint16_t last_used_idx);
+
+/**
  * Get vdpa device id for vhost device.
  *
  * @param vid
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 812ccd72b..81479d433 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -69,4 +69,7 @@ EXPERIMENTAL {
rte_vhost_get_vdpa_did;
rte_vhost_driver_get_protocol_features;
rte_vhost_driver_get_queue_num;
+   rte_vhost_get_log_base;
+   rte_vhost_get_vring_base;
+   rte_vhost_set_vring_base;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 66b6b492f..b52f7dad4 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -642,3 +642,66 @@ int rte_vhost_get_vdpa_did(int vid)
 
return dev->did;
 }
+
+int rte_vhost_get_log_base(int vid, uint64_t *log_base,
+   uint64_t *log_size)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   *log_base = dev->log_base;
+   *log_size = dev->log_size;
+
+   return 0;
+}
+
+int rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+   uint16_t *last_avail_idx, uint16_t *last_used_idx)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   *last_avail_idx = dev->virtqueue[queue_id]->last_avail_idx;
+   *last_used_idx = dev->virtqueue[queue_id]->last_used_idx;
+
+   return 0;
+}
+
+int rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+   uint16_t last_avail_idx, uint16_t last_used_idx)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   dev->virtqueue[queue_id]->last_avail_idx = last_avail_idx;
+   dev->virtqueue[queue_id]->last_used_idx = last_used_idx;
+
+   return 0;
+}
-- 
2.13.6



[dpdk-dev] [PATCH v4 3/5] vhost: add apis for datapath configuration

2018-03-30 Thread Zhihong Wang
This patch adds APIs for datapath configuration.

The did of the vhost-user socket can be set to identify the backend device;
in this case each vhost-user socket can have only 1 connection. The did is
set to -1 by default when the software datapath is used.
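
A small usage sketch, assuming did was already obtained when the backend was
registered with the vDPA framework; the socket path and the callback name
below are purely illustrative:

#include <rte_vhost.h>

/* Bind a registered vhost-user socket to a backend device. */
static int
example_set_datapath(const char *path, int did)
{
	if (rte_vhost_driver_register(path, 0) < 0)
		return -1;

	/* did == -1 keeps the default software datapath. */
	if (rte_vhost_driver_set_vdpa_did(path, did) < 0)
		return -1;

	return rte_vhost_driver_get_vdpa_did(path) == did ? 0 : -1;
}

/* Hypothetical new_device callback: query which backend serves this vid. */
static int
example_new_device(int vid)
{
	int did = rte_vhost_get_vdpa_did(vid);

	if (did < 0)
		return 0; /* software datapath, nothing extra to do */

	/* ... configure the hardware device identified by did ... */
	return 0;
}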

Signed-off-by: Zhihong Wang 
---
Changes in v4:

 1. Remove the "engine" concept in the lib.

 lib/librte_vhost/rte_vhost.h   | 35 +
 lib/librte_vhost/rte_vhost_version.map |  3 +++
 lib/librte_vhost/socket.c  | 36 ++
 lib/librte_vhost/vhost.c   | 25 +++
 lib/librte_vhost/vhost.h   |  9 +
 5 files changed, 108 insertions(+)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 671ea5053..d50f4c67d 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -200,6 +200,30 @@ int rte_vhost_driver_register(const char *path, uint64_t flags);
 int rte_vhost_driver_unregister(const char *path);
 
 /**
+ * Set the device id, enforce single connection per socket
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param did
+ *  Device id
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_set_vdpa_did(const char *path, int did);
+
+/**
+ * Get the device id
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  Device id, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_vdpa_did(const char *path);
+
+/**
  * Set the feature bits the vhost-user driver supports.
  *
  * @param path
@@ -464,6 +488,17 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx);
  */
 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
 
+/**
+ * Get vdpa device id for vhost device.
+ *
+ * @param vid
+ *  vhost device ID
+ * @return
+ *  device id
+ */
+int __rte_experimental
+rte_vhost_get_vdpa_did(int vid);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 7bcffb490..6e2d5364a 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -64,4 +64,7 @@ EXPERIMENTAL {
rte_vdpa_register_device;
rte_vdpa_unregister_device;
rte_vdpa_find_device_id;
+   rte_vhost_driver_set_vdpa_did;
+   rte_vhost_driver_get_vdpa_did;
+   rte_vhost_get_vdpa_did;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index cfc31e179..3d58da94e 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -52,6 +52,13 @@ struct vhost_user_socket {
uint64_t supported_features;
uint64_t features;
 
+   /*
+* Device id to identify a specific backend device.
+* It's set to -1 for the default software implementation.
+* If valid, one socket can have 1 connection only.
+*/
+   int did;
+
struct vhost_device_ops const *notify_ops;
 };
 
@@ -545,6 +552,35 @@ find_vhost_user_socket(const char *path)
 }
 
 int
+rte_vhost_driver_set_vdpa_did(const char *path, int did)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   vsocket->did = did;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_vdpa_did(const char *path)
+{
+   struct vhost_user_socket *vsocket;
+   int did = -1;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   did = vsocket->did;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return did;
+}
+
+int
 rte_vhost_driver_disable_features(const char *path, uint64_t features)
 {
struct vhost_user_socket *vsocket;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index f6f12a03b..1740cc1ab 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -283,6 +283,7 @@ vhost_new_device(void)
dev->vid = i;
dev->flags = VIRTIO_DEV_BUILTIN_VIRTIO_NET;
dev->slave_req_fd = -1;
+   dev->did = -1;
 
return i;
 }
@@ -311,6 +312,20 @@ vhost_destroy_device(int vid)
 }
 
 void
+vhost_set_vdpa_did(int vid, int did)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (did < 0)
+   return;
+
+   if (dev == NULL)
+   return;
+
+   dev->did = did;
+}
+
+void
 vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
 {
struct virtio_net *dev;
@@ -614,3 +629,13 @@ rte_vhost_rx_queue_count(int vid, uint16_t qid)
 
return *((volatile uint16_t *)&vq->avail->idx) - vq->last_avail_idx;
 }
+
+int rte_vhost_get_vdpa_did(int vid)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (dev == NULL)
+

[dpdk-dev] [PATCH v4 4/5] vhost: adapt vhost lib for selective datapath

2018-03-30 Thread Zhihong Wang
This patch adapts vhost lib for selective datapath by calling device ops
at the corresponding stage.

Signed-off-by: Zhihong Wang 
---
Changes in v4:

 1. Remove the "engine" concept in the lib.

---
Changes in v2:

 1. Ensure negotiated capabilities are supported in vhost-user lib.

 2. Configure the data path at the right time.

 lib/librte_vhost/rte_vhost.h   | 27 ++
 lib/librte_vhost/rte_vhost_version.map |  2 +
 lib/librte_vhost/socket.c  | 94 --
 lib/librte_vhost/vhost.c   |  3 ++
 lib/librte_vhost/vhost.h   |  2 +
 lib/librte_vhost/vhost_user.c  | 54 +--
 6 files changed, 172 insertions(+), 10 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d50f4c67d..3c3334d3e 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -279,6 +279,33 @@ int rte_vhost_driver_disable_features(const char *path, uint64_t features);
 int rte_vhost_driver_get_features(const char *path, uint64_t *features);
 
 /**
+ * Get the protocol feature bits before feature negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param protocol_features
+ *  A pointer to store the queried protocol feature bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_protocol_features(const char *path,
+   uint64_t *protocol_features);
+
+/**
+ * Get the queue number bits before feature negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param queue_num
+ *  A pointer to store the queried queue number bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
+
+/**
  * Get the feature bits after negotiation
  *
  * @param vid
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 6e2d5364a..812ccd72b 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -67,4 +67,6 @@ EXPERIMENTAL {
rte_vhost_driver_set_vdpa_did;
rte_vhost_driver_get_vdpa_did;
rte_vhost_get_vdpa_did;
+   rte_vhost_driver_get_protocol_features;
+   rte_vhost_driver_get_queue_num;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 3d58da94e..ba7b422a0 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -216,6 +216,8 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 
vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net);
 
+   vhost_set_vdpa_did(vid, vsocket->did);
+
if (vsocket->dequeue_zero_copy)
vhost_enable_dequeue_zero_copy(vid);
 
@@ -648,20 +650,102 @@ int
 rte_vhost_driver_get_features(const char *path, uint64_t *features)
 {
struct vhost_user_socket *vsocket;
+   uint64_t vdpa_features;
+   int did = -1;
+   int ret = 0;
 
pthread_mutex_lock(&vhost_user.mutex);
vsocket = find_vhost_user_socket(path);
-   if (vsocket)
-   *features = vsocket->features;
+   if (vsocket) {
+   did = vsocket->did;
+   if (did < 0 || vdpa_devices[did]->ops->feature_get == NULL)
+   *features = vsocket->features;
+   else if (vdpa_devices[did]->ops->feature_get(did,
+   &vdpa_features) < 0) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "failed to get vdpa features "
+   "for socket file %s.\n", path);
+   ret = -1;
+   } else
+   *features = vsocket->features & vdpa_features;
+   } else {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "socket file %s is not registered yet.\n", path);
+   ret = -1;
+   }
pthread_mutex_unlock(&vhost_user.mutex);
 
-   if (!vsocket) {
+   return ret;
+}
+
+int
+rte_vhost_driver_get_protocol_features(const char *path,
+   uint64_t *protocol_features)
+{
+   struct vhost_user_socket *vsocket;
+   uint64_t vdpa_protocol_features;
+   int did = -1;
+   int ret = 0;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket) {
+   did = vsocket->did;
+   if (did < 0 || vdpa_devices[did]->ops->protocol_feature_get
+   == NULL)
+   *protocol_features = VHOST_USER_PROTOCOL_FEATURES;
+   else if (vdpa_devices[did]->ops->protocol_feature_get(did,
+   &vdpa_protocol_features) < 0) {
+   

[dpdk-dev] [PATCH v5 2/5] vhost: support selective datapath

2018-04-02 Thread Zhihong Wang
This patch set introduces support for selective datapath in DPDK vhost-user
lib. vDPA stands for vhost Data Path Acceleration. The idea is to support
virtio ring compatible devices to serve virtio driver directly to enable
datapath acceleration.

A set of device ops is defined for device specific operations:

 a. get_queue_num: Called to get supported queue number of the device.

 b. get_features: Called to get supported features of the device.

 c. get_protocol_features: Called to get supported protocol features of
the device.

 d. dev_conf: Called to configure the actual device when the virtio
device becomes ready.

 e. dev_close: Called to close the actual device when the virtio device
is stopped.

 f. set_vring_state: Called to change the state of the vring in the
actual device when vring state changes.

 g. set_features: Called to set the negotiated features to device.

 h. migration_done: Called to allow the device to respond to RARP
sending.

 i. get_vfio_group_fd: Called to get the VFIO group fd of the device.

 j. get_vfio_device_fd: Called to get the VFIO device fd of the device.

 k. get_notify_area: Called to get the notify area info of the queue.
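
For illustration only, a backend registering itself with this framework could
look like the sketch below; the example_* names are hypothetical, and only the
rte_vdpa_* symbols and structures come from this patch:

#include <rte_common.h>
#include <rte_pci.h>
#include <rte_vdpa.h>

/* Hypothetical capability callbacks of an example backend. */
static int
example_get_queue_num(int did, uint32_t *queue_num)
{
	RTE_SET_USED(did);
	*queue_num = 1;
	return 0;
}

static int
example_get_features(int did, uint64_t *features)
{
	RTE_SET_USED(did);
	*features = 0; /* advertise nothing special in this sketch */
	return 0;
}

static struct rte_vdpa_dev_ops example_ops = {
	.get_queue_num = example_get_queue_num,
	.get_features  = example_get_features,
	/* the remaining ops are left NULL in this sketch */
};

/* Register the backend and return its did, or -1 on failure. */
static int
example_register(struct rte_pci_addr *pci)
{
	struct rte_vdpa_dev_addr addr = {
		.type = PCI_ADDR,
		.pci_addr = *pci,
	};

	return rte_vdpa_register_device(&addr, &example_ops);
}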

Signed-off-by: Zhihong Wang 
---
Changes in v5:

 1. Rename the vDPA device ops to follow convention.

 2. Improve sanity check.

---
Changes in v4:

 1. Remove the "engine" concept in the lib.

---
Changes in v2:

 1. Add VFIO related vDPA device ops.

 lib/librte_vhost/Makefile  |   4 +-
 lib/librte_vhost/rte_vdpa.h|  87 +
 lib/librte_vhost/rte_vhost_version.map |   7 ++
 lib/librte_vhost/vdpa.c| 115 +
 4 files changed, 211 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_vhost/rte_vdpa.h
 create mode 100644 lib/librte_vhost/vdpa.c

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index 5d6c6abae..37044ac03 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -22,9 +22,9 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
-   vhost_user.c virtio_net.c
+   vhost_user.c virtio_net.c vdpa.c
 
 # install includes
-SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h
new file mode 100644
index 0..90465ca26
--- /dev/null
+++ b/lib/librte_vhost/rte_vdpa.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_VDPA_H_
+#define _RTE_VDPA_H_
+
+/**
+ * @file
+ *
+ * Device specific vhost lib
+ */
+
+#include <rte_pci.h>
+#include "rte_vhost.h"
+
+#define MAX_VDPA_NAME_LEN 128
+
+enum vdpa_addr_type {
+   PCI_ADDR,
+   VDPA_ADDR_MAX
+};
+
+struct rte_vdpa_dev_addr {
+   enum vdpa_addr_type type;
+   union {
+   uint8_t __dummy[64];
+   struct rte_pci_addr pci_addr;
+   };
+};
+
+struct rte_vdpa_dev_ops {
+   /* Get capabilities of this device */
+   int (*get_queue_num)(int did, uint32_t *queue_num);
+   int (*get_features)(int did, uint64_t *features);
+   int (*get_protocol_features)(int did, uint64_t *protocol_features);
+
+   /* Driver configure/close the device */
+   int (*dev_conf)(int vid);
+   int (*dev_close)(int vid);
+
+   /* Enable/disable this vring */
+   int (*set_vring_state)(int vid, int vring, int state);
+
+   /* Set features when changed */
+   int (*set_features)(int vid);
+
+   /* Destination operations when migration done */
+   int (*migration_done)(int vid);
+
+   /* Get the vfio group fd */
+   int (*get_vfio_group_fd)(int vid);
+
+   /* Get the vfio device fd */
+   int (*get_vfio_device_fd)(int vid);
+
+   /* Get the notify area info of the queue */
+   int (*get_notify_area)(int vid, int qid,
+   uint64_t *offset, uint64_t *size);
+
+   /* Reserved for future extension */
+   void *reserved[5];
+};
+
+struct rte_vdpa_device {
+   struct rte_vdpa_dev_addr addr;
+   struct rte_vdpa_dev_ops *ops;
+} __rte_cache_aligned;
+
+/* Register a vdpa device, return did if successful, -1 on failure */
+int __rte_experimental
+rte_vdpa_register_device(struct rte_vdpa_dev_addr *addr,
+   struct rte_vdpa_dev_ops *ops);
+
+/* Unregister a vdpa device, return -1 on failure */
+int __rte_experimental
+rte_vdpa_unregister_device(int did);
+
+/* Find did of a vdpa device, return -1 on failure */
+int __rte_experimental
+rte_vdpa_find_device_id(struct rte_vdpa_dev_addr *addr);
+
+/* Find a vdpa

[dpdk-dev] [PATCH v5 4/5] vhost: adapt vhost lib for selective datapath

2018-04-02 Thread Zhihong Wang
This patch adapts vhost lib for selective datapath by calling device ops
at the corresponding stage.

Signed-off-by: Zhihong Wang 
---
Changes in v5:

 1. Improve sanity check.

---
Changes in v4:

 1. Remove the "engine" concept in the lib.

---
Changes in v2:

 1. Ensure negotiated capabilities are supported in vhost-user lib.

 2. Configure the data path at the right time.

 lib/librte_vhost/rte_vhost.h   |  27 
 lib/librte_vhost/rte_vhost_version.map |   2 +
 lib/librte_vhost/socket.c  | 113 +++--
 lib/librte_vhost/vhost.c   |   6 ++
 lib/librte_vhost/vhost.h   |   2 +
 lib/librte_vhost/vhost_user.c  |  70 ++--
 6 files changed, 211 insertions(+), 9 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 8f35167f2..fe0338d00 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -290,6 +290,33 @@ int rte_vhost_driver_disable_features(const char *path, uint64_t features);
 int rte_vhost_driver_get_features(const char *path, uint64_t *features);
 
 /**
+ * Get the protocol feature bits before feature negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param protocol_features
+ *  A pointer to store the queried protocol feature bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_protocol_features(const char *path,
+   uint64_t *protocol_features);
+
+/**
+ * Get the queue number bits before feature negotiation.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param queue_num
+ *  A pointer to store the queried queue number bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
+
+/**
  * Get the feature bits after negotiation
  *
  * @param vid
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index e30285d7f..55e0af7e7 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -69,4 +69,6 @@ EXPERIMENTAL {
rte_vhost_driver_detach_vdpa_device;
rte_vhost_driver_get_vdpa_device_id;
rte_vhost_get_vdpa_device_id;
+   rte_vhost_driver_get_protocol_features;
+   rte_vhost_driver_get_queue_num;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index c26940f7a..9a44f0d9e 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -216,6 +216,8 @@ vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
 
vhost_set_builtin_virtio_net(vid, vsocket->use_builtin_virtio_net);
 
+   vhost_attach_vdpa_device(vid, vsocket->vdpa_dev_id);
+
if (vsocket->dequeue_zero_copy)
vhost_enable_dequeue_zero_copy(vid);
 
@@ -665,20 +667,123 @@ int
 rte_vhost_driver_get_features(const char *path, uint64_t *features)
 {
struct vhost_user_socket *vsocket;
+   uint64_t vdpa_features;
+   struct rte_vdpa_device *vdpa_dev;
+   int did = -1;
+   int ret = 0;
 
pthread_mutex_lock(&vhost_user.mutex);
vsocket = find_vhost_user_socket(path);
-   if (vsocket)
+   if (!vsocket) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "socket file %s is not registered yet.\n", path);
+   ret = -1;
+   goto unlock_exit;
+   }
+
+   did = vsocket->vdpa_dev_id;
+   vdpa_dev = rte_vdpa_get_device(did);
+   if (!vdpa_dev || !vdpa_dev->ops->get_features) {
*features = vsocket->features;
+   goto unlock_exit;
+   }
+
+   if (vdpa_dev->ops->get_features(did, &vdpa_features) < 0) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "failed to get vdpa features "
+   "for socket file %s.\n", path);
+   ret = -1;
+   goto unlock_exit;
+   }
+
+   *features = vsocket->features & vdpa_features;
+
+unlock_exit:
pthread_mutex_unlock(&vhost_user.mutex);
+   return ret;
+}
 
+int
+rte_vhost_driver_get_protocol_features(const char *path,
+   uint64_t *protocol_features)
+{
+   struct vhost_user_socket *vsocket;
+   uint64_t vdpa_protocol_features;
+   struct rte_vdpa_device *vdpa_dev;
+   int did = -1;
+   int ret = 0;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
if (!vsocket) {
RTE_LOG(ERR, VHOST_CONFIG,
"socket file %s is not registered yet.\n", path);
-   return -1;
-   } else {
-   return 0;
+   ret = -1;
+   goto unlock_exit;
}
+
+   did = vsocket->vdpa_dev_id;
+

[dpdk-dev] [PATCH v5 1/5] vhost: export vhost feature definitions

2018-04-02 Thread Zhihong Wang
This patch exports vhost-user protocol features to support device driver
development.
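
For example, a (hypothetical) backend driver can now build the protocol
feature set it advertises directly from these exported macros instead of
duplicating the bit numbers; everything below except the macros themselves
is illustrative:

#include <stdint.h>
#include <rte_vhost.h>

#define EXAMPLE_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
				   (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \
				   (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK))

static int
example_get_protocol_features(int did, uint64_t *features)
{
	(void)did;
	*features = EXAMPLE_PROTOCOL_FEATURES;
	return 0;
}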

Signed-off-by: Zhihong Wang 
Reviewed-by: Maxime Coquelin 
---
Changes in v3:

 1. Keep macro names the same as in the spec.

 lib/librte_vhost/rte_vhost.h  | 30 ++
 lib/librte_vhost/vhost.h  |  2 --
 lib/librte_vhost/vhost_user.h |  7 ---
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d33206997..671ea5053 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -29,6 +29,36 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 
+/** Protocol features. */
+#ifndef VHOST_USER_PROTOCOL_F_MQ
+#define VHOST_USER_PROTOCOL_F_MQ   0
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_RARP
+#define VHOST_USER_PROTOCOL_F_RARP 2
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK3
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_NET_MTU
+#define VHOST_USER_PROTOCOL_F_NET_MTU  4
+#endif
+
+#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
+#define VHOST_USER_PROTOCOL_F_SLAVE_REQ5
+#endif
+
+/** Indicate whether protocol features negotiation is supported. */
+#ifndef VHOST_USER_F_PROTOCOL_FEATURES
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+#endif
+
 /**
  * Information relating to memory regions including offsets to
  * addresses in QEMUs memory file.
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 58aec2e0d..2e28e4026 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -174,8 +174,6 @@ struct vhost_msg {
  #define VIRTIO_F_VERSION_1 32
 #endif
 
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
 /* Features supported by this builtin vhost-user net driver. */
 #define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
(1ULL << VIRTIO_F_ANY_LAYOUT) | \
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 0fafbe6e0..97afabc47 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -14,13 +14,6 @@
 
 #define VHOST_MEMORY_MAX_NREGIONS 8
 
-#define VHOST_USER_PROTOCOL_F_MQ   0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1
-#define VHOST_USER_PROTOCOL_F_RARP 2
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK3
-#define VHOST_USER_PROTOCOL_F_NET_MTU 4
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
-
 #define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
                                        (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
                                        (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
-- 
2.13.6



[dpdk-dev] [PATCH v5 3/5] vhost: add apis for datapath configuration

2018-04-02 Thread Zhihong Wang
This patch adds APIs for datapath configuration.

The did of the vhost-user socket can be set to identify the backend device;
in this case each vhost-user socket can have only 1 connection. The did is
set to -1 by default when the software datapath is used.
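
A minimal sketch of the renamed APIs, assuming did was returned by
rte_vdpa_register_device() and the socket path is arbitrary:

#include <rte_vhost.h>

static int
example_bind_socket(const char *path, int did)
{
	if (rte_vhost_driver_register(path, 0) < 0)
		return -1;

	/* Attach the backend; without this the socket keeps using the
	 * default software datapath (did == -1).
	 */
	if (rte_vhost_driver_attach_vdpa_device(path, did) < 0)
		return -1;

	return rte_vhost_driver_get_vdpa_device_id(path) == did ? 0 : -1;
}

static void
example_unbind_socket(const char *path)
{
	/* Fall back to the software datapath. */
	rte_vhost_driver_detach_vdpa_device(path);
}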

Signed-off-by: Zhihong Wang 
---
Changes in v5:

 1. Rename APIs for better description.

 2. Improve sanity check.

---
Changes in v4:

 1. Remove the "engine" concept in the lib.

 lib/librte_vhost/rte_vhost.h   | 46 +
 lib/librte_vhost/rte_vhost_version.map |  4 +++
 lib/librte_vhost/socket.c  | 53 ++
 lib/librte_vhost/vhost.c   | 36 +++
 lib/librte_vhost/vhost.h   | 10 +++
 5 files changed, 149 insertions(+)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 671ea5053..8f35167f2 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -200,6 +200,41 @@ int rte_vhost_driver_register(const char *path, uint64_t 
flags);
 int rte_vhost_driver_unregister(const char *path);
 
 /**
+ * Set the vdpa device id, enforce single connection per socket
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param did
+ *  Device id
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_attach_vdpa_device(const char *path, int did);
+
+/**
+ * Unset the vdpa device id
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_detach_vdpa_device(const char *path);
+
+/**
+ * Get the device id
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  Device id, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_driver_get_vdpa_device_id(const char *path);
+
+/**
  * Set the feature bits the vhost-user driver supports.
  *
  * @param path
@@ -464,6 +499,17 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx);
  */
 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
 
+/**
+ * Get vdpa device id for vhost device.
+ *
+ * @param vid
+ *  vhost device id
+ * @return
+ *  device id
+ */
+int __rte_experimental
+rte_vhost_get_vdpa_device_id(int vid);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index d3453a2a7..e30285d7f 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -65,4 +65,8 @@ EXPERIMENTAL {
rte_vdpa_unregister_device;
rte_vdpa_find_device_id;
rte_vdpa_get_device;
+   rte_vhost_driver_attach_vdpa_device;
+   rte_vhost_driver_detach_vdpa_device;
+   rte_vhost_driver_get_vdpa_device_id;
+   rte_vhost_get_vdpa_device_id;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index cfc31e179..c26940f7a 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -52,6 +52,13 @@ struct vhost_user_socket {
uint64_t supported_features;
uint64_t features;
 
+   /*
+* Device id to identify a specific backend device.
+* It's set to -1 for the default software implementation.
+* If valid, one socket can have 1 connection only.
+*/
+   int vdpa_dev_id;
+
struct vhost_device_ops const *notify_ops;
 };
 
@@ -545,6 +552,52 @@ find_vhost_user_socket(const char *path)
 }
 
 int
+rte_vhost_driver_attach_vdpa_device(const char *path, int did)
+{
+   struct vhost_user_socket *vsocket;
+
+   if (rte_vdpa_get_device(did) == NULL)
+   return -1;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   vsocket->vdpa_dev_id = did;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_detach_vdpa_device(const char *path)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   vsocket->vdpa_dev_id = -1;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_vdpa_device_id(const char *path)
+{
+   struct vhost_user_socket *vsocket;
+   int did = -1;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   did = vsocket->vdpa_dev_id;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return did;
+}
+
+int
 rte_vhost_driver_disable_features(const char *path, uint64_t features)
 {
struct vhost_user_socket *vsocket;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index f6f12a03b..16b0f9a6f 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -283,6 +283,7 @@ vhost_ne

[dpdk-dev] [PATCH v5 0/5] vhost: support selective datapath

2018-04-02 Thread Zhihong Wang
This patch set introduces support for selective datapath in DPDK vhost-user
lib. vDPA stands for vhost Data Path Acceleration. The idea is to support
virtio ring compatible devices to serve virtio driver directly to enable
datapath acceleration.

Design details


A vDPA device is a virtio ring compatible backend. It's defined as follows:

struct rte_vdpa_dev_addr {
enum vdpa_addr_type type;
union {
uint8_t __dummy[64];
struct rte_pci_addr pci_addr;
};
};

struct rte_vdpa_dev_ops {
/* Get capabilities of this device */
int (*get_queue_num)(int did, uint32_t *queue_num);
int (*get_features)(int did, uint64_t *features);
int (*get_protocol_features)(int did, uint64_t *protocol_features);

/* Driver configure/close the device */
int (*dev_conf)(int vid);
int (*dev_close)(int vid);

/* Enable/disable this vring */
int (*set_vring_state)(int vid, int vring, int state);

/* Set features when changed */
int (*set_features)(int vid);

/* Destination operations when migration done */
int (*migration_done)(int vid);

/* Get the vfio group fd */
int (*get_vfio_group_fd)(int vid);

/* Get the vfio device fd */
int (*get_vfio_device_fd)(int vid);

/* Get the notify area info of the queue */
int (*get_notify_area)(int vid, int qid,
uint64_t *offset, uint64_t *size);

/* Reserved for future extension */
void *reserved[5];
};

struct rte_vdpa_device {
struct rte_vdpa_dev_addr addr;
struct rte_vdpa_dev_ops *ops;
} __rte_cache_aligned;


Changes to the current vhost-user lib are:


 1. Make vhost device capabilities configurable to adopt various devices.
Such capabilities include supported features, protocol features, and
queue number. APIs are introduced to configure these capabilities.

 2. A set of device ops is defined for device specific operations:

 a. get_queue_num: Called to get supported queue number of the device.

 b. get_features: Called to get supported features of the device.

 c. get_protocol_features: Called to get supported protocol features of
the device.

 d. dev_conf: Called to configure the actual device when the virtio
device becomes ready.

 e. dev_close: Called to close the actual device when the virtio device
is stopped.

 f. set_vring_state: Called to change the state of the vring in the
actual device when vring state changes.

 g. set_features: Called to set the negotiated features to device.

 h. migration_done: Called to allow the device to respond to RARP
sending.

 i. get_vfio_group_fd: Called to get the VFIO group fd of the device.

 j. get_vfio_device_fd: Called to get the VFIO device fd of the device.

 k. get_notify_area: Called to get the notify area info of the queue.

 3. To make vhost aware of its datapath type, a device id (did) is added
into the vhost data structure to identify the actual device. APIs are
introduced to configure it. When the default software datapath is used,
did is set to -1. When an alternative datapath is used, did should be set
to specify which device to use, and each vhost-user socket can have
only 1 connection in this case.

Working process:


 1. Register vDPA device with address and device ops, and get a did.

 2. For vhost device creation:

  a. Register vhost-user socket.

  b. Set did of the vhost-user socket.

  c. Register vhost-user callbacks.

  d. Start to wait for connection.

 3. When a connection comes and the virtio device data structure is negotiated,
the device will be configured with all needed info.
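
For illustration only (not part of the patch set), a minimal sketch of the
working process above using the v5 API names. rte_vdpa_register_device() is
assumed from step 1, and the vhost_device_ops callbacks are empty stubs; a
real application would fill them in.

#include <rte_vhost.h>
#include <rte_vdpa.h>

static int new_device_cb(int vid) { return 0; }    /* driver hooks go here */
static void destroy_device_cb(int vid) { }

static const struct vhost_device_ops notify_ops = {
        .new_device     = new_device_cb,
        .destroy_device = destroy_device_cb,
};

static int
setup_vdpa_socket(const char *path, struct rte_vdpa_dev_addr *addr,
                struct rte_vdpa_dev_ops *ops)
{
        /* 1. Register the vDPA device with address and ops, get a did. */
        int did = rte_vdpa_register_device(addr, ops);

        if (did < 0)
                return -1;

        /* 2a. Register the vhost-user socket. */
        if (rte_vhost_driver_register(path, 0) < 0)
                return -1;

        /* 2b. Set the did of the vhost-user socket. */
        if (rte_vhost_driver_attach_vdpa_device(path, did) < 0)
                return -1;

        /* 2c. Register vhost-user callbacks. */
        if (rte_vhost_driver_callback_register(path, &notify_ops) < 0)
                return -1;

        /* 2d. Start to wait for connection. */
        return rte_vhost_driver_start(path);
}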

---
Changes in v5:

 1. Rename the vDPA device ops to follow convention.

 2. Rename APIs for better description.

 3. Improve sanity check.

---
Changes in v4:

 1. Remove the "engine" concept in the lib.

---
Changes in v3:

 1. Keep macro names the same as in the spec.

 2. Export new APIs where they're introduced.

---
Changes in v2:

 1. Ensure negotiated capabilities are supported in vhost-user lib.

 2. Add APIs for live migration.

 3. Configure the data path at the right time.

 4. Add VFIO related vDPA device ops.

 5. Rebase on dpdk-next-virtio.

Zhihong Wang (5):
  vhost: export vhost feature definitions
  vhost: support selective datapath
  vhost: add apis for datapath configuration
  vhost: adapt vhost lib for selective datapath
  vhost: add apis for live migration

 lib/librte_vhost/Makefile  |   4 +-
 lib/librte_vhost/rte_vdpa.h|  87 +
 lib/librte_vhost/rte_vhost.h   | 154 ++
 lib/librte_vhost/rte_vhost_version.map |  16 
 lib/librte_vhost/socket.c  | 166 -
 lib/librte_vhost/vdpa.c 

[dpdk-dev] [PATCH v5 5/5] vhost: add apis for live migration

2018-04-02 Thread Zhihong Wang
This patch adds APIs to enable live migration for non-builtin data paths.

At the source side, the last_avail/used_idx values from the device need to be
set into the virtio_net structure, and the log_base and log_size from the
virtio_net structure need to be set into the device.

At the destination side, last_avail/used_idx need to be read from the
virtio_net structure and set into the device.
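
As an illustration (not part of the patch), a sketch of how a vDPA driver
might use these APIs on both sides. The program_hw_*() and
read_hw_vring_base() helpers are hypothetical driver internals.

#include <stdint.h>
#include <rte_vhost.h>

/* Hypothetical driver helpers. */
void program_hw_dirty_logging(int vid, uint64_t base, uint64_t size);
void program_hw_vring_base(int vid, uint16_t qid, uint16_t avail, uint16_t used);
void read_hw_vring_base(int vid, uint16_t qid, uint16_t *avail, uint16_t *used);

/* Source side: push log base/size into the device, save the ring indices. */
static int
src_migration_setup(int vid, uint16_t qid)
{
        uint64_t log_base, log_size;
        uint16_t avail, used;

        if (rte_vhost_get_log_base(vid, &log_base, &log_size) < 0)
                return -1;
        program_hw_dirty_logging(vid, log_base, log_size);

        read_hw_vring_base(vid, qid, &avail, &used);
        return rte_vhost_set_vring_base(vid, qid, avail, used);
}

/* Destination side: restore the ring indices into the device. */
static int
dst_migration_restore(int vid, uint16_t qid)
{
        uint16_t avail, used;

        if (rte_vhost_get_vring_base(vid, qid, &avail, &used) < 0)
                return -1;
        program_hw_vring_base(vid, qid, avail, used);
        return 0;
}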

Signed-off-by: Zhihong Wang 
Reviewed-by: Maxime Coquelin 
---
 lib/librte_vhost/rte_vhost.h   | 51 +++
 lib/librte_vhost/rte_vhost_version.map |  3 ++
 lib/librte_vhost/vhost.c   | 63 ++
 3 files changed, 117 insertions(+)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index fe0338d00..807da8c4b 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -527,6 +527,57 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx);
 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
 
 /**
+ * Get log base and log size of the vhost device
+ *
+ * @param vid
+ *  vhost device ID
+ * @param log_base
+ *  vhost log base
+ * @param log_size
+ *  vhost log size
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
+
+/**
+ * Get last_avail/used_idx of the vhost virtqueue
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  vhost queue index
+ * @param last_avail_idx
+ *  vhost last_avail_idx to get
+ * @param last_used_idx
+ *  vhost last_used_idx to get
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+   uint16_t *last_avail_idx, uint16_t *last_used_idx);
+
+/**
+ * Set last_avail/used_idx of the vhost virtqueue
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  vhost queue index
+ * @param last_avail_idx
+ *  last_avail_idx to set
+ * @param last_used_idx
+ *  last_used_idx to set
+ * @return
+ *  0 on success, -1 on failure
+ */
+int __rte_experimental
+rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+   uint16_t last_avail_idx, uint16_t last_used_idx);
+
+/**
  * Get vdpa device id for vhost device.
  *
  * @param vid
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 55e0af7e7..9d3049342 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -71,4 +71,7 @@ EXPERIMENTAL {
rte_vhost_get_vdpa_device_id;
rte_vhost_driver_get_protocol_features;
rte_vhost_driver_get_queue_num;
+   rte_vhost_get_log_base;
+   rte_vhost_get_vring_base;
+   rte_vhost_set_vring_base;
 } DPDK_18.02;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index b1afd693a..1f17cdd75 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -656,3 +656,66 @@ int rte_vhost_get_vdpa_device_id(int vid)
 
return dev->vdpa_dev_id;
 }
+
+int rte_vhost_get_log_base(int vid, uint64_t *log_base,
+   uint64_t *log_size)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   *log_base = dev->log_base;
+   *log_size = dev->log_size;
+
+   return 0;
+}
+
+int rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+   uint16_t *last_avail_idx, uint16_t *last_used_idx)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   *last_avail_idx = dev->virtqueue[queue_id]->last_avail_idx;
+   *last_used_idx = dev->virtqueue[queue_id]->last_used_idx;
+
+   return 0;
+}
+
+int rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+   uint16_t last_avail_idx, uint16_t last_used_idx)
+{
+   struct virtio_net *dev = get_device(vid);
+
+   if (!dev)
+   return -1;
+
+   if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+   RTE_LOG(ERR, VHOST_DATA,
+   "(%d) %s: built-in vhost net backend is disabled.\n",
+   dev->vid, __func__);
+   return -1;
+   }
+
+   dev->virtqueue[queue_id]->last_avail_idx = last_avail_idx;
+   dev->virtqueue[queue_id]->last_used_idx = last_used_idx;
+
+   return 0;
+}
-- 
2.13.6



[dpdk-dev] [PATCH] vhost: fix ANY_LAYOUT definition for old kernels

2018-01-30 Thread Zhihong Wang
This patch fixes a compile failure with old kernels which do not have
VIRTIO_F_ANY_LAYOUT defined.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/vhost.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index ba80584..646aad3 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -130,6 +130,10 @@ struct vhost_virtqueue {
  #define VIRTIO_NET_F_MTU 3
 #endif
 
+#ifndef VIRTIO_F_ANY_LAYOUT
+ #define VIRTIO_F_ANY_LAYOUT   27
+#endif
+
 /* Declare IOMMU related bits for older kernels */
 #ifndef VIRTIO_F_IOMMU_PLATFORM
 
-- 
2.7.5



[dpdk-dev] [PATCH 0/7] vhost: support selective datapath

2018-02-02 Thread Zhihong Wang
This patch set introduces support for selective datapath in DPDK vhost-user
lib. vDPA stands for vhost Data Path Acceleration. The idea is to let
various types of virtio-compatible devices do data transfer directly with
the virtio driver to achieve acceleration.

The default datapath is the existing software implementation; more options
will become available as new engines are added.

Design details


An engine is a group of virtio-compatible devices. The definition of engine
is as follows:

struct rte_vdpa_eng_addr {
union {
uint8_t __dummy[64];

struct {
struct rte_pci_addr pci_addr;
};
};
};

struct rte_vdpa_eng_info {
char name[MAX_VDPA_NAME_LEN];
struct rte_vdpa_eng_addr *addr;
};

struct rte_vdpa_dev_ops {
vdpa_dev_conf_tdev_conf;
vdpa_dev_close_t   dev_close;
vdpa_vring_state_set_t vring_state_set;
vdpa_feature_set_t feature_set;
vdpa_migration_done_t  migration_done;
};

struct rte_vdpa_eng_ops {
vdpa_eng_init_t   eng_init;
vdpa_eng_uninit_t eng_uninit;
vdpa_info_query_t info_query;
};

struct rte_vdpa_eng_driver {
const char *name;
struct rte_vdpa_eng_ops eng_ops;
struct rte_vdpa_dev_ops dev_ops;
} __rte_cache_aligned;

struct rte_vdpa_engine {
struct rte_vdpa_eng_infoeng_info;
struct rte_vdpa_eng_driver *eng_drv;
} __rte_cache_aligned;

A set of engine ops is defined in rte_vdpa_eng_ops for engine init, uninit,
and attributes reporting. The attributes are defined as follows:

struct rte_vdpa_eng_attr {
uint64_t features;
uint64_t protocol_features;
uint32_t queue_num;
uint32_t dev_num;
};

A set of device ops is defined in rte_vdpa_dev_ops for each virtio device
in the engine to do device specific operations.

Changes to the current vhost-user lib are:


 1. Make vhost device capabilities configurable to adopt various engines.
Such capabilities include supported features, protocol features, and
queue number. APIs are introduced to let the app configure these capabilities.

 2. In addition to the existing vhost framework, a set of callbacks is
added for vhost to call the driver for device operations at the right
time:

 a. dev_conf: Called to configure the actual device when the virtio
device becomes ready.

 b. dev_close: Called to close the actual device when the virtio device
is stopped.

 c. vring_state_set: Called to change the state of the vring in the
actual device when vring state changes.

 d. feature_set: Called to set the negotiated features to device.

 e. migration_done: Called to allow the device to respond to RARP
sending.

 3. To make vhost aware of its own type, an engine id (eid) and a device
id (did) are added into the vhost data structure to identify the actual
device. APIs are introduced to let the app configure them. When the default
software datapath is used, eid and did are set to -1. When an alternative
datapath is used, eid and did are set by the app to specify which device to
use. Each vhost-user socket can have only 1 connection in this case.

Working process:


 1. Register driver during DPDK initialization.

 2. Register engine with driver name and address.

 3. Get engine attributes.

 4. For vhost device creation:

  a. Register vhost-user socket.

  b. Set eid and did of the vhost-user socket.

  c. Set attributes of the vhost-user socket.

  d. Register vhost-user callbacks.

  e. Start to wait for connection.

 5. When a connection comes and the virtio device data structure is negotiated,
configure the device with all needed info.
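
For illustration only (not part of the patch set), the working process above
in code, using the API names from this series. Step 1 (driver registration) is
assumed to happen at init time via RTE_VDPA_REGISTER_DRIVER (see patch 7/7);
the driver name and did value below are placeholders, and the socket
registration calls are the pre-existing vhost-user APIs.

#include <rte_vhost.h>
#include <rte_vdpa.h>

static int
setup_engine_socket(const char *path, struct rte_vdpa_eng_addr *addr,
                const struct vhost_device_ops *notify_ops)
{
        struct rte_vdpa_eng_attr attr;
        int eid;
        int did = 0;    /* which device inside the engine to use */

        /* 2. Register engine with driver name and address. */
        eid = rte_vdpa_register_engine("sample_vdpa", addr);
        if (eid < 0)
                return -1;

        /* 3. Get engine attributes. */
        if (rte_vdpa_info_query(eid, &attr) < 0)
                return -1;

        /* 4a-4c. Register the socket, set its eid/did and capabilities. */
        if (rte_vhost_driver_register(path, 0) < 0 ||
            rte_vhost_driver_set_vdpa_eid(path, eid) < 0 ||
            rte_vhost_driver_set_vdpa_did(path, did) < 0 ||
            rte_vhost_driver_set_features(path, attr.features) < 0 ||
            rte_vhost_driver_set_protocol_features(path,
                        attr.protocol_features) < 0 ||
            rte_vhost_driver_set_queue_num(path,
                        (uint16_t)attr.queue_num) < 0)
                return -1;

        /* 4d-4e. Register callbacks and start waiting for connections. */
        if (rte_vhost_driver_callback_register(path, notify_ops) < 0)
                return -1;
        return rte_vhost_driver_start(path);
}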

Zhihong Wang (7):
  vhost: make capabilities configurable
  vhost: expose vhost feature definitions
  vhost: support selective datapath
  vhost: add apis for datapath configuration
  vhost: adapt vhost lib for selective datapath
  vhost: get callfd before device setup
  vhost: expose new apis

 lib/librte_vhost/Makefile  |   4 +-
 lib/librte_vhost/rte_vdpa.h| 119 +++
 lib/librte_vhost/rte_vhost.h   | 136 +++
 lib/librte_vhost/rte_vhost_version.map |  18 
 lib/librte_vhost/socket.c  | 145 +
 lib/librte_vhost/vdpa.c| 125 
 lib/librte_vhost/vhost.c   |  49 +++
 lib/librte_vhost/vhost.h   |  14 +++-
 lib/librte_vhost/vhost_user.c  | 108 +++-
 lib/librte_vhost/vhost_user.h  |  20 ++---
 10 files changed, 700 insertions(+), 38 deletions(-)
 create mode 100644 lib/librte_vhost/rte_vdpa.h
 create mode 100644 lib/librte_vhost/vdpa.c

-- 
2.7.5



[dpdk-dev] [PATCH 1/7] vhost: make capabilities configurable

2018-02-02 Thread Zhihong Wang
This patch makes vhost device capabilities configurable to adopt new
devices, since different devices may have different capabilities, like
different combinations of supported features, or a different number of
queues. APIs are introduced to let the app configure these capabilities.
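
A small illustration (not part of the patch): reading back what has been
configured, using the get APIs added below together with the existing
rte_vhost_driver_get_features().

#include <inttypes.h>
#include <stdio.h>
#include <rte_vhost.h>

static void
dump_socket_caps(const char *path)
{
        uint64_t features = 0, proto = 0;
        uint16_t qnum = 0;

        if (rte_vhost_driver_get_features(path, &features) == 0 &&
            rte_vhost_driver_get_protocol_features(path, &proto) == 0 &&
            rte_vhost_driver_get_queue_num(path, &qnum) == 0)
                printf("%s: features=0x%" PRIx64 " protocol=0x%" PRIx64
                       " queues=%u\n", path, features, proto, qnum);
}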

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/rte_vhost.h  | 50 
 lib/librte_vhost/socket.c | 77 +++
 lib/librte_vhost/vhost_user.c | 48 ---
 3 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d332069..12cf48f 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -182,6 +182,56 @@ int rte_vhost_driver_unregister(const char *path);
 int rte_vhost_driver_set_features(const char *path, uint64_t features);
 
 /**
+ * Get the protocol feature bits.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param protocol_features
+ *  A pointer to store the queried protocol feature bits
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_get_protocol_features(const char *path,
+   uint64_t *protocol_features);
+
+/**
+ * Set the protocol feature bits the vhost-user driver supports.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param protocol_features
+ *  Supported protocol features
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_protocol_features(const char *path,
+   uint64_t protocol_features);
+
+/**
+ * Get the queue number.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param queue_num
+ *  A pointer to store the queried queue number
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_get_queue_num(const char *path, uint16_t *queue_num);
+
+/**
+ * Set the queue number the vhost-user driver supports.
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param queue_num
+ *  Supported queue number
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_queue_num(const char *path, uint16_t queue_num);
+
+/**
  * Enable vhost-user driver features.
  *
  * Note that
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 6e3857e..e1d0036 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -49,7 +49,10 @@ struct vhost_user_socket {
 * features negotiation.
 */
uint64_t supported_features;
+   uint64_t supported_protocol_features;
uint64_t features;
+   uint64_t protocol_features;
+   uint16_t queue_num;
 
struct vhost_device_ops const *notify_ops;
 };
@@ -593,6 +596,75 @@ rte_vhost_driver_get_features(const char *path, uint64_t 
*features)
}
 }
 
+int rte_vhost_driver_set_protocol_features(const char *path,
+   uint64_t protocol_features)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket) {
+   vsocket->supported_protocol_features = protocol_features;
+   vsocket->protocol_features = protocol_features;
+   }
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_protocol_features(const char *path,
+   uint64_t *protocol_features)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   *protocol_features = vsocket->protocol_features;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   if (!vsocket) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "socket file %s is not registered yet.\n", path);
+   return -1;
+   } else {
+   return 0;
+   }
+}
+
+int rte_vhost_driver_set_queue_num(const char *path, uint16_t queue_num)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   vsocket->queue_num = queue_num;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int rte_vhost_driver_get_queue_num(const char *path, uint16_t *queue_num)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   *queue_num = vsocket->queue_num;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   if (!vsocket) {
+   RTE_LOG(ERR, VHOST_CONFIG,
+   "socket file %s is not registered yet.\n", path);
+   return -1;
+   } else {
+   return 0;
+   }
+}
+
 /*
  * Register a new vhost-user socket; 

[dpdk-dev] [PATCH 3/7] vhost: support selective datapath

2018-02-02 Thread Zhihong Wang
This patch introduces support for selective datapath in DPDK vhost-user lib
to let various types of virtio-compatible devices do data transfer directly
with the virtio driver to achieve acceleration. The default datapath is
the existing software implementation; more options will become available when
new engines are registered.

An engine is a group of virtio-compatible devices under a single address.
The engine driver includes:

 1. A set of engine ops is defined in rte_vdpa_eng_ops to perform engine
init, uninit, and attributes reporting.

 2. A set of device ops is defined in rte_vdpa_dev_ops for virtio devices
in the engine to do device specific operations:

 a. dev_conf: Called to configure the actual device when the virtio
device becomes ready.

 b. dev_close: Called to close the actual device when the virtio device
is stopped.

 c. vring_state_set: Called to change the state of the vring in the
actual device when vring state changes.

 d. feature_set: Called to set the negotiated features to device.

 e. migration_done: Called to allow the device to respond to RARP
sending.
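
For illustration only (not part of the patch), a skeleton engine driver built
from the structures in rte_vdpa.h below. All callbacks are stubs, and the
registration line assumes the RTE_VDPA_REGISTER_DRIVER macro exported in
patch 7/7 takes the driver structure by name.

#include <rte_vdpa.h>

static int sample_eng_init(int eid, struct rte_vdpa_eng_addr *addr) { return 0; }
static int sample_eng_uninit(int eid) { return 0; }
static int sample_info_query(int eid, struct rte_vdpa_eng_attr *attr) { return 0; }

static int sample_dev_conf(int vid) { return 0; }
static int sample_dev_close(int vid) { return 0; }
static int sample_vring_state_set(int vid, int vring, int state) { return 0; }
static int sample_feature_set(int vid) { return 0; }
static int sample_migration_done(int vid) { return 0; }

static struct rte_vdpa_eng_driver sample_vdpa_driver = {
        .name = "sample_vdpa",
        .eng_ops = {
                .eng_init   = sample_eng_init,
                .eng_uninit = sample_eng_uninit,
                .info_query = sample_info_query,
        },
        .dev_ops = {
                .dev_conf        = sample_dev_conf,
                .dev_close       = sample_dev_close,
                .vring_state_set = sample_vring_state_set,
                .feature_set     = sample_feature_set,
                .migration_done  = sample_migration_done,
        },
};

RTE_VDPA_REGISTER_DRIVER(sample_vdpa, sample_vdpa_driver);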

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/Makefile   |   4 +-
 lib/librte_vhost/rte_vdpa.h | 113 +++
 lib/librte_vhost/vdpa.c | 125 
 3 files changed, 240 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_vhost/rte_vdpa.h
 create mode 100644 lib/librte_vhost/vdpa.c

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index 5d6c6ab..37044ac 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -22,9 +22,9 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev 
-lrte_net
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
-   vhost_user.c virtio_net.c
+   vhost_user.c virtio_net.c vdpa.c
 
 # install includes
-SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h
new file mode 100644
index 000..729849b
--- /dev/null
+++ b/lib/librte_vhost/rte_vdpa.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_VDPA_H_
+#define _RTE_VDPA_H_
+
+/**
+ * @file
+ *
+ * Device specific vhost lib
+ */
+
+#include 
+#include 
+#include "rte_vhost.h"
+
+#define MAX_VDPA_ENGINE_NUM 128
+#define MAX_VDPA_NAME_LEN 128
+
+
+struct rte_vdpa_eng_addr {
+   union {
+   uint8_t __dummy[64];
+
+   struct {
+   struct rte_pci_addr pci_addr;
+   };
+   };
+};
+
+struct rte_vdpa_eng_info {
+   char name[MAX_VDPA_NAME_LEN];
+   struct rte_vdpa_eng_addr *addr;
+};
+
+struct rte_vdpa_eng_attr {
+   uint64_t features;
+   uint64_t protocol_features;
+   uint32_t queue_num;
+   uint32_t dev_num;
+};
+
+/* register/remove engine */
+typedef int (*vdpa_eng_init_t)(int eid, struct rte_vdpa_eng_addr *addr);
+typedef int (*vdpa_eng_uninit_t)(int eid);
+
+/* query info of this engine */
+typedef int (*vdpa_info_query_t)(int eid,
+   struct rte_vdpa_eng_attr *attr);
+
+/* driver configure/close the port based on connection */
+typedef int (*vdpa_dev_conf_t)(int vid);
+typedef int (*vdpa_dev_close_t)(int vid);
+
+/* enable/disable this vring */
+typedef int (*vdpa_vring_state_set_t)(int vid, int vring, int state);
+
+/* set features when changed */
+typedef int (*vdpa_feature_set_t)(int vid);
+
+/* destination operations when migration done, e.g. send rarp */
+typedef int (*vdpa_migration_done_t)(int vid);
+
+/* device ops */
+struct rte_vdpa_dev_ops {
+   vdpa_dev_conf_tdev_conf;
+   vdpa_dev_close_t   dev_close;
+   vdpa_vring_state_set_t vring_state_set;
+   vdpa_feature_set_t feature_set;
+   vdpa_migration_done_t  migration_done;
+};
+
+/* engine ops */
+struct rte_vdpa_eng_ops {
+   vdpa_eng_init_t eng_init;
+   vdpa_eng_uninit_t eng_uninit;
+   vdpa_info_query_t info_query;
+};
+
+struct rte_vdpa_eng_driver {
+   const char *name;
+   struct rte_vdpa_eng_ops eng_ops;
+   struct rte_vdpa_dev_ops dev_ops;
+} __rte_cache_aligned;
+
+struct rte_vdpa_engine {
+   struct rte_vdpa_eng_info eng_info;
+   struct rte_vdpa_eng_driver *eng_drv;
+} __rte_cache_aligned;
+
+extern struct rte_vdpa_engine *vdpa_engines[];
+extern uint32_t vdpa_engine_num;
+
+/* engine management */
+int rte_vdpa_register_engine(const char *name, struct rte_vdpa_eng_addr *addr);
+int rte_vdpa_unregister_engine(int eid);
+
+int rte_vdpa_find_engine_id(struct rte_vdpa_eng_addr *addr);
+
+int rte_vdpa_info_query(int eid, struct rte_vdpa_eng_

[dpdk-dev] [PATCH 2/7] vhost: export vhost feature definitions

2018-02-02 Thread Zhihong Wang
This patch exports vhost-user protocol features to support device driver
development.
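
For illustration only (not part of the patch), building a protocol feature
mask from the exported definitions and applying it with the set API from
patch 1/7.

#include <rte_vhost.h>

static int
set_proto_features(const char *path)
{
        uint64_t proto = (1ULL << RTE_VHOST_USER_PROTOCOL_F_MQ) |
                         (1ULL << RTE_VHOST_USER_PROTOCOL_F_LOG_SHMFD) |
                         (1ULL << RTE_VHOST_USER_PROTOCOL_F_RARP) |
                         (1ULL << RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK);

        return rte_vhost_driver_set_protocol_features(path, proto);
}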

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/rte_vhost.h  |  8 
 lib/librte_vhost/vhost.h  |  4 +---
 lib/librte_vhost/vhost_user.c |  9 +
 lib/librte_vhost/vhost_user.h | 20 +++-
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 12cf48f..6c92580 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -29,6 +29,14 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 
+#define RTE_VHOST_USER_PROTOCOL_F_MQ   0
+#define RTE_VHOST_USER_PROTOCOL_F_LOG_SHMFD1
+#define RTE_VHOST_USER_PROTOCOL_F_RARP 2
+#define RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK3
+#define RTE_VHOST_USER_PROTOCOL_F_NET_MTU  4
+#define RTE_VHOST_USER_PROTOCOL_F_SLAVE_REQ5
+#define RTE_VHOST_USER_F_PROTOCOL_FEATURES 30
+
 /**
  * Information relating to memory regions including offsets to
  * addresses in QEMUs memory file.
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 646aad3..09a745d 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -172,8 +172,6 @@ struct vhost_msg {
  #define VIRTIO_F_VERSION_1 32
 #endif
 
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
 /* Features supported by this builtin vhost-user net driver. */
 #define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
(1ULL << VIRTIO_F_ANY_LAYOUT) | \
@@ -183,7 +181,7 @@ struct vhost_msg {
(1ULL << VIRTIO_NET_F_MQ)  | \
(1ULL << VIRTIO_F_VERSION_1)   | \
(1ULL << VHOST_F_LOG_ALL)  | \
-   (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+   (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES) | \
(1ULL << VIRTIO_NET_F_GSO) | \
(1ULL << VIRTIO_NET_F_HOST_TSO4) | \
(1ULL << VIRTIO_NET_F_HOST_TSO6) | \
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 87ba267..b1762e6 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -510,7 +510,7 @@ vhost_user_set_vring_addr(struct virtio_net **pdev, 
VhostUserMsg *msg)
vring_invalidate(dev, vq);
 
if (vq->enabled && (dev->features &
-   (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
+   (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES))) {
dev = translate_ring_addresses(dev, msg->payload.state.index);
if (!dev)
return -1;
@@ -847,11 +847,11 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, 
struct VhostUserMsg *pmsg)
vq = dev->virtqueue[file.index];
 
/*
-* When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
+* When RTE_VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
 * the ring starts already enabled. Otherwise, it is enabled via
 * the SET_VRING_ENABLE message.
 */
-   if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)))
+   if (!(dev->features & (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES)))
vq->enabled = 1;
 
if (vq->kickfd >= 0)
@@ -961,7 +961,8 @@ vhost_user_get_protocol_features(struct virtio_net *dev)
 * Qemu versions (from v2.7.0 to v2.9.0).
 */
if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
-   protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK);
+   protocol_features &=
+   ~(1ULL << RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK);
 
return protocol_features;
 }
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index d4bd604..58e475d 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -14,19 +14,13 @@
 
 #define VHOST_MEMORY_MAX_NREGIONS 8
 
-#define VHOST_USER_PROTOCOL_F_MQ   0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1
-#define VHOST_USER_PROTOCOL_F_RARP 2
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK3
-#define VHOST_USER_PROTOCOL_F_NET_MTU 4
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
-
-#define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
-(1ULL << 
VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
-(1ULL << VHOST_USER_PROTOCOL_F_RARP) | 
\
-(1ULL << 
VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
-

[dpdk-dev] [PATCH 4/7] vhost: add apis for datapath configuration

2018-02-02 Thread Zhihong Wang
This patch adds APIs for datapath configuration. The eid and did of the
vhost-user socket can be configured to identify the actual device.

When the default software datapath is used, eid and did are set to -1.
When an alternative datapath is used, eid and did are set by the app to
specify which device to use. Each vhost-user socket can have only 1
connection in this case.
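
For illustration only (not part of the patch), a new_device callback that
looks up the eid/did configured for the connected vhost device.
configure_hw_datapath() is a hypothetical driver helper.

#include <rte_vhost.h>

int configure_hw_datapath(int eid, int did, int vid);   /* hypothetical */

static int
new_device(int vid)
{
        int eid = rte_vhost_get_vdpa_eid(vid);
        int did = rte_vhost_get_vdpa_did(vid);

        if (eid < 0 || did < 0)
                return 0;       /* default software datapath */

        return configure_hw_datapath(eid, did, vid);
}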

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/rte_vhost.h | 68 
 lib/librte_vhost/socket.c| 65 ++
 lib/librte_vhost/vhost.c | 44 
 lib/librte_vhost/vhost.h | 10 +++
 4 files changed, 187 insertions(+)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 6c92580..03f4ed1 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -178,6 +178,50 @@ int rte_vhost_driver_register(const char *path, uint64_t 
flags);
 int rte_vhost_driver_unregister(const char *path);
 
 /**
+ * Set the engine id, enforce single connection per socket
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param eid
+ *  Engine id
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_vdpa_eid(const char *path, int eid);
+
+/**
+ * Set the device id, enforce single connection per socket
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @param did
+ *  Device id
+ * @return
+ *  0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_vdpa_did(const char *path, int did);
+
+/**
+ * Get the engine id
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  Engine id, -1 on failure
+ */
+int rte_vhost_driver_get_vdpa_eid(const char *path);
+
+/**
+ * Get the device id
+ *
+ * @param path
+ *  The vhost-user socket file path
+ * @return
+ *  Device id, -1 on failure
+ */
+int rte_vhost_driver_get_vdpa_did(const char *path);
+
+/**
  * Set the feature bits the vhost-user driver supports.
  *
  * @param path
@@ -492,6 +536,30 @@ int rte_vhost_vring_call(int vid, uint16_t vring_idx);
  */
 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
 
+/**
+ * Get vdpa engine id for vhost device.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param eid
+ *  engine id
+ * @return
+ *  engine id
+ */
+int rte_vhost_get_vdpa_eid(int vid);
+
+/**
+ * Get vdpa device id for vhost device.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param did
+ *  device id
+ * @return
+ *  device id
+ */
+int rte_vhost_get_vdpa_did(int vid);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index e1d0036..c4f90af 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -54,6 +54,13 @@ struct vhost_user_socket {
uint64_t protocol_features;
uint16_t queue_num;
 
+   /* engine and device id to identify a certain port on a specific
+* backend, both are set to -1 for sw. when used, one socket can
+* have 1 connection only.
+*/
+   int eid;
+   int did;
+
struct vhost_device_ops const *notify_ops;
 };
 
@@ -524,6 +531,64 @@ find_vhost_user_socket(const char *path)
 }
 
 int
+rte_vhost_driver_set_vdpa_eid(const char *path, int eid)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   vsocket->eid = eid;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_set_vdpa_did(const char *path, int did)
+{
+   struct vhost_user_socket *vsocket;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   vsocket->did = did;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_vdpa_eid(const char *path)
+{
+   struct vhost_user_socket *vsocket;
+   int eid = -1;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   eid = vsocket->eid;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return eid;
+}
+
+int
+rte_vhost_driver_get_vdpa_did(const char *path)
+{
+   struct vhost_user_socket *vsocket;
+   int did = -1;
+
+   pthread_mutex_lock(&vhost_user.mutex);
+   vsocket = find_vhost_user_socket(path);
+   if (vsocket)
+   did = vsocket->did;
+   pthread_mutex_unlock(&vhost_user.mutex);
+
+   return did;
+}
+
+int
 rte_vhost_driver_disable_features(const char *path, uint64_t features)
 {
struct vhost_user_socket *vsocket;
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 1dd9adb..2dff199 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -288,6 +288,8 @@ vhost_new_device(void)
vhost_devices[i] = dev;
   

[dpdk-dev] [PATCH 6/7] vhost: get callfd before device setup

2018-02-02 Thread Zhihong Wang
From: Xiao Wang 

This patch makes sure the device is configured with all needed guest
info. According to the QEMU vhost message sequence, the real callfd comes
just before SET_VRING_ENABLE.

Signed-off-by: Xiao Wang 
---
 lib/librte_vhost/vhost_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 05b53fa..3fe1b3d 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -1547,7 +1547,8 @@ vhost_user_msg_handler(int vid, int fd)
send_vhost_reply(fd, &msg);
}
 
-   if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
+   if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)
+   && msg.request.master == VHOST_USER_SET_VRING_ENABLE) {
dev->flags |= VIRTIO_DEV_READY;
 
if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
-- 
2.7.5



[dpdk-dev] [PATCH 7/7] vhost: export new apis

2018-02-02 Thread Zhihong Wang
This patch exports new APIs as experimental.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/rte_vdpa.h| 16 +++-
 lib/librte_vhost/rte_vhost.h   | 30 --
 lib/librte_vhost/rte_vhost_version.map | 18 ++
 3 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/lib/librte_vhost/rte_vdpa.h b/lib/librte_vhost/rte_vdpa.h
index 729849b..f6f6d0a 100644
--- a/lib/librte_vhost/rte_vdpa.h
+++ b/lib/librte_vhost/rte_vdpa.h
@@ -93,15 +93,21 @@ extern struct rte_vdpa_engine *vdpa_engines[];
 extern uint32_t vdpa_engine_num;
 
 /* engine management */
-int rte_vdpa_register_engine(const char *name, struct rte_vdpa_eng_addr *addr);
-int rte_vdpa_unregister_engine(int eid);
+int __rte_experimental
+rte_vdpa_register_engine(const char *name, struct rte_vdpa_eng_addr *addr);
 
-int rte_vdpa_find_engine_id(struct rte_vdpa_eng_addr *addr);
+int __rte_experimental
+rte_vdpa_unregister_engine(int eid);
 
-int rte_vdpa_info_query(int eid, struct rte_vdpa_eng_attr *attr);
+int __rte_experimental
+rte_vdpa_find_engine_id(struct rte_vdpa_eng_addr *addr);
+
+int __rte_experimental
+rte_vdpa_info_query(int eid, struct rte_vdpa_eng_attr *attr);
 
 /* driver register api */
-void rte_vdpa_register_driver(struct rte_vdpa_eng_driver *drv);
+void __rte_experimental
+rte_vdpa_register_driver(struct rte_vdpa_eng_driver *drv);
 
 #define RTE_VDPA_REGISTER_DRIVER(nm, drv) \
 RTE_INIT(vdpainitfn_ ##nm); \
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index 03f4ed1..dc38566 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -187,7 +187,8 @@ int rte_vhost_driver_unregister(const char *path);
  * @return
  *  0 on success, -1 on failure
  */
-int rte_vhost_driver_set_vdpa_eid(const char *path, int eid);
+int __rte_experimental
+rte_vhost_driver_set_vdpa_eid(const char *path, int eid);
 
 /**
  * Set the device id, enforce single connection per socket
@@ -199,7 +200,8 @@ int rte_vhost_driver_set_vdpa_eid(const char *path, int 
eid);
  * @return
  *  0 on success, -1 on failure
  */
-int rte_vhost_driver_set_vdpa_did(const char *path, int did);
+int __rte_experimental
+rte_vhost_driver_set_vdpa_did(const char *path, int did);
 
 /**
  * Get the engine id
@@ -209,7 +211,8 @@ int rte_vhost_driver_set_vdpa_did(const char *path, int 
did);
  * @return
  *  Engine id, -1 on failure
  */
-int rte_vhost_driver_get_vdpa_eid(const char *path);
+int __rte_experimental
+rte_vhost_driver_get_vdpa_eid(const char *path);
 
 /**
  * Get the device id
@@ -219,7 +222,8 @@ int rte_vhost_driver_get_vdpa_eid(const char *path);
  * @return
  *  Device id, -1 on failure
  */
-int rte_vhost_driver_get_vdpa_did(const char *path);
+int __rte_experimental
+rte_vhost_driver_get_vdpa_did(const char *path);
 
 /**
  * Set the feature bits the vhost-user driver supports.
@@ -243,7 +247,8 @@ int rte_vhost_driver_set_features(const char *path, 
uint64_t features);
  * @return
  *  0 on success, -1 on failure
  */
-int rte_vhost_driver_get_protocol_features(const char *path,
+int __rte_experimental
+rte_vhost_driver_get_protocol_features(const char *path,
uint64_t *protocol_features);
 
 /**
@@ -256,7 +261,8 @@ int rte_vhost_driver_get_protocol_features(const char *path,
  * @return
  *  0 on success, -1 on failure
  */
-int rte_vhost_driver_set_protocol_features(const char *path,
+int __rte_experimental
+rte_vhost_driver_set_protocol_features(const char *path,
uint64_t protocol_features);
 
 /**
@@ -269,7 +275,8 @@ int rte_vhost_driver_set_protocol_features(const char *path,
  * @return
  *  0 on success, -1 on failure
  */
-int rte_vhost_driver_get_queue_num(const char *path, uint16_t *queue_num);
+int __rte_experimental
+rte_vhost_driver_get_queue_num(const char *path, uint16_t *queue_num);
 
 /**
  * Set the queue number the vhost-user driver supports.
@@ -281,7 +288,8 @@ int rte_vhost_driver_get_queue_num(const char *path, 
uint16_t *queue_num);
  * @return
  *  0 on success, -1 on failure
  */
-int rte_vhost_driver_set_queue_num(const char *path, uint16_t queue_num);
+int __rte_experimental
+rte_vhost_driver_set_queue_num(const char *path, uint16_t queue_num);
 
 /**
  * Enable vhost-user driver features.
@@ -546,7 +554,8 @@ uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
  * @return
  *  engine id
  */
-int rte_vhost_get_vdpa_eid(int vid);
+int __rte_experimental
+rte_vhost_get_vdpa_eid(int vid);
 
 /**
  * Get vdpa device id for vhost device.
@@ -558,7 +567,8 @@ int rte_vhost_get_vdpa_eid(int vid);
  * @return
  *  device id
  */
-int rte_vhost_get_vdpa_did(int vid);
+int __rte_experimental
+rte_vhost_get_vdpa_did(int vid);
 
 #ifdef __cplusplus
 }
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index df01031..de585df 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -59,3 +59,21 @@ DPDK_18.02

[dpdk-dev] [PATCH 5/7] vhost: adapt vhost lib for selective datapath

2018-02-02 Thread Zhihong Wang
This patch adapts the vhost lib for selective datapath by calling the device
ops at the corresponding stages.
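
For illustration only (not part of the patch), the repeated guard in the diff
below could be read as the following helper. It assumes the vdpa_engines[]
array and the per-device eid field introduced earlier in this series.

#include <stddef.h>
#include <rte_vdpa.h>

static inline struct rte_vdpa_dev_ops *
vdpa_dev_ops_get(int eid)
{
        if (eid < 0 || vdpa_engines[eid] == NULL ||
            vdpa_engines[eid]->eng_drv == NULL)
                return NULL;
        return &vdpa_engines[eid]->eng_drv->dev_ops;
}

/*
 * Usage, e.g. in vhost_user_set_features():
 *
 *     struct rte_vdpa_dev_ops *ops = vdpa_dev_ops_get(dev->eid);
 *
 *     if (ops && ops->feature_set)
 *             ops->feature_set(dev->vid);
 */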

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/socket.c |  3 +++
 lib/librte_vhost/vhost.c  |  5 +
 lib/librte_vhost/vhost_user.c | 48 +++
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index c4f90af..8296e4b 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -205,6 +205,9 @@ vhost_user_add_connection(int fd, struct vhost_user_socket 
*vsocket)
size = strnlen(vsocket->path, PATH_MAX);
vhost_set_ifname(vid, vsocket->path, size);
 
+   vhost_set_vdpa_eid(vid, vsocket->eid);
+   vhost_set_vdpa_did(vid, vsocket->did);
+
if (vsocket->dequeue_zero_copy)
vhost_enable_dequeue_zero_copy(vid);
 
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 2dff199..1a3ddd5 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -302,11 +302,16 @@ void
 vhost_destroy_device(int vid)
 {
struct virtio_net *dev = get_device(vid);
+   int eid = dev->eid;
 
if (dev == NULL)
return;
 
if (dev->flags & VIRTIO_DEV_RUNNING) {
+   if (eid >= 0 && vdpa_engines[eid] &&
+   vdpa_engines[eid]->eng_drv &&
+   vdpa_engines[eid]->eng_drv->dev_ops.dev_close)
+   vdpa_engines[eid]->eng_drv->dev_ops.dev_close(dev->vid);
dev->flags &= ~VIRTIO_DEV_RUNNING;
dev->notify_ops->destroy_device(vid);
}
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index b1762e6..05b53fa 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -116,7 +116,13 @@ vhost_user_set_owner(void)
 static int
 vhost_user_reset_owner(struct virtio_net *dev)
 {
+   int eid = dev->eid;
+
if (dev->flags & VIRTIO_DEV_RUNNING) {
+   if (eid >= 0 && vdpa_engines[eid] &&
+   vdpa_engines[eid]->eng_drv &&
+   vdpa_engines[eid]->eng_drv->dev_ops.dev_close)
+   vdpa_engines[eid]->eng_drv->dev_ops.dev_close(dev->vid);
dev->flags &= ~VIRTIO_DEV_RUNNING;
dev->notify_ops->destroy_device(dev->vid);
}
@@ -157,6 +163,7 @@ static int
 vhost_user_set_features(struct virtio_net *dev, uint64_t features)
 {
uint64_t vhost_features = 0;
+   int eid = dev->eid;
 
rte_vhost_driver_get_features(dev->ifname, &vhost_features);
if (features & ~vhost_features) {
@@ -186,6 +193,11 @@ vhost_user_set_features(struct virtio_net *dev, uint64_t 
features)
dev->notify_ops->features_changed(dev->vid, features);
}
 
+   if (eid >= 0 && vdpa_engines[eid] &&
+   vdpa_engines[eid]->eng_drv &&
+   vdpa_engines[eid]->eng_drv->dev_ops.feature_set)
+   vdpa_engines[eid]->eng_drv->dev_ops.feature_set(dev->vid);
+
dev->features = features;
if (dev->features &
((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) 
{
@@ -883,9 +895,14 @@ vhost_user_get_vring_base(struct virtio_net *dev,
  VhostUserMsg *msg)
 {
struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+   int eid = dev->eid;
 
/* We have to stop the queue (virtio) if it is running. */
if (dev->flags & VIRTIO_DEV_RUNNING) {
+   if (eid >= 0 && vdpa_engines[eid] &&
+   vdpa_engines[eid]->eng_drv &&
+   vdpa_engines[eid]->eng_drv->dev_ops.dev_close)
+   vdpa_engines[eid]->eng_drv->dev_ops.dev_close(dev->vid);
dev->flags &= ~VIRTIO_DEV_RUNNING;
dev->notify_ops->destroy_device(dev->vid);
}
@@ -928,16 +945,24 @@ vhost_user_set_vring_enable(struct virtio_net *dev,
VhostUserMsg *msg)
 {
int enable = (int)msg->payload.state.num;
+   int index = (int)msg->payload.state.index;
+   int eid = dev->eid;
 
RTE_LOG(INFO, VHOST_CONFIG,
"set queue enable: %d to qp idx: %d\n",
-   enable, msg->payload.state.index);
+   enable, index);
+
+   if (eid >= 0 && vdpa_engines[eid] &&
+   vdpa_engines[eid]->eng_drv &&
+   vdpa_engines[eid]->eng_drv->

[dpdk-dev] [PATCH v2 1/6] vhost: export vhost feature definitions

2018-03-05 Thread Zhihong Wang
This patch exports vhost-user protocol features to support device driver
development.

Signed-off-by: Zhihong Wang 
---
 lib/librte_vhost/rte_vhost.h  |  8 
 lib/librte_vhost/vhost.h  |  4 +---
 lib/librte_vhost/vhost_user.c |  9 +
 lib/librte_vhost/vhost_user.h | 20 +++-
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d33206997..b05162366 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -29,6 +29,14 @@ extern "C" {
 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY   (1ULL << 2)
 #define RTE_VHOST_USER_IOMMU_SUPPORT   (1ULL << 3)
 
+#define RTE_VHOST_USER_PROTOCOL_F_MQ   0
+#define RTE_VHOST_USER_PROTOCOL_F_LOG_SHMFD1
+#define RTE_VHOST_USER_PROTOCOL_F_RARP 2
+#define RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK3
+#define RTE_VHOST_USER_PROTOCOL_F_NET_MTU  4
+#define RTE_VHOST_USER_PROTOCOL_F_SLAVE_REQ5
+#define RTE_VHOST_USER_F_PROTOCOL_FEATURES 30
+
 /**
  * Information relating to memory regions including offsets to
  * addresses in QEMUs memory file.
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 58aec2e0d..a0b0520e2 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -174,8 +174,6 @@ struct vhost_msg {
  #define VIRTIO_F_VERSION_1 32
 #endif
 
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
 /* Features supported by this builtin vhost-user net driver. */
 #define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
(1ULL << VIRTIO_F_ANY_LAYOUT) | \
@@ -185,7 +183,7 @@ struct vhost_msg {
(1ULL << VIRTIO_NET_F_MQ)  | \
(1ULL << VIRTIO_F_VERSION_1)   | \
(1ULL << VHOST_F_LOG_ALL)  | \
-   (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+   (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES) | \
(1ULL << VIRTIO_NET_F_GSO) | \
(1ULL << VIRTIO_NET_F_HOST_TSO4) | \
(1ULL << VIRTIO_NET_F_HOST_TSO6) | \
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 5c5361066..c93e48e4d 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -527,7 +527,7 @@ vhost_user_set_vring_addr(struct virtio_net **pdev, 
VhostUserMsg *msg)
vring_invalidate(dev, vq);
 
if (vq->enabled && (dev->features &
-   (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
+   (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES))) {
dev = translate_ring_addresses(dev, msg->payload.addr.index);
if (!dev)
return -1;
@@ -897,11 +897,11 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, 
struct VhostUserMsg *pmsg)
vq = dev->virtqueue[file.index];
 
/*
-* When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
+* When RTE_VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
 * the ring starts already enabled. Otherwise, it is enabled via
 * the SET_VRING_ENABLE message.
 */
-   if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)))
+   if (!(dev->features & (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES)))
vq->enabled = 1;
 
if (vq->kickfd >= 0)
@@ -1012,7 +1012,8 @@ vhost_user_get_protocol_features(struct virtio_net *dev,
 * Qemu versions (from v2.7.0 to v2.9.0).
 */
if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
-   protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK);
+   protocol_features &=
+   ~(1ULL << RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK);
 
msg->payload.u64 = protocol_features;
msg->size = sizeof(msg->payload.u64);
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 0fafbe6e0..066e772dd 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -14,19 +14,13 @@
 
 #define VHOST_MEMORY_MAX_NREGIONS 8
 
-#define VHOST_USER_PROTOCOL_F_MQ   0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD1
-#define VHOST_USER_PROTOCOL_F_RARP 2
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK3
-#define VHOST_USER_PROTOCOL_F_NET_MTU 4
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
-
-#define VHOST_USER_PROTOCOL_FEATURES   ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
-(1ULL << 
VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
-(1ULL << VHOST_USER_PROTOCOL_F_RARP) | 
\
- 
