Signed-off-by: David Miller <dmiller...@gmail.com>
Reviewed-by: Mathew S Thoennes <tar...@us.ibm.com>
---
 app/test-acl/main.c                          |   4 +
 app/test-pmd/config.c                        |  12 +-
 app/test/test_acl.c                          |   1 +
 app/test/test_atomic.c                       |   7 +-
 app/test/test_cmdline.c                      |   6 +-
 app/test/test_cmdline_ipaddr.c               |  11 +
 app/test/test_cmdline_num.c                  | 110 ++++
 app/test/test_hash_functions.c               |  29 +
 app/test/test_xmmt_ops.h                     |  14 +
 buildtools/pmdinfogen.py                     |  11 +-
 config/meson.build                           |   2 +
 config/s390x/meson.build                     |  51 ++
 config/s390x/s390x_linux_clang_ubuntu        |  19 +
 doc/guides/nics/features/i40e.ini            |   1 +
 drivers/common/mlx5/mlx5_common.h            |   9 +
 drivers/net/i40e/i40e_rxtx_vec_s390x.c       | 630 +++++++++++++++++++
 drivers/net/i40e/meson.build                 |   2 +
 drivers/net/ixgbe/ixgbe_rxtx.c               |   2 +-
 drivers/net/memif/rte_eth_memif.h            |   2 +
 drivers/net/mlx5/mlx5_rx.c                   |  22 +-
 drivers/net/octeontx/base/octeontx_pki_var.h |   6 +
 examples/l3fwd-acl/main.c                    |   4 +
 examples/l3fwd/l3fwd_em.c                    |   8 +
 examples/l3fwd/l3fwd_lpm_s390x.h             | 137 ++++
 examples/l3fwd/l3fwd_s390x.h                 | 259 ++++++++
 lib/acl/acl_bld.c                            |   3 +
 lib/acl/acl_gen.c                            |   9 +
 lib/acl/acl_run_scalar.c                     |   8 +
 lib/acl/rte_acl.c                            |  27 +
 lib/acl/rte_acl.h                            |   5 +-
 lib/eal/s390x/include/meson.build            |  16 +
 lib/eal/s390x/include/rte_atomic.h           |  47 ++
 lib/eal/s390x/include/rte_byteorder.h        |  43 ++
 lib/eal/s390x/include/rte_cpuflags.h         |  42 ++
 lib/eal/s390x/include/rte_cycles.h           |  44 ++
 lib/eal/s390x/include/rte_io.h               | 184 ++++++
 lib/eal/s390x/include/rte_mcslock.h          |  18 +
 lib/eal/s390x/include/rte_memcpy.h           |  55 ++
 lib/eal/s390x/include/rte_pause.h            |  22 +
 lib/eal/s390x/include/rte_power_intrinsics.h |  20 +
 lib/eal/s390x/include/rte_prefetch.h         |  46 ++
 lib/eal/s390x/include/rte_rwlock.h           |  42 ++
 lib/eal/s390x/include/rte_spinlock.h         |  85 +++
 lib/eal/s390x/include/rte_ticketlock.h       |  18 +
 lib/eal/s390x/include/rte_vect.h             |  35 ++
 lib/eal/s390x/meson.build                    |  16 +
 lib/eal/s390x/rte_cpuflags.c                 |  91 +++
 lib/eal/s390x/rte_cycles.c                   |  11 +
 lib/eal/s390x/rte_hypervisor.c               |  11 +
 lib/eal/s390x/rte_power_intrinsics.c         |  51 ++
 lib/hash/rte_fbk_hash.h                      |   7 +
 lib/lpm/meson.build                          |   1 +
 lib/lpm/rte_lpm.h                            |   2 +
 lib/lpm/rte_lpm6.c                           |  18 +
 lib/lpm/rte_lpm_s390x.h                      | 130 ++++
 meson.build                                  |   2 +
 56 files changed, 2450 insertions(+), 18 deletions(-)
 create mode 100644 config/s390x/meson.build
 create mode 100644 config/s390x/s390x_linux_clang_ubuntu
 create mode 100644 drivers/net/i40e/i40e_rxtx_vec_s390x.c
 create mode 100644 examples/l3fwd/l3fwd_lpm_s390x.h
 create mode 100644 examples/l3fwd/l3fwd_s390x.h
 create mode 100644 lib/eal/s390x/include/meson.build
 create mode 100644 lib/eal/s390x/include/rte_atomic.h
 create mode 100644 lib/eal/s390x/include/rte_byteorder.h
 create mode 100644 lib/eal/s390x/include/rte_cpuflags.h
 create mode 100644 lib/eal/s390x/include/rte_cycles.h
 create mode 100644 lib/eal/s390x/include/rte_io.h
 create mode 100644 lib/eal/s390x/include/rte_mcslock.h
 create mode 100644 lib/eal/s390x/include/rte_memcpy.h
 create mode 100644 lib/eal/s390x/include/rte_pause.h
 create mode 100644 lib/eal/s390x/include/rte_power_intrinsics.h
 create mode 100644 lib/eal/s390x/include/rte_prefetch.h
 create mode 100644 lib/eal/s390x/include/rte_rwlock.h
 create mode 100644 lib/eal/s390x/include/rte_spinlock.h
 create mode 100644 lib/eal/s390x/include/rte_ticketlock.h
 create mode 100644 lib/eal/s390x/include/rte_vect.h
 create mode 100644 lib/eal/s390x/meson.build
 create mode 100644 lib/eal/s390x/rte_cpuflags.c
 create mode 100644 lib/eal/s390x/rte_cycles.c
 create mode 100644 lib/eal/s390x/rte_hypervisor.c
 create mode 100644 lib/eal/s390x/rte_power_intrinsics.c
 create mode 100644 lib/lpm/rte_lpm_s390x.h

diff --git a/app/test-acl/main.c b/app/test-acl/main.c
index 06e3847ab9..1f567c5359 100644
--- a/app/test-acl/main.c
+++ b/app/test-acl/main.c
@@ -83,6 +83,10 @@ static const struct acl_alg acl_alg[] = {
                .name = "altivec",
                .alg = RTE_ACL_CLASSIFY_ALTIVEC,
        },
+       {
+               .name = "s390x",
+               .alg = RTE_ACL_CLASSIFY_S390X,
+       },
        {
                .name = "avx512x16",
                .alg = RTE_ACL_CLASSIFY_AVX512X16,
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index cc8e7aa138..2a863f3d39 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -245,9 +245,9 @@ nic_stats_display(portid_t port_id)
        static uint64_t prev_bytes_tx[RTE_MAX_ETHPORTS];
        static uint64_t prev_ns[RTE_MAX_ETHPORTS];
        struct timespec cur_time;
-       uint64_t diff_pkts_rx, diff_pkts_tx, diff_bytes_rx, diff_bytes_tx,
-                                                               diff_ns;
-       uint64_t mpps_rx, mpps_tx, mbps_rx, mbps_tx;
+	__uint128_t diff_pkts_rx, diff_pkts_tx, diff_bytes_rx, diff_bytes_tx,
+								diff_ns;
+	__uint128_t mpps_rx, mpps_tx, mbps_rx, mbps_tx;
        struct rte_eth_stats stats;
 
        static const char *nic_stats_border = "########################";
@@ -302,9 +302,9 @@ nic_stats_display(portid_t port_id)
                (double)diff_bytes_tx / diff_ns * NS_PER_SEC : 0;
 
        printf("\n  Throughput (since last show)\n");
-       printf("  Rx-pps: %12"PRIu64"          Rx-bps: %12"PRIu64"\n  Tx-pps: 
%12"
-              PRIu64"          Tx-bps: %12"PRIu64"\n", mpps_rx, mbps_rx * 8,
-              mpps_tx, mbps_tx * 8);
+    printf("  Rx-pps: %12llu          Rx-bps: %12llu \n  Tx-pps: %12llu        
  Tx-bps: %12llu \n",
+           (unsigned long long) mpps_rx, (unsigned long long) mbps_rx * 8,
+           (unsigned long long) mpps_tx, (unsigned long long) mbps_tx * 8);
 
        if (xstats_display_num > 0)
                nic_xstats_display_periodic(port_id);
diff --git a/app/test/test_acl.c b/app/test/test_acl.c
index 4d51098925..da16365294 100644
--- a/app/test/test_acl.c
+++ b/app/test/test_acl.c
@@ -351,6 +351,7 @@ test_classify_run(struct rte_acl_ctx *acx, struct 
ipv4_7tuple test_data[],
                RTE_ACL_CLASSIFY_AVX2,
                RTE_ACL_CLASSIFY_NEON,
                RTE_ACL_CLASSIFY_ALTIVEC,
+        RTE_ACL_CLASSIFY_S390X,
                RTE_ACL_CLASSIFY_AVX512X16,
                RTE_ACL_CLASSIFY_AVX512X32,
        };
diff --git a/app/test/test_atomic.c b/app/test/test_atomic.c
index e4b997827e..37ece78425 100644
--- a/app/test/test_atomic.c
+++ b/app/test/test_atomic.c
@@ -17,6 +17,7 @@
 #include <rte_lcore.h>
 #include <rte_random.h>
 #include <rte_hash_crc.h>
+#include <rte_byteorder.h>
 
 #include "test.h"
 
@@ -351,6 +352,7 @@ volatile uint16_t token16;
 volatile uint32_t token32;
 volatile uint64_t token64;
 
+#ifndef RTE_ARCH_S390X
 static void
 build_crc8_table(void)
 {
@@ -441,6 +443,8 @@ test_atomic_exchange(__rte_unused void *arg)
 
        return 0;
 }
+#endif
+
 static int
 test_atomic(void)
 {
@@ -597,6 +601,7 @@ test_atomic(void)
        }
 #endif
 
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
        /*
         * Test 16/32/64bit atomic exchange.
         */
@@ -628,7 +633,7 @@ test_atomic(void)
                printf("Atomic exchange test failed\n");
                return -1;
        }
-
+#endif
        return 0;
 }
 REGISTER_TEST_COMMAND(atomic_autotest, test_atomic);
diff --git a/app/test/test_cmdline.c b/app/test/test_cmdline.c
index 115bee966d..e0720ff345 100644
--- a/app/test/test_cmdline.c
+++ b/app/test/test_cmdline.c
@@ -10,21 +10,21 @@
 static int
 test_cmdline(void)
 {
-       printf("Testind parsing ethernet addresses...\n");
+       printf("Testing parsing ethernet addresses...\n");
        if (test_parse_etheraddr_valid() < 0)
                return -1;
        if (test_parse_etheraddr_invalid_data() < 0)
                return -1;
        if (test_parse_etheraddr_invalid_param() < 0)
                return -1;
-       printf("Testind parsing port lists...\n");
+       printf("Testing parsing port lists...\n");
        if (test_parse_portlist_valid() < 0)
                return -1;
        if (test_parse_portlist_invalid_data() < 0)
                return -1;
        if (test_parse_portlist_invalid_param() < 0)
                return -1;
-       printf("Testind parsing numbers...\n");
+       printf("Testing parsing numbers...\n");
        if (test_parse_num_valid() < 0)
                return -1;
        if (test_parse_num_invalid_data() < 0)
diff --git a/app/test/test_cmdline_ipaddr.c b/app/test/test_cmdline_ipaddr.c
index f540063508..d950383e10 100644
--- a/app/test/test_cmdline_ipaddr.c
+++ b/app/test/test_cmdline_ipaddr.c
@@ -6,12 +6,14 @@
 #include <inttypes.h>
 
 #include <rte_string_fns.h>
+#include <rte_byteorder.h>
 
 #include <cmdline_parse.h>
 #include <cmdline_parse_ipaddr.h>
 
 #include "test_cmdline.h"
 
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 #define IP4(a,b,c,d) {.s_addr = (uint32_t)(((a) & 0xff) | \
                                           (((b) & 0xff) << 8) | \
                                           (((c) & 0xff) << 16)  | \
@@ -19,6 +21,15 @@
 
 #define U16_SWAP(x) \
                (((x & 0xFF) << 8) | ((x & 0xFF00) >> 8))
+#else
+#define IP4(a,b,c,d) {(((uint32_t)((a) & 0xff) << 24) | \
+                                          (((b) & 0xff) << 16) | \
+                                          (((c) & 0xff) << 8)  | \
+                                          ((d) & 0xff))}
+
+#define U16_SWAP(x) (x)
+
+#endif
 
 /* create IPv6 address, swapping bytes where needed */
 #ifndef s6_addr16
diff --git a/app/test/test_cmdline_num.c b/app/test/test_cmdline_num.c
index 9276de59bd..a710109707 100644
--- a/app/test/test_cmdline_num.c
+++ b/app/test/test_cmdline_num.c
@@ -10,6 +10,7 @@
 
 #include <cmdline_parse.h>
 #include <cmdline_parse_num.h>
+#include <rte_byteorder.h>
 
 #include "test_cmdline.h"
 
@@ -438,6 +439,48 @@ test_parse_num_valid(void)
                        /* check if result matches what it should have matched
                         * since unsigned numbers don't care about number of 
bits, we can just convert
                         * everything to uint64_t without any worries. */
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+            switch (type) {
+                case RTE_UINT8:
+                {
+                    uint8_t *temp = (uint8_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_UINT16:
+                {
+                    uint16_t *temp = (uint16_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_UINT32:
+                {
+                    uint32_t *temp = (uint32_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_INT8:
+                {
+                    int8_t *temp = (int8_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_INT16:
+                {
+                    int16_t *temp = (int16_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_INT32:
+                {
+                    int32_t *temp = (int32_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                default:
+                    break;
+            }
+#endif
                        if (ret > 0 && num_valid_positive_strs[i].result != 
result) {
                                printf("Error: parsing %s as %s failed: result 
mismatch!\n",
                                                num_valid_positive_strs[i].str, 
buf);
@@ -467,6 +510,7 @@ test_parse_num_valid(void)
                         * the result is signed in this case, so we have to 
account for that */
                        if (ret > 0) {
                                /* detect negative */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
                                switch (type) {
                                case RTE_INT8:
                                        result = (int8_t) result;
@@ -480,6 +524,30 @@ test_parse_num_valid(void)
                                default:
                                        break;
                                }
+#else
+                switch (type) {
+                               case RTE_INT8:
+                               {
+                                       int8_t *temp = (int8_t *)&result;
+                                       result = *temp;
+                                       break;
+                               }
+                               case RTE_INT16:
+                               {
+                                       int16_t *temp = (int16_t *)&result;
+                                       result = *temp;
+                                       break;
+                               }
+                               case RTE_INT32:
+                               {
+                                       int32_t *temp = (int32_t *)&result;
+                                       result = *temp;
+                                       break;
+                               }
+                               default:
+                                       break;
+                               }
+#endif
                                if (num_valid_negative_strs[i].result == 
(int64_t) result)
                                        continue;
                                printf("Error: parsing %s as %s failed: result 
mismatch!\n",
@@ -516,6 +584,48 @@ test_parse_num_valid(void)
                        /* check if result matches what it should have matched
                         * since unsigned numbers don't care about number of 
bits, we can just convert
                         * everything to uint64_t without any worries. */
+#if RTE_BYTE_ORDER == RTE_BIG_ENDIAN
+            switch (type) {
+                case RTE_UINT8:
+                {
+                    uint8_t *temp = (uint8_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_UINT16:
+                {
+                    uint16_t *temp = (uint16_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_UINT32:
+                {
+                    uint32_t *temp = (uint32_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_INT8:
+                {
+                    int8_t *temp = (int8_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_INT16:
+                {
+                    int16_t *temp = (int16_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                case RTE_INT32:
+                {
+                    int32_t *temp = (int32_t *)&result;
+                    result = *temp;
+                    break;
+                }
+                default:
+                    break;
+            }
+#endif
                        if (ret > 0 && num_garbage_positive_strs[i].result != 
result) {
                                printf("Error: parsing %s as %s failed: result 
mismatch!\n",
                                                
num_garbage_positive_strs[i].str, buf);
diff --git a/app/test/test_hash_functions.c b/app/test/test_hash_functions.c
index 76d51b6e71..b387d0eabb 100644
--- a/app/test/test_hash_functions.c
+++ b/app/test/test_hash_functions.c
@@ -25,6 +25,7 @@
  * e.g.: key size = 4, key = 0x03020100
  *       key size = 8, key = 0x0706050403020100
  */
+#if !defined(RTE_ARCH_S390X)
 static uint32_t hash_values_jhash[2][12] = {{
        0x8ba9414b, 0xdf0d39c9,
        0xe4cf1d42, 0xd4ccb93c, 0x5e84eafc, 0x21362cfe,
@@ -51,6 +52,34 @@ static uint32_t hash_values_crc[2][12] = {{
        0x789c104f, 0x53028d3e
 }
 };
+#else
+static uint32_t hash_values_jhash[2][12] = {{
+       0x8ba9414b, 0x8a2f8eb,
+       0x55dcd60b, 0xf0b95bfe, 0x1a28d94c, 0x003d8f00,
+       0x84c90b2c, 0x24b83acf, 0x5e16af2f, 0x751c9f59,
+       0x665b8254, 0x6e347c81
+},
+{
+       0x5c62c303, 0xb21d4b7b,
+       0xa33cdfcf, 0x47cf3d14, 0x1cae829f, 0x1253a9ea,
+       0x7171efd1, 0xcef21db0, 0x3df3f5fe, 0x35fd67d2,
+       0x2922cbc4, 0xeaee5c5c
+}
+};
+static uint32_t hash_values_crc[2][12] = {{
+       0x00000000, 0x13a29877,
+       0x3eef4343, 0xb6719589, 0x938d3d79, 0xed93196b,
+       0xe710a46c, 0x81f7ab71, 0x702bc9ee, 0x26c72488,
+       0x2e7092a9, 0xf2fbc80b
+},
+{
+       0xbdfd3980, 0x91e95e36,
+       0x37765e57, 0x6559eb17, 0x49c8a164, 0x18daa0d3,
+       0x67065980, 0x62f966d0, 0x4e28a2a0, 0xe342d18f,
+       0x1518c680, 0xebe8026b
+}
+};
+#endif
 
 
/*******************************************************************************
  * Hash function performance test configuration section. Each performance test
diff --git a/app/test/test_xmmt_ops.h b/app/test/test_xmmt_ops.h
index 3a82d5ecac..a11f759af4 100644
--- a/app/test/test_xmmt_ops.h
+++ b/app/test/test_xmmt_ops.h
@@ -49,6 +49,20 @@ vect_set_epi32(int i3, int i2, int i1, int i0)
        return data;
 }
 
+#elif defined(RTE_ARCH_S390X)
+
+/* loads the xmm_t value from address p(does not need to be 16-byte aligned)*/
+#define vect_loadu_sil128(p) vec_xld2(0, (signed int *)(p))
+
+/* sets the 4 signed 32-bit integer values and returns the xmm_t variable */
+static __rte_always_inline xmm_t
+vect_set_epi32(int i3, int i2, int i1, int i0)
+{
+       xmm_t data = (xmm_t){i0, i1, i2, i3};
+
+       return data;
+}
+
 #endif
 
 #endif /* _TEST_XMMT_OPS_H_ */
diff --git a/buildtools/pmdinfogen.py b/buildtools/pmdinfogen.py
index 2a44f17bda..10467c1a3e 100755
--- a/buildtools/pmdinfogen.py
+++ b/buildtools/pmdinfogen.py
@@ -16,8 +16,15 @@
 except ImportError:
     pass
 
-import coff
+try:
+    import coff
+except ImportError:
+    pass
 
+def decode_asciiz(data):
+    index = data.find(b'\x00')
+    end = index if index >= 0 else len(data)
+    return data[:end].decode()
 
 class ELFSymbol:
     def __init__(self, image, symbol):
@@ -28,7 +35,7 @@ def __init__(self, image, symbol):
     def string_value(self):
         size = self._symbol["st_size"]
         value = self.get_value(0, size)
-        return coff.decode_asciiz(value)  # not COFF-specific
+        return decode_asciiz(value)  # not COFF-specific
 
     def get_value(self, offset, size):
         section = self._symbol["st_shndx"]
diff --git a/config/meson.build b/config/meson.build
index 7134e80e8d..407aa1483d 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -121,6 +121,8 @@ if cpu_instruction_set == 'generic'
         cpu_instruction_set = 'generic'
     elif host_machine.cpu_family().startswith('ppc')
         cpu_instruction_set = 'power8'
+    elif host_machine.cpu_family().startswith('s390x')
+        cpu_instruction_set = 'z13'
     endif
 endif
 
diff --git a/config/s390x/meson.build b/config/s390x/meson.build
new file mode 100644
index 0000000000..b15e74ba44
--- /dev/null
+++ b/config/s390x/meson.build
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# (c) Copyright IBM Corp. 2019, 2020
+
+if not dpdk_conf.get('RTE_ARCH_64')
+       error('Only 64-bit compiles are supported for this platform type')
+endif
+dpdk_conf.set('RTE_ARCH', 's390x')
+dpdk_conf.set('RTE_ARCH_S390X', 1)
+dpdk_conf.set('RTE_FORCE_INTRINSICS', 1)
+
+# overrides specific to s390x
+dpdk_conf.set('RTE_MAX_LCORE', 256)
+dpdk_conf.set('RTE_MAX_NUMA_NODES', 32)
+dpdk_conf.set('RTE_CACHE_LINE_SIZE', 128)
+
+
+
+# default to z13
+cpu_instruction_set = 'z13'
+
+# test compiler support
+cc_march_z14 = cc.has_argument('-march=z14')
+cc_march_z15 = cc.has_argument('-march=z15')
+
+
+machine_args = ['-march=' + cpu_instruction_set, '-mtune=' + 
cpu_instruction_set]
+
+dpdk_conf.set('RTE_MACHINE','s390x')
+dpdk_conf.set('RTE_MACHINE_CPUFLAG_ZARCH', 1)   # TODO(review): confirm whether this flag should instead track the selected CPU (z13)
+#dpdk_conf.set('RTE_MACHINE', cpu_instruction_set)
+
+if (cc.get_define('__s390x__', args: machine_args) != '')
+    compile_time_cpuflags += ['RTE_MACHINE_CPUFLAG_ZARCH']
+endif
+
+
+# Suppress the gcc warning "note: the layout of aggregates containing
+# vectors with 4-byte alignment has changed in GCC 5".
+if (cc.get_id() == 'gcc' and cc.version().version_compare('>=10.0') and
+        cc.version().version_compare('<12.0') and 
cc.has_argument('-Wno-psabi'))
+    add_project_arguments('-Wno-psabi', language: 'c')
+endif
+
+
+
+
+
+
+
+
+
diff --git a/config/s390x/s390x_linux_clang_ubuntu 
b/config/s390x/s390x_linux_clang_ubuntu
new file mode 100644
index 0000000000..952d1ce460
--- /dev/null
+++ b/config/s390x/s390x_linux_clang_ubuntu
@@ -0,0 +1,19 @@
+[binaries]
+c = 'clang'
+cpp = 'clang++'
+ar = 'llvm-ar'
+strip = 'llvm-strip'
+llvm-config = 'llvm-config'
+pcap-config = 'llvm-config'
+pkgconfig = 'pkg-config'
+
+[host_machine]
+system = 'linux'
+cpu_family = 's390x'
+cpu = 'z13'
+endian = 'big'
+
+[properties]
+platform = 'generic'
+c_args = ['-target', 's390x-linux-gnu', '--sysroot', '/usr/s390x-linux-gnu']
+c_link_args = ['-target', 's390x-linux-gnu', '-fuse-ld=lld', '--gcc-toolchain=/usr']
diff --git a/doc/guides/nics/features/i40e.ini 
b/doc/guides/nics/features/i40e.ini
index dd18fec217..bc0c8b1969 100644
--- a/doc/guides/nics/features/i40e.ini
+++ b/doc/guides/nics/features/i40e.ini
@@ -50,6 +50,7 @@ x86-32               = Y
 x86-64               = Y
 ARMv8                = Y
 Power8               = Y
+s390x                = Y
 
 [rte_flow items]
 ah                   = Y
diff --git a/drivers/common/mlx5/mlx5_common.h 
b/drivers/common/mlx5/mlx5_common.h
index 63f31437da..61fd6afa02 100644
--- a/drivers/common/mlx5/mlx5_common.h
+++ b/drivers/common/mlx5/mlx5_common.h
@@ -20,6 +20,11 @@
 #include <rte_spinlock.h>
 #include <rte_os_shim.h>
 
+/* s390x pci implementation. */
+#ifdef RTE_MACHINE_CPUFLAG_ZARCH
+#include <rte_io.h>
+#endif
+
 #include "mlx5_prm.h"
 #include "mlx5_devx_cmds.h"
 #include "mlx5_common_os.h"
@@ -358,7 +363,11 @@ mlx5_doorbell_ring(struct mlx5_uar_data *uar, uint64_t 
val, uint32_t index,
        /* Ensure ordering between DB record actual update and UAR access. */
        rte_wmb();
 #ifdef RTE_ARCH_64
+# ifndef RTE_MACHINE_CPUFLAG_ZARCH
        *uar->db = val;
+# else
+    rte_write64_relaxed(val, uar->db);
+# endif
 #else /* !RTE_ARCH_64 */
        rte_spinlock_lock(uar->sl_p);
        *(volatile uint32_t *)uar->db = val;
diff --git a/drivers/net/i40e/i40e_rxtx_vec_s390x.c 
b/drivers/net/i40e/i40e_rxtx_vec_s390x.c
new file mode 100644
index 0000000000..1cee842ad8
--- /dev/null
+++ b/drivers/net/i40e/i40e_rxtx_vec_s390x.c
@@ -0,0 +1,630 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ * (c) Copyright IBM Corp. 2017, 2019
+ */
+
+#include <stdint.h>
+#include <vecintrin.h>
+#include <rte_ethdev_driver.h>
+#include <rte_malloc.h>
+
+#include "base/i40e_prototype.h"
+#include "base/i40e_type.h"
+#include "i40e_ethdev.h"
+#include "i40e_rxtx.h"
+#include "i40e_rxtx_vec_common.h"
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+typedef unsigned long long vector_unsigned_long_long
+       __attribute__((vector_size(2 * sizeof(unsigned long long))));
+typedef unsigned int vector_unsigned_int
+       __attribute__((vector_size(4 * sizeof(unsigned int))));
+typedef unsigned short vector_unsigned_short
+       __attribute__((vector_size(8 * sizeof(unsigned short))));
+typedef unsigned char vector_unsigned_char
+       __attribute__((vector_size(16 * sizeof(unsigned char))));
+
+
+static inline void
+i40e_rxq_rearm(struct i40e_rx_queue *rxq)
+{
+       int i;
+       uint16_t rx_id;
+       volatile union i40e_rx_desc *rxdp;
+
+       struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+       struct rte_mbuf *mb0, *mb1;
+
+       vector_unsigned_long_long hdr_room = (vector_unsigned_long_long){
+                                               RTE_PKTMBUF_HEADROOM,
+                                               RTE_PKTMBUF_HEADROOM};
+       vector_unsigned_long_long dma_addr0, dma_addr1;
+
+       rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+       /* Pull 'n' more MBUFs into the software ring */
+       if (rte_mempool_get_bulk(rxq->mp,
+                                (void *)rxep,
+                                RTE_I40E_RXQ_REARM_THRESH) < 0) {
+               if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+                   rxq->nb_rx_desc) {
+                       dma_addr0 = (vector_unsigned_long_long){};
+                       for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+                               rxep[i].mbuf = &rxq->fake_mbuf;
+                               vec_xstd2(dma_addr0, 0,
+                                       (unsigned long long *)&rxdp[i].read);
+                       }
+               }
+               rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+                       RTE_I40E_RXQ_REARM_THRESH;
+               return;
+       }
+
+       /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+       for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+               vector_unsigned_long_long vaddr0, vaddr1;
+               uintptr_t p0, p1;
+
+               mb0 = rxep[0].mbuf;
+               mb1 = rxep[1].mbuf;
+
+                /* Flush mbuf with pkt template.
+                 * Data to be rearmed is 6 bytes long.
+                 * Though, RX will overwrite ol_flags that are coming next
+                 * anyway. So overwrite whole 8 bytes with one load:
+                 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
+                 */
+               p0 = (uintptr_t)&mb0->rearm_data;
+               *(uint64_t *)p0 = rxq->mbuf_initializer;
+               p1 = (uintptr_t)&mb1->rearm_data;
+               *(uint64_t *)p1 = rxq->mbuf_initializer;
+
+               /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+               vaddr0 = vec_xld2(0, (unsigned long long *)&mb0->buf_addr);
+               vaddr1 = vec_xld2(0, (unsigned long long *)&mb1->buf_addr);
+
+               /* convert pa to dma_addr hdr/data */
+               dma_addr0 = vec_mergel(vaddr0, vaddr0);
+               dma_addr1 = vec_mergel(vaddr1, vaddr1);
+
+               /* add headroom to pa values */
+               dma_addr0 = dma_addr0 + hdr_room;
+               dma_addr1 = dma_addr1 + hdr_room;
+
+               /* flush desc with pa dma_addr */
+               vec_xstd2(dma_addr0, 0, (unsigned long long *)&rxdp++->read);
+               vec_xstd2(dma_addr1, 0, (unsigned long long *)&rxdp++->read);
+       }
+
+       rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
+       if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+               rxq->rxrearm_start = 0;
+
+       rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
+
+       rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+                            (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+       /* Update the tail pointer on the NIC */
+       I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+}
+
+static inline void
+desc_to_olflags_v(vector_unsigned_long_long descs[4], struct rte_mbuf 
**rx_pkts)
+{
+       vector_unsigned_int vlan0, vlan1, rss, l3_l4e;
+
+       /* mask everything except RSS, flow director and VLAN flags
+        * bit2 is for VLAN tag, bit11 for flow director indication
+        * bit13:12 for RSS indication.
+        */
+       const vector_unsigned_int rss_vlan_msk = (vector_unsigned_int){
+                       (int32_t)0x1c03804, (int32_t)0x1c03804,
+                       (int32_t)0x1c03804, (int32_t)0x1c03804};
+
+       /* map rss and vlan type to rss hash and vlan flag */
+       const vector_unsigned_char vlan_flags = (vector_unsigned_char){
+                       0, 0, 0, 0,
+                       PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0};
+
+       const vector_unsigned_char rss_flags = (vector_unsigned_char){
+                       0, PKT_RX_FDIR, 0, 0,
+                       0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0};
+
+       const vector_unsigned_char l3_l4e_flags = (vector_unsigned_char){
+                       0,
+                       PKT_RX_IP_CKSUM_BAD,
+                       PKT_RX_L4_CKSUM_BAD,
+                       PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD,
+                       PKT_RX_IP_CKSUM_BAD,
+                       PKT_RX_IP_CKSUM_BAD,
+                       PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
+                       PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
+                       0, 0, 0, 0, 0, 0, 0, 0};
+
+       vlan0 = (vector_unsigned_int)vec_mergel(descs[0], descs[1]);
+       vlan1 = (vector_unsigned_int)vec_mergel(descs[2], descs[3]);
+       vlan0 = (vector_unsigned_int)vec_mergeh(vlan0, vlan1);
+
+       vlan1 = vec_and(vlan0, rss_vlan_msk);
+       vlan0 = (vector_unsigned_int)vec_perm(vlan_flags,
+                                       (vector_unsigned_char){},
+                                       *(vector_unsigned_char *)&vlan1);
+
+       rss[0] = (uint32_t)vlan1[0] >> 11;
+       rss[1] = (uint32_t)vlan1[1] >> 11;
+       rss[2] = (uint32_t)vlan1[2] >> 11;
+       rss[3] = (uint32_t)vlan1[3] >> 11;
+       rss = (vector_unsigned_int)vec_perm(rss_flags, (vector_unsigned_char){},
+                                       *(vector_unsigned_char *)&rss);
+
+       l3_l4e[0] = (uint32_t)vlan1[0] >> 22;
+       l3_l4e[1] = (uint32_t)vlan1[1] >> 22;
+       l3_l4e[2] = (uint32_t)vlan1[2] >> 22;
+       l3_l4e[3] = (uint32_t)vlan1[3] >> 22;
+
+       l3_l4e = (vector_unsigned_int)vec_perm(l3_l4e_flags,
+                                       (vector_unsigned_char){},
+                                       *(vector_unsigned_char *)&l3_l4e);
+
+       vlan0 = vec_or(vlan0, rss);
+       vlan0 = vec_or(vlan0, l3_l4e);
+
+       rx_pkts[0]->ol_flags = (uint64_t)vlan0[2];
+       rx_pkts[1]->ol_flags = (uint64_t)vlan0[3];
+       rx_pkts[2]->ol_flags = (uint64_t)vlan0[0];
+       rx_pkts[3]->ol_flags = (uint64_t)vlan0[1];
+}
+
+#define PKTLEN_SHIFT     10
+
+static inline void
+desc_to_ptype_v(vector_unsigned_long_long descs[4], struct rte_mbuf **rx_pkts,
+               uint32_t *ptype_tbl)
+{
+       vector_unsigned_long_long ptype0 = vec_mergel(descs[0], descs[1]);
+       vector_unsigned_long_long ptype1 = vec_mergel(descs[2], descs[3]);
+
+       ptype0[0] = ptype0[0] >> 30;
+       ptype0[1] = ptype0[1] >> 30;
+
+       ptype1[0] = ptype1[0] >> 30;
+       ptype1[1] = ptype1[1] >> 30;
+
+       rx_pkts[0]->packet_type =
+               ptype_tbl[(*(vector_unsigned_char *)&ptype0)[0]];
+       rx_pkts[1]->packet_type =
+               ptype_tbl[(*(vector_unsigned_char *)&ptype0)[8]];
+       rx_pkts[2]->packet_type =
+               ptype_tbl[(*(vector_unsigned_char *)&ptype1)[0]];
+       rx_pkts[3]->packet_type =
+               ptype_tbl[(*(vector_unsigned_char *)&ptype1)[8]];
+}
+
+ /* Notice:
+  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
+  *   numbers of DD bits
+  */
+static inline uint16_t
+_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+                  uint16_t nb_pkts, uint8_t *split_packet)
+{
+       volatile union i40e_rx_desc *rxdp;
+       struct i40e_rx_entry *sw_ring;
+       uint16_t nb_pkts_recd;
+       int pos;
+       uint64_t var;
+       vector_unsigned_char shuf_msk;
+       uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+
+       vector_unsigned_short crc_adjust = (vector_unsigned_short){
+               0, 0,         /* ignore pkt_type field */
+               rxq->crc_len, /* sub crc on pkt_len */
+               0,            /* ignore high-16bits of pkt_len */
+               rxq->crc_len, /* sub crc on data_len */
+               0, 0, 0       /* ignore non-length fields */
+               };
+       vector_unsigned_long_long dd_check, eop_check;
+
+       /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
+       nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
+
+       /* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
+       nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
+
+       /* Just the act of getting into the function from the application is
+        * going to cost about 7 cycles
+        */
+       rxdp = rxq->rx_ring + rxq->rx_tail;
+
+       rte_prefetch0(rxdp);
+
+       /* See if we need to rearm the RX queue - gives the prefetch a bit
+        * of time to act
+        */
+       if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
+               i40e_rxq_rearm(rxq);
+
+       /* Before we start moving massive data around, check to see if
+        * there is actually a packet available
+        */
+       if (!(rxdp->wb.qword1.status_error_len &
+                       rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
+               return 0;
+
+       /* 4 packets DD mask */
+       dd_check = (vector_unsigned_long_long){0x0000000100000001ULL,
+                                         0x0000000100000001ULL};
+
+       /* 4 packets EOP mask */
+       eop_check = (vector_unsigned_long_long){0x0000000200000002ULL,
+                                          0x0000000200000002ULL};
+
+       /* mask to shuffle from desc. to mbuf */
+       shuf_msk = (vector_unsigned_char){
+               0xFF, 0xFF,   /* pkt_type set as unknown */
+               0xFF, 0xFF,   /* pkt_type set as unknown */
+               14, 15,       /* octet 15~14, low 16 bits pkt_len */
+               0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
+               14, 15,       /* octet 15~14, 16 bits data_len */
+               2, 3,         /* octet 2~3, low 16 bits vlan_macip */
+               4, 5, 6, 7    /* octet 4~7, 32bits rss */
+               };
+
+       /* Cache is empty -> need to scan the buffer rings, but first move
+        * the next 'n' mbufs into the cache
+        */
+       sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+       /* A. load 4 packet in one loop
+        * [A*. mask out 4 unused dirty field in desc]
+        * B. copy 4 mbuf point from swring to rx_pkts
+        * C. calc the number of DD bits among the 4 packets
+        * [C*. extract the end-of-packet bit, if requested]
+        * D. fill info. from desc to mbuf
+        */
+
+       for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+                       pos += RTE_I40E_DESCS_PER_LOOP,
+                       rxdp += RTE_I40E_DESCS_PER_LOOP) {
+               vector_unsigned_long_long descs[RTE_I40E_DESCS_PER_LOOP];
+               vector_unsigned_char pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+               vector_unsigned_short staterr, sterr_tmp1, sterr_tmp2;
+               vector_unsigned_long_long mbp1, mbp2;  /* two mbuf pointer
+                                                       * in one XMM reg.
+                                                       */
+
+               /* B.1 load 1 mbuf point */
+               mbp1 = *(vector_unsigned_long_long *)&sw_ring[pos];
+               /* Read desc statuses backwards to avoid race condition */
+               /* A.1 load 4 pkts desc */
+               descs[3] = *(vector_unsigned_long_long *)(rxdp + 3);
+               rte_compiler_barrier();
+
+               /* B.2 copy 2 mbuf point into rx_pkts  */
+               *(vector_unsigned_long_long *)&rx_pkts[pos] = mbp1;
+
+               /* B.1 load 1 mbuf point */
+               mbp2 = *(vector_unsigned_long_long *)&sw_ring[pos + 2];
+
+               descs[2] = *(vector_unsigned_long_long *)(rxdp + 2);
+               rte_compiler_barrier();
+               /* B.1 load 2 mbuf point */
+               descs[1] = *(vector_unsigned_long_long *)(rxdp + 1);
+               rte_compiler_barrier();
+               descs[0] = *(vector_unsigned_long_long *)(rxdp);
+
+               /* B.2 copy 2 mbuf point into rx_pkts  */
+               *(vector_unsigned_long_long *)&rx_pkts[pos + 2] =  mbp2;
+
+               if (split_packet) {
+                       rte_mbuf_prefetch_part2(rx_pkts[pos]);
+                       rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+                       rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+                       rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+               }
+
+               /* avoid compiler reorder optimization */
+               rte_compiler_barrier();
+
+               /* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
+               vector_unsigned_int len3_temp = vec_xld2(0,
+                               (unsigned int *)&descs[3]);
+               len3_temp[3] = len3_temp[3] << PKTLEN_SHIFT;
+               const vector_unsigned_int len3 = len3_temp;
+
+               vector_unsigned_int len2_temp = vec_xld2(0,
+                               (unsigned int *)&descs[2]);
+               len2_temp[3] = len2_temp[3] << PKTLEN_SHIFT;
+               const vector_unsigned_int len2 = len2_temp;
+
+               /* merge the now-aligned packet length fields back in */
+               descs[3] = (vector_unsigned_long_long)len3;
+               descs[2] = (vector_unsigned_long_long)len2;
+
+               /* D.1 pkt 3,4 convert format from desc to pktmbuf */
+               pkt_mb4 = vec_perm((vector_unsigned_char)descs[3],
+                                 (vector_unsigned_char){}, shuf_msk);
+               pkt_mb3 = vec_perm((vector_unsigned_char)descs[2],
+                                 (vector_unsigned_char){}, shuf_msk);
+
+               /* C.1 4=>2 filter staterr info only */
+               sterr_tmp2 = vec_mergel((vector_unsigned_short)descs[3],
+                                       (vector_unsigned_short)descs[2]);
+               /* C.1 4=>2 filter staterr info only */
+               sterr_tmp1 = vec_mergel((vector_unsigned_short)descs[1],
+                                       (vector_unsigned_short)descs[0]);
+               /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
+               pkt_mb4 = (vector_unsigned_char)((vector_unsigned_short)pkt_mb4
+                               - crc_adjust);
+               pkt_mb3 = (vector_unsigned_char)((vector_unsigned_short)pkt_mb3
+                               - crc_adjust);
+
+               /* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
+               const vector_unsigned_int len1 =
+                       vec_sll(vec_xld2(0, (unsigned int *)&descs[1]),
+                       (vector_unsigned_int){0, 0, 0, PKTLEN_SHIFT});
+               const vector_unsigned_int len0 =
+                       vec_sll(vec_xld2(0, (unsigned int *)&descs[0]),
+                       (vector_unsigned_int){0, 0, 0, PKTLEN_SHIFT});
+
+               /* merge the now-aligned packet length fields back in */
+               descs[1] = (vector_unsigned_long_long)len1;
+               descs[0] = (vector_unsigned_long_long)len0;
+
+               /* D.1 pkt 1,2 convert format from desc to pktmbuf */
+               pkt_mb2 = vec_perm((vector_unsigned_char)descs[1],
+                                  (vector_unsigned_char){}, shuf_msk);
+               pkt_mb1 = vec_perm((vector_unsigned_char)descs[0],
+                                  (vector_unsigned_char){}, shuf_msk);
+
+               /* C.2 get 4 pkts staterr value  */
+               staterr = (vector_unsigned_short)vec_mergeh(sterr_tmp1,
+                               sterr_tmp2);
+
+               /* D.3 copy final 3,4 data to rx_pkts */
+               vec_xstd2(pkt_mb4, 0, (unsigned char *)&rx_pkts[pos + 3]
+                       ->rx_descriptor_fields1);
+               vec_xstd2(pkt_mb3, 0, (unsigned char *)&rx_pkts[pos + 2]
+                       ->rx_descriptor_fields1);
+
+               /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+               pkt_mb2 = (vector_unsigned_char)((vector_unsigned_short)pkt_mb2
+                               - crc_adjust);
+               pkt_mb1 = (vector_unsigned_char)((vector_unsigned_short)pkt_mb1
+                               - crc_adjust);
+
+               /* C* extract and record EOP bit */
+               if (split_packet) {
+                       vector_unsigned_char eop_shuf_mask =
+                               (vector_unsigned_char){
+                                       0xFF, 0xFF, 0xFF, 0xFF,
+                                       0xFF, 0xFF, 0xFF, 0xFF,
+                                       0xFF, 0xFF, 0xFF, 0xFF,
+                                       0x04, 0x0C, 0x00, 0x08
+                               };
+
+                       /* and with mask to extract bits, flipping 1-0 */
+                       vector_unsigned_char eop_bits =
+                               vec_and((vector_unsigned_char)vec_nor(staterr,
+                               staterr), (vector_unsigned_char)eop_check);
+                       /* the staterr values are not in order, as the count
+                        * count of dd bits doesn't care. However, for end of
+                        * packet tracking, we do care, so shuffle. This also
+                        * compresses the 32-bit values to 8-bit
+                        */
+                       eop_bits = vec_perm(eop_bits, (vector_unsigned_char){},
+                                           eop_shuf_mask);
+                       /* store the resulting 32-bit value */
+                       *split_packet = (vec_xld2(0,
+                                        (unsigned int *)&eop_bits))[0];
+                       split_packet += RTE_I40E_DESCS_PER_LOOP;
+
+                       /* zero-out next pointers */
+                       rx_pkts[pos]->next = NULL;
+                       rx_pkts[pos + 1]->next = NULL;
+                       rx_pkts[pos + 2]->next = NULL;
+                       rx_pkts[pos + 3]->next = NULL;
+               }
+
+               /* C.3 calc available number of desc */
+               staterr = vec_and(staterr, (vector_unsigned_short)dd_check);
+
+               /* D.3 copy final 1,2 data to rx_pkts */
+               vec_xstd2(pkt_mb2, 0, (unsigned char *)&rx_pkts[pos + 1]
+                       ->rx_descriptor_fields1);
+               vec_xstd2(pkt_mb1, 0, (unsigned char *)&rx_pkts[pos]
+                       ->rx_descriptor_fields1);
+
+               desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
+               desc_to_olflags_v(descs, &rx_pkts[pos]);
+
+               /* C.4 calc available number of desc */
+               var = __builtin_popcountll((vec_xld2(0,
+                       (unsigned long long *)&staterr)[0]));
+               nb_pkts_recd += var;
+               if (likely(var != RTE_I40E_DESCS_PER_LOOP))
+                       break;
+       }
+
+       /* Update our internal tail pointer */
+       rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+       rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+       rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+       return nb_pkts_recd;
+}
+
+ /* Notice:
+  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
+  *   numbers of DD bits
+  */
+uint16_t
+i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+                  uint16_t nb_pkts)
+{
+       return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+ /* vPMD receive routine that reassembles scattered packets
+  * Notice:
+  * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
+  * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
+  *   numbers of DD bits
+  */
+uint16_t
+i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+                            uint16_t nb_pkts)
+{
+       struct i40e_rx_queue *rxq = rx_queue;
+       uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
+
+       /* get some new buffers */
+       uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
+                       split_flags);
+       if (nb_bufs == 0)
+               return 0;
+
+       /* happy day case, full burst + no packets to be joined */
+       const uint64_t *split_fl64 = (uint64_t *)split_flags;
+
+       if (rxq->pkt_first_seg == NULL &&
+           split_fl64[0] == 0 && split_fl64[1] == 0 &&
+           split_fl64[2] == 0 && split_fl64[3] == 0)
+               return nb_bufs;
+
+       /* reassemble any packets that need reassembly*/
+       unsigned int i = 0;
+
+       if (!rxq->pkt_first_seg) {
+               /* find the first split flag, and only reassemble then*/
+               while (i < nb_bufs && !split_flags[i])
+                       i++;
+               if (i == nb_bufs)
+                       return nb_bufs;
+       }
+       return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+               &split_flags[i]);
+}
+
+static inline void
+vtx1(volatile struct i40e_tx_desc *txdp,
+       struct rte_mbuf *pkt, uint64_t flags)
+{
+       uint64_t high_qw = (I40E_TX_DESC_DTYPE_DATA |
+               ((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT) |
+               ((uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT));
+
+       vector_unsigned_long_long descriptor = (vector_unsigned_long_long){
+               pkt->buf_iova + pkt->data_off, high_qw};
+       *(vector_unsigned_long_long *)txdp = descriptor;
+}
+
+static inline void
+vtx(volatile struct i40e_tx_desc *txdp,
+       struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+{
+       int i;
+
+       for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
+               vtx1(txdp, *pkt, flags);
+}
+
+uint16_t
+i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+                         uint16_t nb_pkts)
+{
+       struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
+       volatile struct i40e_tx_desc *txdp;
+       struct i40e_tx_entry *txep;
+       uint16_t n, nb_commit, tx_id;
+       uint64_t flags = I40E_TD_CMD;
+       uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
+       int i;
+
+       /* cross rx_thresh boundary is not allowed */
+       nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+       if (txq->nb_tx_free < txq->tx_free_thresh)
+               i40e_tx_free_bufs(txq);
+
+       nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+       nb_commit = nb_pkts;
+       if (unlikely(nb_pkts == 0))
+               return 0;
+
+       tx_id = txq->tx_tail;
+       txdp = &txq->tx_ring[tx_id];
+       txep = &txq->sw_ring[tx_id];
+
+       txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+       n = (uint16_t)(txq->nb_tx_desc - tx_id);
+       if (nb_commit >= n) {
+               tx_backlog_entry(txep, tx_pkts, n);
+
+               for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
+                       vtx1(txdp, *tx_pkts, flags);
+
+               vtx1(txdp, *tx_pkts++, rs);
+
+               nb_commit = (uint16_t)(nb_commit - n);
+
+               tx_id = 0;
+               txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+               /* avoid reach the end of ring */
+               txdp = &txq->tx_ring[tx_id];
+               txep = &txq->sw_ring[tx_id];
+       }
+
+       tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+       vtx(txdp, tx_pkts, nb_commit, flags);
+
+       tx_id = (uint16_t)(tx_id + nb_commit);
+       if (tx_id > txq->tx_next_rs) {
+               txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+                       rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
+                                               I40E_TXD_QW1_CMD_SHIFT);
+               txq->tx_next_rs =
+                       (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+       }
+
+       txq->tx_tail = tx_id;
+
+       I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+       return nb_pkts;
+}
+
+void __attribute__((cold))
+i40e_rx_queue_release_mbufs_vec(struct i40e_rx_queue *rxq)
+{
+       _i40e_rx_queue_release_mbufs_vec(rxq);
+}
+
+int __attribute__((cold))
+i40e_rxq_vec_setup(struct i40e_rx_queue *rxq)
+{
+       return i40e_rxq_vec_setup_default(rxq);
+}
+
+int __attribute__((cold))
+i40e_txq_vec_setup(struct i40e_tx_queue __rte_unused * txq)
+{
+       return 0;
+}
+
+int __attribute__((cold))
+i40e_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
+{
+       return i40e_rx_vec_dev_conf_condition_check_default(dev);
+}
diff --git a/drivers/net/i40e/meson.build b/drivers/net/i40e/meson.build
index efc5f93e35..88fac6fc2c 100644
--- a/drivers/net/i40e/meson.build
+++ b/drivers/net/i40e/meson.build
@@ -73,6 +73,8 @@ if arch_subdir == 'x86'
     endif
 elif arch_subdir == 'ppc'
        sources += files('i40e_rxtx_vec_altivec.c')
+elif arch_subdir == 's390x'
+       sources += files('i40e_rxtx_vec_s390x.c')
 elif arch_subdir == 'arm'
        sources += files('i40e_rxtx_vec_neon.c')
 endif
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 9e8ea366a5..98d8eb93eb 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -5958,7 +5958,7 @@ ixgbe_config_rss_filter(struct rte_eth_dev *dev,
 }
 
 /* Stubs needed for linkage when RTE_ARCH_PPC_64 is set */
-#if defined(RTE_ARCH_PPC_64)
+#if defined(RTE_ARCH_PPC_64) || defined(RTE_ARCH_S390X)
 int
 ixgbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev __rte_unused *dev)
 {
diff --git a/drivers/net/memif/rte_eth_memif.h b/drivers/net/memif/rte_eth_memif.h
index a5ee23d42e..0270e7859a 100644
--- a/drivers/net/memif/rte_eth_memif.h
+++ b/drivers/net/memif/rte_eth_memif.h
@@ -178,6 +178,8 @@ const char *memif_version(void);
 #define __NR_memfd_create 279
 #elif defined __powerpc__
 #define __NR_memfd_create 360
+#elif defined __s390x__
+#define __NR_memfd_create 350
 #elif defined __i386__
 #define __NR_memfd_create 356
 #else
diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
index e5eea0ad94..7618a68c4c 100644
--- a/drivers/net/mlx5/mlx5_rx.c
+++ b/drivers/net/mlx5/mlx5_rx.c
@@ -209,6 +209,8 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
                snprintf(mode->info, sizeof(mode->info), "%s", "Vector Neon");
 #elif defined RTE_ARCH_PPC_64
                snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
+#elif defined RTE_ARCH_S390X
+		snprintf(mode->info, sizeof(mode->info), "%s", "Vector S390X");
 #else
                return -EINVAL;
 #endif
@@ -219,6 +221,8 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
                snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector Neon");
 #elif defined RTE_ARCH_PPC_64
                snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector AltiVec");
+#elif defined RTE_ARCH_S390X
+		snprintf(mode->info, sizeof(mode->info), "%s", "MPRQ Vector S390X");
 #else
                return -EINVAL;
 #endif
@@ -313,12 +317,24 @@ rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
        uint8_t ptype;
        uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
 
+	/*
+	 * hdr_type_etc comes from the CQE in little-endian byte order.
+	 * The logic below was written assuming the raw value is already
+	 * in CPU byte order, which only holds on little-endian hosts.
+	 * Convert LE -> CPU byte order first so the bit tests below are
+	 * also correct on big-endian hosts such as s390x.
+	 */
+	uint16_t cqe_t_le = rte_le_to_cpu_16(cqe->hdr_type_etc);
+	uint16_t mcqe_t_le;
+
        /* Get l3/l4 header from mini-CQE in case L3/L4 format*/
        if (mcqe == NULL ||
            rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
-               ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
-       else
-               ptype = mcqe->hdr_type >> 2;
+               ptype = (cqe_t_le & 0xfc00) >> 10;
+	else {
+		mcqe_t_le = rte_le_to_cpu_16(mcqe->hdr_type);
+		ptype = mcqe_t_le >> 2;
+	}
        /*
         * The index to the array should have:
         * bit[1:0] = l3_hdr_type
diff --git a/drivers/net/octeontx/base/octeontx_pki_var.h b/drivers/net/octeontx/base/octeontx_pki_var.h
index 4445369ce7..b37d79eb83 100644
--- a/drivers/net/octeontx/base/octeontx_pki_var.h
+++ b/drivers/net/octeontx/base/octeontx_pki_var.h
@@ -157,6 +157,12 @@ typedef union octtx_wqe_s {
                        uint64_t        lbptr : 8;
                        uint64_t        laptr : 8;
                } w4;
+
+               struct {
+                       uint64_t        size  :16;
+                       uint64_t        dwd   : 1;
+                       uint64_t        rsvd0 :47;
+               } w5;
 #endif
        } s;
 
diff --git a/examples/l3fwd-acl/main.c b/examples/l3fwd-acl/main.c
index 2d2ecc7635..e39153f16c 100644
--- a/examples/l3fwd-acl/main.c
+++ b/examples/l3fwd-acl/main.c
@@ -170,6 +170,10 @@ static const struct {
                .name = "altivec",
                .alg = RTE_ACL_CLASSIFY_ALTIVEC,
        },
+	{
+		.name = "s390x",
+		.alg = RTE_ACL_CLASSIFY_S390X,
+	},
        {
                .name = "avx512x16",
                .alg = RTE_ACL_CLASSIFY_AVX512X16,
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 24d0910fe0..dc6c53dc12 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -239,6 +239,14 @@ em_mask_key(void *key, xmm_t mask)
 
        return vec_and(data, mask);
 }
+#elif defined(__s390x__)
+static inline xmm_t
+em_mask_key(void *key, xmm_t mask)
+{
+       xmm_t data = (xmm_t) vec_xld2(0, (unsigned int *)(key));
+
+	return data & mask;
+}
 #else
 #error No vector engine (SSE, NEON, ALTIVEC, S390X) available, check your toolchain
 #endif
diff --git a/examples/l3fwd/l3fwd_lpm_s390x.h b/examples/l3fwd/l3fwd_lpm_s390x.h
new file mode 100644
index 0000000000..858f696ba9
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_s390x.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation.
+ * (c) Copyright IBM Corp. 2017, 2019
+ */
+#ifndef __L3FWD_LPM_S390X_H__
+#define __L3FWD_LPM_S390X_H__
+
+#include "l3fwd_s390x.h"
+
+typedef unsigned char vector_unsigned_char
+       __attribute__((vector_size(16*sizeof(unsigned char))));
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+               vector_unsigned_int *dip,
+               uint32_t *ipv4_flag)
+{
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	uint32_t x0, x1, x2, x3;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct rte_ether_hdr *);
+	ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+	x0 = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	rte_compiler_barrier();
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct rte_ether_hdr *);
+	ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+	x1 = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	rte_compiler_barrier();
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct rte_ether_hdr *);
+	ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+	x2 = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	rte_compiler_barrier();
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct rte_ether_hdr *);
+	ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+       x3 = ipv4_hdr->dst_addr;
+       ipv4_flag[0] &= pkt[3]->packet_type;
+
+       rte_compiler_barrier();
+       dip[0] = (vector_unsigned_int){x0, x1, x2, x3};
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+               vector_unsigned_int dip,
+               uint32_t ipv4_flag,
+               uint8_t portid,
+               struct rte_mbuf *pkt[FWDSTEP],
+               uint16_t dprt[FWDSTEP])
+{
+       rte_xmm_t dst;
+       const vector_unsigned_char bswap_mask = (vector_unsigned_char){
+                                                       3, 2, 1, 0,
+                                                       7, 6, 5, 4,
+                                                       11, 10, 9, 8,
+                                                       15, 14, 13, 12};
+
+       /* Byte swap 4 IPV4 addresses. */
+       dip = (vector_unsigned_int)vec_perm(*(vector_unsigned_char *)&dip,
+                                       (vector_unsigned_char){}, bswap_mask);
+
+       /* if all 4 packets are IPV4. */
+       if (likely(ipv4_flag)) {
+               rte_lpm_lookupx4(qconf->ipv4_lookup_struct, (xmm_t)dip,
+                       (uint32_t *)&dst, portid);
+               /* get rid of unused upper 16 bit for each dport. */
+               dst.x = (xmm_t)vec_packs(dst.x, dst.x);
+               *(uint64_t *)dprt = dst.u64[0];
+       } else {
+               dst.x = (xmm_t)dip;
+               dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+                                                       dst.u32[0], portid);
+               dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+                                                       dst.u32[1], portid);
+               dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+                                                       dst.u32[2], portid);
+               dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+                                                       dst.u32[3], portid);
+       }
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+                       uint8_t portid, struct lcore_conf *qconf)
+{
+       int32_t j;
+       uint16_t dst_port[MAX_PKT_BURST];
+       vector_unsigned_int dip[MAX_PKT_BURST / FWDSTEP];
+       uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
+       const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+       for (j = 0; j != k; j += FWDSTEP)
+               processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
+                               &ipv4_flag[j / FWDSTEP]);
+
+       for (j = 0; j != k; j += FWDSTEP)
+               processx4_step2(qconf, dip[j / FWDSTEP],
+                               ipv4_flag[j / FWDSTEP],
+                               portid, &pkts_burst[j], &dst_port[j]);
+
+       /* Classify last up to 3 packets one by one */
+       switch (nb_rx % FWDSTEP) {
+       case 3:
+               dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+               j++;
+               /* fall-through */
+       case 2:
+               dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+               j++;
+               /* fall-through */
+       case 1:
+               dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+               j++;
+               /* fall-through */
+       }
+
+       send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_S390X_H__ */
diff --git a/examples/l3fwd/l3fwd_s390x.h b/examples/l3fwd/l3fwd_s390x.h
new file mode 100644
index 0000000000..d027092a49
--- /dev/null
+++ b/examples/l3fwd/l3fwd_s390x.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016 Intel Corporation.
+ * (c) Copyright IBM Corp. 2017, 2019
+ */
+#ifndef _L3FWD_S390X_H_
+#define _L3FWD_S390X_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
+typedef unsigned int vector_unsigned_int
+       __attribute__((vector_size(4*sizeof(unsigned int))));
+typedef unsigned short vector_unsigned_short
+       __attribute__((vector_size(8*sizeof(unsigned short))));
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+       vector_unsigned_int te[FWDSTEP];
+       vector_unsigned_int ve[FWDSTEP];
+       vector_unsigned_int *p[FWDSTEP];
+
+       /* Pointers to the start (Ethernet header) of each packet */
+       p[0] = rte_pktmbuf_mtod(pkt[0], vector_unsigned_int *);
+       p[1] = rte_pktmbuf_mtod(pkt[1], vector_unsigned_int *);
+       p[2] = rte_pktmbuf_mtod(pkt[2], vector_unsigned_int *);
+       p[3] = rte_pktmbuf_mtod(pkt[3], vector_unsigned_int *);
+
+       /* ve[i]: precomputed dst/src MAC pair for the output port;
+        * te[i]: current first 16 bytes of the packet.
+        */
+       ve[0] = (vector_unsigned_int)val_eth[dst_port[0]];
+       te[0] = *p[0];
+
+       ve[1] = (vector_unsigned_int)val_eth[dst_port[1]];
+       te[1] = *p[1];
+
+       ve[2] = (vector_unsigned_int)val_eth[dst_port[2]];
+       te[2] = *p[2];
+
+       ve[3] = (vector_unsigned_int)val_eth[dst_port[3]];
+       te[3] = *p[3];
+
+       /* Update first 12 bytes, keep rest bytes intact. */
+       /* vec_sel(ve, te, m): halfwords with mask bits set take te, so the
+        * two trailing 0xffff halfwords preserve bytes 12-15 while bytes
+        * 0-11 (dst + src MAC) are replaced from ve.
+        */
+       te[0] = (vector_unsigned_int)vec_sel(
+                       (vector_unsigned_short)ve[0],
+                       (vector_unsigned_short)te[0],
+                       (vector_unsigned_short) {0, 0, 0, 0,
+                                               0, 0, 0xffff, 0xffff});
+
+       te[1] = (vector_unsigned_int)vec_sel(
+                       (vector_unsigned_short)ve[1],
+                       (vector_unsigned_short)te[1],
+                       (vector_unsigned_short) {0, 0, 0, 0,
+                                               0, 0, 0xffff, 0xffff});
+
+       te[2] = (vector_unsigned_int)vec_sel(
+                       (vector_unsigned_short)ve[2],
+                       (vector_unsigned_short)te[2],
+                       (vector_unsigned_short) {0, 0, 0, 0, 0,
+                                               0, 0xffff, 0xffff});
+
+       te[3] = (vector_unsigned_int)vec_sel(
+                       (vector_unsigned_short)ve[3],
+                       (vector_unsigned_short)te[3],
+                       (vector_unsigned_short) {0, 0, 0, 0,
+                                               0, 0, 0xffff, 0xffff});
+
+       /* Write the rewritten headers back */
+       *p[0] = te[0];
+       *p[1] = te[1];
+       *p[2] = te[2];
+       *p[3] = te[3];
+
+       /* RFC1812 checks/updates on the IPv4 header following each
+        * Ethernet header; may set dst_port[i] to BAD_PORT on failure.
+        */
+       rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+               &dst_port[0], pkt[0]->packet_type);
+       rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+               &dst_port[1], pkt[1]->packet_type);
+       rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+               &dst_port[2], pkt[2]->packet_type);
+       rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+               &dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, vector_unsigned_short dp1,
+       vector_unsigned_short dp2)
+{
+       /* Overlay pn[] so the whole group-count table can be written
+        * with one 64-bit store from the prebuilt gptbl entry.
+        */
+       union {
+               uint16_t u16[FWDSTEP + 1];
+               uint64_t u64;
+       } *pnum = (void *)pn;
+
+       int32_t v;
+
+       /* NOTE(review): the block comment above describes a 4-bit mask
+        * indexing gptbl, but vec_any_eq() yields an any-elements-equal
+        * predicate -- confirm this matches the altivec original.
+        */
+       v = vec_any_eq(dp1, dp2);
+
+
+       /* update last port counter. */
+       lp[0] += gptbl[v].lpv;
+
+       /* if dest port value has changed. */
+       if (v != GRPMSK) {
+               pnum->u64 = gptbl[v].pnum;
+               pnum->u16[FWDSTEP] = 1;
+               lp = pnum->u16 + gptbl[v].idx;
+       }
+
+       return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+       struct ether_hdr *eth_hdr;
+       vector_unsigned_int te, ve;
+
+       eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+       /* te: current first 16 bytes of the packet;
+        * ve: precomputed dst/src MAC pair for the output port.
+        */
+       te = *(vector_unsigned_int *)eth_hdr;
+       ve = (vector_unsigned_int)val_eth[dst_port[0]];
+
+       /* RFC1812 checks/updates on the IPv4 header; may mark the
+        * packet bad by rewriting *dst_port.
+        */
+       rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
+                       pkt->packet_type);
+
+       /* Replace bytes 0-11 (dst + src MAC) from ve; the two trailing
+        * 0xffff mask halfwords keep bytes 12-15 of the original header.
+        */
+       te = (vector_unsigned_int)vec_sel(
+               (vector_unsigned_short)ve,
+               (vector_unsigned_short)te,
+               (vector_unsigned_short){0, 0, 0, 0,
+                                       0, 0, 0xffff, 0xffff});
+
+       *(vector_unsigned_int *)eth_hdr = te;
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+               uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+       int32_t k;
+       int j = 0;
+       uint16_t dlp;
+       uint16_t *lp;
+       /* pnum[i] = number of consecutive packets starting at i that share
+        * one destination port (filled group-wise below).
+        */
+       uint16_t pnum[MAX_PKT_BURST + 1];
+
+       /*
+        * Finish packet processing and group consecutive
+        * packets with the same destination port.
+        */
+       k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+       if (k != 0) {
+               vector_unsigned_short dp1, dp2;
+
+               lp = pnum;
+               lp[0] = 1;
+
+               processx4_step3(pkts_burst, dst_port);
+
+               /* dp1: <d[0], d[1], d[2], d[3], ... > */
+               dp1 = *(vector_unsigned_short *)dst_port;
+
+               for (j = FWDSTEP; j != k; j += FWDSTEP) {
+                       processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+                       /*
+                        * dp2:
+                        * <d[j-3], d[j-2], d[j-1], d[j], ... >
+                        */
+                       dp2 = *((vector_unsigned_short *)
+                                       &dst_port[j - FWDSTEP + 1]);
+                       lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+                       /*
+                        * dp1:
+                        * <d[j], d[j+1], d[j+2], d[j+3], ... >
+                        */
+                       dp1 = vec_sro(dp2, (vector unsigned char) {
+                               0, 0, 0, 0, 0, 0, 0, 0,
+                               0, 0, 0, (FWDSTEP - 1) * sizeof(dst_port[0])});
+               }
+
+               /*
+                * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+                */
+               /* NOTE(review): the {0xf9} permute pattern leaves all other
+                * selector bytes zero -- confirm it really duplicates the
+                * last element as the comment above claims.
+                */
+               dp2 = vec_perm(dp1, (vector_unsigned_short){},
+                               (vector unsigned char) {0xf9});
+               lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+               /*
+                * remove values added by the last repeated
+                * dst port.
+                */
+               lp[0]--;
+               dlp = dst_port[j - 1];
+       } else {
+               /* set dlp and lp to the never used values. */
+               dlp = BAD_PORT - 1;
+               lp = pnum + MAX_PKT_BURST;
+       }
+
+       /* Process up to last 3 packets one by one. */
+       switch (nb_rx % FWDSTEP) {
+       case 3:
+               process_packet(pkts_burst[j], dst_port + j);
+               GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+               j++;
+               /* fall-through */
+       case 2:
+               process_packet(pkts_burst[j], dst_port + j);
+               GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+               j++;
+               /* fall-through */
+       case 1:
+               process_packet(pkts_burst[j], dst_port + j);
+               GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+               j++;
+       }
+
+       /*
+        * Send packets out, through destination port.
+        * Consecutive packets with the same destination port
+        * are already grouped together.
+        * If destination port for the packet equals BAD_PORT,
+        * then free the packet without sending it out.
+        */
+       for (j = 0; j < nb_rx; j += k) {
+
+               int32_t m;
+               uint16_t pn;
+
+               /* k is reused here: size of the group starting at j */
+               pn = dst_port[j];
+               k = pnum[j];
+
+               if (likely(pn != BAD_PORT))
+                       send_packetsx4(qconf, pn, pkts_burst + j, k);
+               else
+                       for (m = j; m != j + k; m++)
+                               rte_pktmbuf_free(pkts_burst[m]);
+
+       }
+}
+
+#endif /* _L3FWD_S390X_H_ */
diff --git a/lib/acl/acl_bld.c b/lib/acl/acl_bld.c
index 7ea30f4186..04f5f0a820 100644
--- a/lib/acl/acl_bld.c
+++ b/lib/acl/acl_bld.c
@@ -777,6 +777,9 @@ acl_build_reset(struct rte_acl_ctx *ctx)
                sizeof(*ctx) - offsetof(struct rte_acl_ctx, num_categories));
 }
 
+
+
+
 static void
 acl_gen_full_range(struct acl_build_context *context, struct rte_acl_node 
*root,
        struct rte_acl_node *end, int size, int level)
diff --git a/lib/acl/acl_gen.c b/lib/acl/acl_gen.c
index e759a2ca15..a3c31b0dc9 100644
--- a/lib/acl/acl_gen.c
+++ b/lib/acl/acl_gen.c
@@ -360,7 +360,16 @@ acl_gen_node(struct rte_acl_node *node, uint64_t 
*node_array,
                array_ptr = &node_array[index->quad_index];
                acl_add_ptrs(node, array_ptr, no_match, 0);
                qtrp = (uint32_t *)node->transitions;
+
+               /* Swap qtrp on big endian so that transitions[0]
+                * is the least significant byte.
+                */
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+               node->node_index = __bswap_32(qtrp[0]);
+#else
                node->node_index = qtrp[0];
+#endif
+
                node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
                node->node_index |= index->quad_index | node->node_type;
                index->quad_index += node->fanout;
diff --git a/lib/acl/acl_run_scalar.c b/lib/acl/acl_run_scalar.c
index 3d61e79409..9f01ef8d8c 100644
--- a/lib/acl/acl_run_scalar.c
+++ b/lib/acl/acl_run_scalar.c
@@ -141,6 +141,14 @@ rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, 
const uint8_t **data,
                input0 = GET_NEXT_4BYTES(parms, 0);
                input1 = GET_NEXT_4BYTES(parms, 1);
 
+               /* input needs to be swapped because the rules get
+                * swapped while building the trie.
+                */
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+               input0 = __bswap_32(input0);
+               input1 = __bswap_32(input1);
+#endif
+
                for (n = 0; n < 4; n++) {
 
                        transition0 = scalar_transition(flows.trans,
diff --git a/lib/acl/rte_acl.c b/lib/acl/rte_acl.c
index a61c3ba188..ae42ea5b54 100644
--- a/lib/acl/rte_acl.c
+++ b/lib/acl/rte_acl.c
@@ -101,6 +101,8 @@ static const rte_acl_classify_t classify_fns[] = {
        [RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
        [RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
        [RTE_ACL_CLASSIFY_ALTIVEC] = rte_acl_classify_altivec,
+       /* use scalar for s390x for now */
+       [RTE_ACL_CLASSIFY_S390X] = rte_acl_classify_scalar,
        [RTE_ACL_CLASSIFY_AVX512X16] = rte_acl_classify_avx512x16,
        [RTE_ACL_CLASSIFY_AVX512X32] = rte_acl_classify_avx512x32,
 };
@@ -145,6 +147,27 @@ acl_check_alg_ppc(enum rte_acl_classify_alg alg)
        return -EINVAL;
 }
 
+
+
+/*
+ * Helper function for acl_check_alg.
+ * Check support for the s390x specific classify method.
+ *
+ * Returns 0 when RTE_ACL_CLASSIFY_S390X is usable on this build,
+ * -ENOTSUP when it is known but unsupported here, and -EINVAL for
+ * any other algorithm value.
+ */
+static int
+acl_check_alg_s390x(enum rte_acl_classify_alg alg)
+{
+	if (alg == RTE_ACL_CLASSIFY_S390X) {
+#if defined(RTE_ARCH_S390X)
+		/* vector builds need at least 128-bit SIMD enabled */
+		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+			return 0;
+#endif
+		return -ENOTSUP;
+	}
+
+	return -EINVAL;
+}
+
+
 #ifdef CC_AVX512_SUPPORT
 static int
 acl_check_avx512_cpu_flags(void)
@@ -216,6 +239,8 @@ acl_check_alg(enum rte_acl_classify_alg alg)
                return acl_check_alg_arm(alg);
        case RTE_ACL_CLASSIFY_ALTIVEC:
                return acl_check_alg_ppc(alg);
+	case RTE_ACL_CLASSIFY_S390X:
+		return acl_check_alg_s390x(alg);
        case RTE_ACL_CLASSIFY_AVX512X32:
        case RTE_ACL_CLASSIFY_AVX512X16:
        case RTE_ACL_CLASSIFY_AVX2:
@@ -244,6 +269,8 @@ acl_get_best_alg(void)
                RTE_ACL_CLASSIFY_NEON,
 #elif defined(RTE_ARCH_PPC_64)
                RTE_ACL_CLASSIFY_ALTIVEC,
+#elif defined(RTE_ARCH_S390X)
+		RTE_ACL_CLASSIFY_S390X,
 #elif defined(RTE_ARCH_X86)
                RTE_ACL_CLASSIFY_AVX512X32,
                RTE_ACL_CLASSIFY_AVX512X16,
diff --git a/lib/acl/rte_acl.h b/lib/acl/rte_acl.h
index f7f5f08701..307a78ceac 100644
--- a/lib/acl/rte_acl.h
+++ b/lib/acl/rte_acl.h
@@ -241,8 +241,9 @@ enum rte_acl_classify_alg {
        RTE_ACL_CLASSIFY_AVX2 = 3,    /**< requires AVX2 support. */
        RTE_ACL_CLASSIFY_NEON = 4,    /**< requires NEON support. */
        RTE_ACL_CLASSIFY_ALTIVEC = 5,    /**< requires ALTIVEC support. */
 	RTE_ACL_CLASSIFY_AVX512X16 = 6,  /**< requires AVX512 support. */
 	RTE_ACL_CLASSIFY_AVX512X32 = 7,  /**< requires AVX512 support. */
+	RTE_ACL_CLASSIFY_S390X = 8,      /**< requires s390x z13 support. */
 };
 
 /**
diff --git a/lib/eal/s390x/include/meson.build 
b/lib/eal/s390x/include/meson.build
new file mode 100644
index 0000000000..b4561d6a82
--- /dev/null
+++ b/lib/eal/s390x/include/meson.build
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# (c) Copyright IBM Corp. 2018, 2019
+
+# Install every arch header this series adds; rte_mcslock.h,
+# rte_power_intrinsics.h and rte_ticketlock.h are created by this
+# patch but were missing from the install list.
+install_headers(
+	'rte_atomic.h',
+	'rte_byteorder.h',
+	'rte_cpuflags.h',
+	'rte_cycles.h',
+	'rte_io.h',
+	'rte_mcslock.h',
+	'rte_memcpy.h',
+	'rte_pause.h',
+	'rte_power_intrinsics.h',
+	'rte_prefetch.h',
+	'rte_rwlock.h',
+	'rte_spinlock.h',
+	'rte_ticketlock.h',
+	'rte_vect.h',
+	subdir: get_option('include_subdir_arch'))
diff --git a/lib/eal/s390x/include/rte_atomic.h 
b/lib/eal/s390x/include/rte_atomic.h
new file mode 100644
index 0000000000..5fce6d5f07
--- /dev/null
+++ b/lib/eal/s390x/include/rte_atomic.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_ATOMIC_S390X_H_
+#define _RTE_ATOMIC_S390X_H_
+
+#ifndef RTE_FORCE_INTRINSICS
+#  error Platform must be built with CONFIG_RTE_FORCE_INTRINSICS
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_atomic.h"
+
+/*
+ * NOTE(review): rte_mb() is only a compiler barrier here. s390x has a
+ * strong hardware memory model, but confirm this is sufficient for all
+ * rte_mb() users (the Linux kernel uses "bcr 14,0" as a full barrier).
+ */
+#define rte_mb() rte_compiler_barrier()
+
+#define rte_wmb() rte_mb()
+
+#define rte_rmb() rte_mb()
+
+#define rte_smp_mb() rte_mb()
+
+#define rte_smp_wmb() rte_wmb()
+
+#define rte_smp_rmb() rte_rmb()
+
+#define rte_io_mb() rte_mb()
+
+#define rte_io_wmb() rte_wmb()
+
+#define rte_io_rmb() rte_rmb()
+
+#define rte_cio_wmb() rte_wmb()
+
+#define rte_cio_rmb() rte_rmb()
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_ATOMIC_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_byteorder.h 
b/lib/eal/s390x/include/rte_byteorder.h
new file mode 100644
index 0000000000..de6e410b4b
--- /dev/null
+++ b/lib/eal/s390x/include/rte_byteorder.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+/* Inspired from FreeBSD src/sys/powerpc/include/endian.h
+ * Copyright (c) 1987, 1991, 1993
+ * The Regents of the University of California.  All rights reserved.
+ */
+
+#ifndef _RTE_BYTEORDER_S390X_H_
+#define _RTE_BYTEORDER_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include "generic/rte_byteorder.h"
+
+/* s390x is big endian
+ */
+
+#define rte_cpu_to_le_16(x) rte_bswap16(x)
+#define rte_cpu_to_le_32(x) rte_bswap32(x)
+#define rte_cpu_to_le_64(x) rte_bswap64(x)
+
+#define rte_cpu_to_be_16(x) (x)
+#define rte_cpu_to_be_32(x) (x)
+#define rte_cpu_to_be_64(x) (x)
+
+#define rte_le_to_cpu_16(x) rte_bswap16(x)
+#define rte_le_to_cpu_32(x) rte_bswap32(x)
+#define rte_le_to_cpu_64(x) rte_bswap64(x)
+
+#define rte_be_to_cpu_16(x) (x)
+#define rte_be_to_cpu_32(x) (x)
+#define rte_be_to_cpu_64(x) (x)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_BYTEORDER_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_cpuflags.h 
b/lib/eal/s390x/include/rte_cpuflags.h
new file mode 100644
index 0000000000..bfeff3f98b
--- /dev/null
+++ b/lib/eal/s390x/include/rte_cpuflags.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_CPUFLAGS_S390X_H_
+#define _RTE_CPUFLAGS_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Enumeration of all CPU features supported
+ */
+enum rte_cpu_flag_t {
+       RTE_CPUFLAG_ESAN3 = 0,
+       RTE_CPUFLAG_ZARCH,
+       RTE_CPUFLAG_STFLE,
+       RTE_CPUFLAG_MSA,
+       RTE_CPUFLAG_LDISP,
+       RTE_CPUFLAG_EIMM,
+       RTE_CPUFLAG_DFP,
+       RTE_CPUFLAG_HPAGE, //from elf.h
+       //RTE_CPUFLAG_EDAT, //from hwcap.h
+       RTE_CPUFLAG_ETF3EH,
+       RTE_CPUFLAG_HIGH_GPRS,
+       RTE_CPUFLAG_TE,
+       RTE_CPUFLAG_VXRS,
+       RTE_CPUFLAG_VXRS_BCD,
+       RTE_CPUFLAG_VXRS_EXT,
+       RTE_CPUFLAG_GS,
+       /* The last item */
+       RTE_CPUFLAG_NUMFLAGS,/**< This should always be the last! */
+};
+
+#include "generic/rte_cpuflags.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_CPUFLAGS_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_cycles.h 
b/lib/eal/s390x/include/rte_cycles.h
new file mode 100644
index 0000000000..7a430e06a8
--- /dev/null
+++ b/lib/eal/s390x/include/rte_cycles.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_CYCLES_S390X_H_
+#define _RTE_CYCLES_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_cycles.h"
+
+#include <rte_common.h>
+
+/**
+ * Read the time base register.
+ *
+ * @return
+ *   The time base for this lcore.
+ */
+static inline uint64_t
+rte_rdtsc(void)
+{
+       uint64_t tsc;
+       /* STCKF stores the TOD (time-of-day) clock value, not a raw
+        * CPU cycle counter.
+        */
+       asm volatile("stckf %0" : "=Q"(tsc) : : "cc");
+       return tsc;
+}
+
+static inline uint64_t
+rte_rdtsc_precise(void)
+{
+       rte_mb();
+       return rte_rdtsc();
+}
+
+static inline uint64_t
+rte_get_tsc_cycles(void) { return rte_rdtsc(); }
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_CYCLES_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_io.h b/lib/eal/s390x/include/rte_io.h
new file mode 100644
index 0000000000..9cb3c1ca7c
--- /dev/null
+++ b/lib/eal/s390x/include/rte_io.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_IO_S390X_H_
+#define _RTE_IO_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#define RTE_OVERRIDE_IO_H
+
+#include "generic/rte_io.h"
+
+#include <unistd.h>
+#include <sys/syscall.h>
+
+union register_pair {
+       __int128_t pair;
+       struct {
+               unsigned long even;
+               unsigned long odd;
+       } even_odd;
+};
+
+/*
+ * s390 requires special instructions to access IO memory.
+ * PCI load: reads @len bytes (1/2/4/8) from the mapped I/O
+ * address @ioaddr and returns the value.
+ *
+ * NOTE(review): the condition code is extracted into 'cc' but never
+ * checked, so a failed load is silently ignored -- confirm intended.
+ */
+static inline uint64_t pcilgi(const volatile void *ioaddr, size_t len)
+{
+	union register_pair ioaddr_len =
+		{.even_odd.even = (uint64_t)ioaddr, .even_odd.odd = len};
+	uint64_t val;
+	int cc = -1;
+
+	asm volatile (
+		"	.insn	rre,0xb9d60000,%[val],%[ioaddr_len]\n"
+		"	ipm	%[cc]\n"
+		"	srl	%[cc],28\n"
+		: [cc] "+d" (cc), [val] "=d" (val),
+		  [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc");
+	return val;
+}
+
+/*
+ * PCI store: writes the low @len bytes (1/2/4/8) of @val to the
+ * mapped I/O address @ioaddr.
+ *
+ * NOTE(review): the condition code is extracted into 'cc' but never
+ * checked, so a failed store is silently ignored -- confirm intended.
+ */
+static inline void pcistgi(volatile void *ioaddr, uint64_t val, size_t len)
+{
+	union register_pair ioaddr_len =
+		{.even_odd.even = (uint64_t)ioaddr, .even_odd.odd = len};
+	int cc = -1;
+
+	asm volatile (
+		"	.insn	rre,0xb9d40000,%[val],%[ioaddr_len]\n"
+		"	ipm	%[cc]\n"
+		"	srl	%[cc],28\n"
+		: [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
+		: [val] "d" (val)
+		: "cc", "memory");
+}
+
+/* fall back to syscall on old machines ? */
+static __rte_always_inline uint8_t
+rte_read8_relaxed(const volatile void *addr)
+{
+       return pcilgi(addr, 1);
+}
+
+static __rte_always_inline uint16_t
+rte_read16_relaxed(const volatile void *addr)
+{
+       return pcilgi(addr, 2);
+}
+
+static __rte_always_inline uint32_t
+rte_read32_relaxed(const volatile void *addr)
+{
+       return pcilgi(addr, 4);
+}
+
+static __rte_always_inline uint64_t
+rte_read64_relaxed(const volatile void *addr)
+{
+       return pcilgi(addr, 8);
+}
+
+static __rte_always_inline void
+rte_write8_relaxed(uint8_t value, volatile void *addr)
+{
+       pcistgi(addr, value, sizeof(value));
+}
+
+static __rte_always_inline void
+rte_write16_relaxed(uint16_t value, volatile void *addr)
+{
+       pcistgi(addr, value, sizeof(value));
+}
+
+static __rte_always_inline void
+rte_write32_relaxed(uint32_t value, volatile void *addr)
+{
+       pcistgi(addr, value, sizeof(value));
+}
+
+static __rte_always_inline void
+rte_write64_relaxed(uint64_t value, volatile void *addr)
+{
+       pcistgi(addr, value, sizeof(value));
+}
+
+static __rte_always_inline uint8_t
+rte_read8(const volatile void *addr)
+{
+       uint8_t val;
+       val = rte_read8_relaxed(addr);
+       rte_io_rmb();
+       return val;
+}
+
+static __rte_always_inline uint16_t
+rte_read16(const volatile void *addr)
+{
+       uint16_t val;
+       val = rte_read16_relaxed(addr);
+       rte_io_rmb();
+       return val;
+}
+
+static __rte_always_inline uint32_t
+rte_read32(const volatile void *addr)
+{
+       uint32_t val;
+       val = rte_read32_relaxed(addr);
+       rte_io_rmb();
+       return val;
+}
+
+static __rte_always_inline uint64_t
+rte_read64(const volatile void *addr)
+{
+       uint64_t val;
+       val = rte_read64_relaxed(addr);
+       rte_io_rmb();
+       return val;
+}
+
+static __rte_always_inline void
+rte_write8(uint8_t value, volatile void *addr)
+{
+       rte_io_wmb();
+       rte_write8_relaxed(value, addr);
+}
+
+static __rte_always_inline void
+rte_write16(uint16_t value, volatile void *addr)
+{
+       rte_io_wmb();
+       rte_write16_relaxed(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32(uint32_t value, volatile void *addr)
+{
+       rte_io_wmb();
+       rte_write32_relaxed(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	/* no write-combining mapping on s390x; use the ordered write */
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write64(uint64_t value, volatile void *addr)
+{
+       rte_io_wmb();
+       rte_write64_relaxed(value, addr);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_IO_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_mcslock.h 
b/lib/eal/s390x/include/rte_mcslock.h
new file mode 100644
index 0000000000..9125237dfd
--- /dev/null
+++ b/lib/eal/s390x/include/rte_mcslock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#ifndef _RTE_MCSLOCK_S390X_H_
+#define _RTE_MCSLOCK_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_mcslock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MCSLOCK_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_memcpy.h 
b/lib/eal/s390x/include/rte_memcpy.h
new file mode 100644
index 0000000000..1135b1af6f
--- /dev/null
+++ b/lib/eal/s390x/include/rte_memcpy.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_MEMCPY_S390X_H_
+#define _RTE_MEMCPY_S390X_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcpy.h"
+
+
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 16);
+}
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 32);
+}
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 48);
+}
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 64);
+}
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 128);
+}
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 256);
+}
+#define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCPY_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_pause.h 
b/lib/eal/s390x/include/rte_pause.h
new file mode 100644
index 0000000000..be90ce6a1f
--- /dev/null
+++ b/lib/eal/s390x/include/rte_pause.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_PAUSE_S390X_H_
+#define _RTE_PAUSE_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_pause.h"
+
+static inline void rte_pause(void)
+{
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PAUSE_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_power_intrinsics.h 
b/lib/eal/s390x/include/rte_power_intrinsics.h
new file mode 100644
index 0000000000..c0e9ac279f
--- /dev/null
+++ b/lib/eal/s390x/include/rte_power_intrinsics.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_INTRINSIC_S390X_H_
+#define _RTE_POWER_INTRINSIC_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+
+#include "generic/rte_power_intrinsics.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_POWER_INTRINSIC_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_prefetch.h 
b/lib/eal/s390x/include/rte_prefetch.h
new file mode 100644
index 0000000000..4a2e73116d
--- /dev/null
+++ b/lib/eal/s390x/include/rte_prefetch.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_PREFETCH_S390X_H_
+#define _RTE_PREFETCH_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include "generic/rte_prefetch.h"
+
+static inline void rte_prefetch0(const volatile void *p)
+{
+       asm volatile ("pfd 1, 0(%[p])" : : [p] "r" (p));
+}
+
+static inline void rte_prefetch1(const volatile void *p)
+{
+       asm volatile ("pfd 1, 0(%[p])" : : [p] "r" (p));
+}
+
+static inline void rte_prefetch2(const volatile void *p)
+{
+       asm volatile ("pfd 1, 0(%[p])" : : [p] "r" (p));
+}
+
+static inline void rte_prefetch_non_temporal(const volatile void *p)
+{
+       /* non-temporal version not available, fallback to rte_prefetch0 */
+       rte_prefetch0(p);
+}
+
+/* Cache-line demote hint: no s390x equivalent, so this is a no-op. */
+__rte_experimental
+static inline void rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PREFETCH_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_rwlock.h 
b/lib/eal/s390x/include/rte_rwlock.h
new file mode 100644
index 0000000000..f649484f35
--- /dev/null
+++ b/lib/eal/s390x/include/rte_rwlock.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_RWLOCK_S390X_H_
+#define _RTE_RWLOCK_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_rwlock.h"
+
+static inline void
+rte_rwlock_read_lock_tm(rte_rwlock_t *rwl)
+{
+       rte_rwlock_read_lock(rwl);
+}
+
+static inline void
+rte_rwlock_read_unlock_tm(rte_rwlock_t *rwl)
+{
+       rte_rwlock_read_unlock(rwl);
+}
+
+static inline void
+rte_rwlock_write_lock_tm(rte_rwlock_t *rwl)
+{
+       rte_rwlock_write_lock(rwl);
+}
+
+static inline void
+rte_rwlock_write_unlock_tm(rte_rwlock_t *rwl)
+{
+       rte_rwlock_write_unlock(rwl);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_RWLOCK_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_spinlock.h 
b/lib/eal/s390x/include/rte_spinlock.h
new file mode 100644
index 0000000000..0434864fbc
--- /dev/null
+++ b/lib/eal/s390x/include/rte_spinlock.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_SPINLOCK_S390X_H_
+#define _RTE_SPINLOCK_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include "generic/rte_spinlock.h"
+
+#ifndef RTE_FORCE_INTRINSICS
+
+static inline void
+rte_spinlock_lock(rte_spinlock_t *sl)
+{
+       while (__sync_lock_test_and_set(&sl->locked, 1))
+               while (sl->locked)
+                       rte_pause();
+}
+
+static inline void
+rte_spinlock_unlock(rte_spinlock_t *sl)
+{
+       __sync_lock_release(&sl->locked);
+}
+
+static inline int
+rte_spinlock_trylock(rte_spinlock_t *sl)
+{
+       return __sync_lock_test_and_set(&sl->locked, 1) == 0;
+}
+
+#endif
+
+
+static inline int rte_tm_supported(void)
+{
+       return 0;
+}
+
+static inline void
+rte_spinlock_lock_tm(rte_spinlock_t *sl)
+{
+       rte_spinlock_lock(sl); /* fall-back */
+}
+
+static inline int
+rte_spinlock_trylock_tm(rte_spinlock_t *sl)
+{
+       return rte_spinlock_trylock(sl);
+}
+
+static inline void
+rte_spinlock_unlock_tm(rte_spinlock_t *sl)
+{
+       rte_spinlock_unlock(sl);
+}
+
+static inline void
+rte_spinlock_recursive_lock_tm(rte_spinlock_recursive_t *slr)
+{
+       rte_spinlock_recursive_lock(slr); /* fall-back */
+}
+
+static inline void
+rte_spinlock_recursive_unlock_tm(rte_spinlock_recursive_t *slr)
+{
+       rte_spinlock_recursive_unlock(slr);
+}
+
+static inline int
+rte_spinlock_recursive_trylock_tm(rte_spinlock_recursive_t *slr)
+{
+       return rte_spinlock_recursive_trylock(slr);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_SPINLOCK_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_ticketlock.h 
b/lib/eal/s390x/include/rte_ticketlock.h
new file mode 100644
index 0000000000..0785363c94
--- /dev/null
+++ b/lib/eal/s390x/include/rte_ticketlock.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2019
+ */
+
+#ifndef _RTE_TICKETLOCK_S390X_H_
+#define _RTE_TICKETLOCK_S390X_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_ticketlock.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_TICKETLOCK_S390X_H_ */
diff --git a/lib/eal/s390x/include/rte_vect.h b/lib/eal/s390x/include/rte_vect.h
new file mode 100644
index 0000000000..8fe3535965
--- /dev/null
+++ b/lib/eal/s390x/include/rte_vect.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#ifndef _RTE_VECT_S390X_H_
+#define _RTE_VECT_S390X_H_
+
+#include <vecintrin.h>
+#include "generic/rte_vect.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_VECT_DEFAULT_SIMD_BITWIDTH RTE_VECT_SIMD_256
+
+typedef int xmm_t __attribute__((vector_size(4*sizeof(int))));
+
+#define        XMM_SIZE        (sizeof(xmm_t))
+#define        XMM_MASK        (XMM_SIZE - 1)
+
+typedef union rte_xmm {
+       xmm_t    x;
+       uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
+       uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
+       uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
+       uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
+       double   pd[XMM_SIZE / sizeof(double)];
+} __attribute__((aligned(16))) rte_xmm_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_VECT_S390X_H_ */
diff --git a/lib/eal/s390x/meson.build b/lib/eal/s390x/meson.build
new file mode 100644
index 0000000000..c8cc8d1f3d
--- /dev/null
+++ b/lib/eal/s390x/meson.build
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# (c) Copyright IBM Corp. 2018, 2019
+
+subdir('include')
+
+# 19.xx zarch patches lib/librte_eal/common/arch/s390x/meson.build:
+# var was: eal_common_arch_sources
+#
+sources += files(
+        'rte_cpuflags.c',
+        'rte_cycles.c',
+        'rte_hypervisor.c',
+        'rte_power_intrinsics.c',
+)
+
+
diff --git a/lib/eal/s390x/rte_cpuflags.c b/lib/eal/s390x/rte_cpuflags.c
new file mode 100644
index 0000000000..d57a51d267
--- /dev/null
+++ b/lib/eal/s390x/rte_cpuflags.c
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#include "rte_cpuflags.h"
+
+#include <elf.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <unistd.h>
+#include <string.h>
+
+/* Symbolic values for the entries in the auxiliary table */
+#define AT_HWCAP  16
+#define AT_HWCAP2 26
+
+/* software based registers */
+enum cpu_register_t {
+       REG_NONE = 0,
+       REG_HWCAP,
+       REG_HWCAP2,
+       REG_MAX
+};
+
+typedef uint32_t hwcap_registers_t[REG_MAX];
+
+struct feature_entry {
+       uint32_t reg;
+       uint32_t bit;
+#define CPU_FLAG_NAME_MAX_LEN 64
+       char name[CPU_FLAG_NAME_MAX_LEN];
+};
+
+#define FEAT_DEF(name, reg, bit) \
+       [RTE_CPUFLAG_##name] = {reg, bit, #name},
+
+const struct feature_entry rte_cpu_feature_table[] = {
+       FEAT_DEF(ESAN3,                  REG_HWCAP,   0)
+       FEAT_DEF(ZARCH,                  REG_HWCAP,   1)
+       FEAT_DEF(STFLE,                  REG_HWCAP,   2)
+       FEAT_DEF(MSA,                    REG_HWCAP,   3)
+       FEAT_DEF(LDISP,                  REG_HWCAP,   4)
+       FEAT_DEF(EIMM,                   REG_HWCAP,   5)
+       FEAT_DEF(DFP,                    REG_HWCAP,   6)
+       FEAT_DEF(HPAGE,                  REG_HWCAP,   7)
+       FEAT_DEF(ETF3EH,                 REG_HWCAP,   8)
+       FEAT_DEF(HIGH_GPRS,              REG_HWCAP,   9)
+       FEAT_DEF(TE,                     REG_HWCAP,  10)
+       FEAT_DEF(VXRS,                   REG_HWCAP,  11)
+       FEAT_DEF(VXRS_BCD,               REG_HWCAP,  12)
+       FEAT_DEF(VXRS_EXT,               REG_HWCAP,  13)
+       FEAT_DEF(GS,                     REG_HWCAP,  14)
+};
+
+/*
+ * Read AUXV software registers and get cpu features for s390x
+ */
+static void
+rte_cpu_get_features(hwcap_registers_t out)
+{
+       out[REG_HWCAP] = rte_cpu_getauxval(AT_HWCAP);
+       out[REG_HWCAP2] = rte_cpu_getauxval(AT_HWCAP2);
+}
+
+/*
+ * Checks if a particular flag is available on current machine.
+ */
+int
+rte_cpu_get_flag_enabled(enum rte_cpu_flag_t feature)
+{
+       const struct feature_entry *feat;
+       hwcap_registers_t regs = {0};
+
+       if (feature >= RTE_CPUFLAG_NUMFLAGS)
+               return -ENOENT;
+
+       feat = &rte_cpu_feature_table[feature];
+       if (feat->reg == REG_NONE)
+               return -EFAULT;
+
+       rte_cpu_get_features(regs);
+       return (regs[feat->reg] >> feat->bit) & 1;
+}
+
+const char *
+rte_cpu_get_flag_name(enum rte_cpu_flag_t feature)
+{
+       if (feature >= RTE_CPUFLAG_NUMFLAGS)
+               return NULL;
+       return rte_cpu_feature_table[feature].name;
+}
diff --git a/lib/eal/s390x/rte_cycles.c b/lib/eal/s390x/rte_cycles.c
new file mode 100644
index 0000000000..b29c4454a1
--- /dev/null
+++ b/lib/eal/s390x/rte_cycles.c
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#include "eal_private.h"
+
+uint64_t
+get_tsc_freq_arch(void)
+{
+       return 0;
+}
diff --git a/lib/eal/s390x/rte_hypervisor.c b/lib/eal/s390x/rte_hypervisor.c
new file mode 100644
index 0000000000..22b0c5cc47
--- /dev/null
+++ b/lib/eal/s390x/rte_hypervisor.c
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2018, 2019
+ */
+
+#include "rte_hypervisor.h"
+
+enum rte_hypervisor
+rte_hypervisor_get(void)
+{
+       return RTE_HYPERVISOR_UNKNOWN;
+}
diff --git a/lib/eal/s390x/rte_power_intrinsics.c 
b/lib/eal/s390x/rte_power_intrinsics.c
new file mode 100644
index 0000000000..f00b58ade5
--- /dev/null
+++ b/lib/eal/s390x/rte_power_intrinsics.c
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include "rte_power_intrinsics.h"
+
+/**
+ * This function is not supported on s390x.
+ */
+int
+rte_power_monitor(const struct rte_power_monitor_cond *pmc,
+               const uint64_t tsc_timestamp)
+{
+       RTE_SET_USED(pmc);
+       RTE_SET_USED(tsc_timestamp);
+
+       return -ENOTSUP;
+}
+
+/**
+ * This function is not supported on s390x.
+ */
+int
+rte_power_pause(const uint64_t tsc_timestamp)
+{
+       RTE_SET_USED(tsc_timestamp);
+
+       return -ENOTSUP;
+}
+
+/**
+ * This function is not supported on s390x.
+ */
+int
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+       RTE_SET_USED(lcore_id);
+
+       return -ENOTSUP;
+}
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+               const uint32_t num, const uint64_t tsc_timestamp)
+{
+       RTE_SET_USED(pmc);
+       RTE_SET_USED(num);
+       RTE_SET_USED(tsc_timestamp);
+
+       return -ENOTSUP;
+}
diff --git a/lib/hash/rte_fbk_hash.h b/lib/hash/rte_fbk_hash.h
index b01126999b..956d3f90f9 100644
--- a/lib/hash/rte_fbk_hash.h
+++ b/lib/hash/rte_fbk_hash.h
@@ -123,9 +123,16 @@ rte_fbk_hash_add_key_with_bucket(struct rte_fbk_hash_table 
*ht,
         * corrupted due to race conditions, but it's still possible to
         * overwrite entries that have just been made valid.
         */
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
        const uint64_t new_entry = ((uint64_t)(key) << 32) |
                        ((uint64_t)(value) << 16) |
                        1;  /* 1 = is_entry bit. */
+#else
+       const uint64_t new_entry =
+                       ((uint64_t)(1) << 48) | /* 1 = is_entry bit. */
+                       ((uint64_t)(value) << 32) |
+                       (uint64_t)(key);
+#endif
        uint32_t i;
 
        for (i = 0; i < ht->entries_per_bucket; i++) {
diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build
index 78d91d3421..20f76368fa 100644
--- a/lib/lpm/meson.build
+++ b/lib/lpm/meson.build
@@ -13,6 +13,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h')
 # without worrying about which architecture we actually need
 indirect_headers += files(
         'rte_lpm_altivec.h',
+        'rte_lpm_s390x.h',
         'rte_lpm_neon.h',
         'rte_lpm_sse.h',
         'rte_lpm_sve.h',
diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h
index eb91960e81..b9ee616c1d 100644
--- a/lib/lpm/rte_lpm.h
+++ b/lib/lpm/rte_lpm.h
@@ -405,6 +405,8 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, 
uint32_t hop[4],
 #endif
 #elif defined(RTE_ARCH_PPC_64)
 #include "rte_lpm_altivec.h"
+#elif defined(RTE_ARCH_S390X)
+#include "rte_lpm_s390x.h"
 #else
 #include "rte_lpm_sse.h"
 #endif
diff --git a/lib/lpm/rte_lpm6.c b/lib/lpm/rte_lpm6.c
index 8d21aeddb8..4a0f5740a2 100644
--- a/lib/lpm/rte_lpm6.c
+++ b/lib/lpm/rte_lpm6.c
@@ -18,6 +18,7 @@
 #include <assert.h>
 #include <rte_jhash.h>
 #include <rte_tailq.h>
+#include <rte_byteorder.h>
 
 #include "rte_lpm6.h"
 
@@ -52,6 +53,8 @@ static struct rte_tailq_elem rte_lpm6_tailq = {
 };
 EAL_REGISTER_TAILQ(rte_lpm6_tailq)
 
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+
 /** Tbl entry structure. It is the same for both tbl24 and tbl8 */
 struct rte_lpm6_tbl_entry {
        uint32_t next_hop:      21;  /**< Next hop / next table to be checked. 
*/
@@ -63,6 +66,21 @@ struct rte_lpm6_tbl_entry {
        uint32_t ext_entry :1;   /**< External entry. */
 };
 
+#else
+
+struct rte_lpm6_tbl_entry {
+
+       /* Flags. */
+       uint32_t ext_entry :1;   /**< External entry. */
+       uint32_t valid_group :1; /**< Group validation flag. */
+       uint32_t valid     :1;   /**< Validation flag. */
+
+       uint32_t depth  :8;      /**< Rule depth. */
+       uint32_t next_hop:      21;  /**< Next hop / next table to be checked. 
*/
+};
+
+#endif
+
 /** Rules tbl entry structure. */
 struct rte_lpm6_rule {
        uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */
diff --git a/lib/lpm/rte_lpm_s390x.h b/lib/lpm/rte_lpm_s390x.h
new file mode 100644
index 0000000000..eb1fdd4509
--- /dev/null
+++ b/lib/lpm/rte_lpm_s390x.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * (c) Copyright IBM Corp. 2016, 2018
+ */
+
+#ifndef _RTE_LPM_S390X_H_
+#define _RTE_LPM_S390X_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+       uint32_t defv)
+{
+       typedef int vector_signed_int
+               __attribute__((vector_size(4*sizeof(int))));
+       vector_signed_int i24;
+       rte_xmm_t i8;
+       uint32_t tbl[4];
+       uint64_t idx, pt, pt2;
+       const uint32_t *ptbl;
+
+       const uint32_t mask = UINT8_MAX;
+       const vector_signed_int mask8 = (xmm_t){mask, mask, mask, mask};
+
+       /*
+        * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
+        * as one 64-bit value (0x0300000003000000).
+        */
+       const uint64_t mask_xv =
+               ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
+               (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
+
+       /*
+        * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
+        * as one 64-bit value (0x0100000001000000).
+        */
+       const uint64_t mask_v =
+               ((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
+               (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
+
+       /* get 4 indexes for tbl24[]. */
+       i24[0] = (uint32_t)ip[0] >> 8;
+       i24[1] = (uint32_t)ip[1] >> 8;
+       i24[2] = (uint32_t)ip[2] >> 8;
+       i24[3] = (uint32_t)ip[3] >> 8;
+
+       /* extract values from tbl24[] */
+       idx = (uint32_t)i24[0];
+       idx = idx < (1<<24) ? idx : (1<<24)-1;
+       ptbl = (const uint32_t *)&lpm->tbl24[idx];
+       tbl[0] = *ptbl;
+
+       idx = (uint32_t) i24[1];
+       idx = idx < (1<<24) ? idx : (1<<24)-1;
+       ptbl = (const uint32_t *)&lpm->tbl24[idx];
+       tbl[1] = *ptbl;
+
+       idx = (uint32_t) i24[2];
+       idx = idx < (1<<24) ? idx : (1<<24)-1;
+       ptbl = (const uint32_t *)&lpm->tbl24[idx];
+       tbl[2] = *ptbl;
+
+       idx = (uint32_t) i24[3];
+       idx = idx < (1<<24) ? idx : (1<<24)-1;
+       ptbl = (const uint32_t *)&lpm->tbl24[idx];
+       tbl[3] = *ptbl;
+
+       /* get 4 indexes for tbl8[]. */
+       i8.x = vec_and(ip, mask8);
+
+       pt = (uint64_t)tbl[0] |
+               (uint64_t)tbl[1] << 32;
+       pt2 = (uint64_t)tbl[2] |
+               (uint64_t)tbl[3] << 32;
+
+       /* search successfully finished for all 4 IP addresses. */
+       if (likely((pt & mask_xv) == mask_v) &&
+                       likely((pt2 & mask_xv) == mask_v)) {
+               *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
+               *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
+               return;
+       }
+
+       if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+               i8.u32[0] = i8.u32[0] +
+                       (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+               ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
+               tbl[0] = *ptbl;
+       }
+       if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+               i8.u32[1] = i8.u32[1] +
+                       (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+               ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
+               tbl[1] = *ptbl;
+       }
+       if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+               i8.u32[2] = i8.u32[2] +
+                       (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+               ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
+               tbl[2] = *ptbl;
+       }
+       if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+               i8.u32[3] = i8.u32[3] +
+                       (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+               ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
+               tbl[3] = *ptbl;
+       }
+
+       hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv;
+       hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv;
+       hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv;
+       hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_S390X_H_ */
diff --git a/meson.build b/meson.build
index 937f6110c0..8c8d673609 100644
--- a/meson.build
+++ b/meson.build
@@ -50,6 +50,8 @@ elif host_machine.cpu_family().startswith('arm') or 
host_machine.cpu_family().st
     arch_subdir = 'arm'
 elif host_machine.cpu_family().startswith('ppc')
     arch_subdir = 'ppc'
+elif host_machine.cpu_family().startswith('s390x')
+    arch_subdir = 's390x'
 endif
 
 # configure the build, and make sure configs here and in config folder are
-- 
2.37.2


Reply via email to