A vector implementation of memcpy based on LoongArch hardware instructions
will come later. For now, this generic implementation works correctly.

Signed-off-by: Min Zhou <zhou...@loongson.cn>
---
 lib/eal/loongarch/include/rte_memcpy.h | 193 +++++++++++++++++++++++++
 lib/eal/loongarch/include/rte_vect.h   |  46 ++++++
 2 files changed, 239 insertions(+)
 create mode 100644 lib/eal/loongarch/include/rte_memcpy.h
 create mode 100644 lib/eal/loongarch/include/rte_vect.h
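
For reviewers, a minimal usage sketch (illustrative only, not part of the
patch; the copy_burst() helper below is hypothetical). It simply exercises
the rte_memcpy() macro exported by the new header:

    #include <rte_memcpy.h>

    /* Copy one buffer; rte_memcpy() expands to rte_memcpy_func(), which
     * selects the 16/32/64/128/256-byte paths based on len. */
    static inline void
    copy_burst(void *dst, const void *src, size_t len)
    {
            rte_memcpy(dst, src, len);
    }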

diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
new file mode 100644
index 0000000000..98dc3dfc3b
--- /dev/null
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _RTE_MEMCPY_LOONGARCH_H_
+#define _RTE_MEMCPY_LOONGARCH_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcpy.h"
+
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+       *(xmm_t *)dst = *(const xmm_t *)src;
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+       rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+       rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+       rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+       rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+       rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+       rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+       rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+       rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+       rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+       rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+       rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+       rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+       rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+       rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+       rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+       rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+       rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+       rte_mov128(dst, src);
+       rte_mov128(dst + 128, src + 128);
+}
+
+#define rte_memcpy(dst, src, n)      \
+       rte_memcpy_func((dst), (src), (n))
+
+static inline void *
+rte_memcpy_func(void *dst, const void *src, size_t n)
+{
+       void *ret = dst;
+
+       /* We can't copy < 16 bytes using XMM registers so do it manually. */
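+       /* e.g. n == 13 (binary 1101) takes the 1-, 4- and 8-byte branches below. */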
+       if (n < 16) {
+               if (n & 0x01) {
+                       *(uint8_t *)dst = *(const uint8_t *)src;
+                       dst = (uint8_t *)dst + 1;
+                       src = (const uint8_t *)src + 1;
+               }
+               if (n & 0x02) {
+                       *(uint16_t *)dst = *(const uint16_t *)src;
+                       dst = (uint16_t *)dst + 1;
+                       src = (const uint16_t *)src + 1;
+               }
+               if (n & 0x04) {
+                       *(uint32_t *)dst = *(const uint32_t *)src;
+                       dst = (uint32_t *)dst + 1;
+                       src = (const uint32_t *)src + 1;
+               }
+               if (n & 0x08)
+                       *(uint64_t *)dst = *(const uint64_t *)src;
+               return ret;
+       }
+
+       /* Special fast cases for <= 128 bytes */
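+       /*
+        * Each case copies the head and the tail of the buffer; the two
+        * moves may overlap, which is harmless and covers every length in
+        * the range without a byte loop.
+        */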
+       if (n <= 32) {
+               rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+               rte_mov16((uint8_t *)dst - 16 + n,
+                       (const uint8_t *)src - 16 + n);
+               return ret;
+       }
+
+       if (n <= 64) {
+               rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+               rte_mov32((uint8_t *)dst - 32 + n,
+                       (const uint8_t *)src - 32 + n);
+               return ret;
+       }
+
+       if (n <= 128) {
+               rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+               rte_mov64((uint8_t *)dst - 64 + n,
+                       (const uint8_t *)src - 64 + n);
+               return ret;
+       }
+
+       /*
+        * For large copies (> 128 bytes), this combination of 256-, 64- and
+        * 16-byte copies was found to be faster than using 128- and 32-byte
+        * copies.
+        */
+       for ( ; n >= 256; n -= 256) {
+               rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+               dst = (uint8_t *)dst + 256;
+               src = (const uint8_t *)src + 256;
+       }
+
+       /*
+        * We split the remaining bytes (which will be less than 256) into
+        * 64-byte (2^6) chunks.
+        * Using incrementing integers in the case labels of a switch statement
+        * encourages the compiler to use a jump table. To get incrementing
+        * integers, we shift the two relevant bits down to the LSB position
+        * (which yields decrementing integers) and subtract the result from 3.
+        */
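+       /*
+        * Worked example: with 200 bytes remaining, n >> 6 == 3, so the
+        * switch starts at case 0 and falls through all three 64-byte
+        * copies, leaving n == 8 for the final tail copy below.
+        */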
+       switch (3 - (n >> 6)) {
+       case 0x00:
+               rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+               n -= 64;
+               dst = (uint8_t *)dst + 64;
+               src = (const uint8_t *)src + 64;      /* fallthrough */
+       case 0x01:
+               rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+               n -= 64;
+               dst = (uint8_t *)dst + 64;
+               src = (const uint8_t *)src + 64;      /* fallthrough */
+       case 0x02:
+               rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+               n -= 64;
+               dst = (uint8_t *)dst + 64;
+               src = (const uint8_t *)src + 64;      /* fallthrough */
+       default:
+               break;
+       }
+
+       /*
+        * We split the remaining bytes (which will be less than 64) into
+        * 16-byte (2^4) chunks, using the same switch structure as above.
+        */
+       switch (3 - (n >> 4)) {
+       case 0x00:
+               rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+               n -= 16;
+               dst = (uint8_t *)dst + 16;
+               src = (const uint8_t *)src + 16;      /* fallthrough */
+       case 0x01:
+               rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+               n -= 16;
+               dst = (uint8_t *)dst + 16;
+               src = (const uint8_t *)src + 16;      /* fallthrough */
+       case 0x02:
+               rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+               n -= 16;
+               dst = (uint8_t *)dst + 16;
+               src = (const uint8_t *)src + 16;      /* fallthrough */
+       default:
+               break;
+       }
+
+       /* Copy any remaining bytes, without going beyond end of buffers */
+       if (n != 0)
+               rte_mov16((uint8_t *)dst - 16 + n,
+                       (const uint8_t *)src - 16 + n);
+       return ret;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCPY_LOONGARCH_H_ */
diff --git a/lib/eal/loongarch/include/rte_vect.h b/lib/eal/loongarch/include/rte_vect.h
new file mode 100644
index 0000000000..3e96fdd958
--- /dev/null
+++ b/lib/eal/loongarch/include/rte_vect.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _RTE_VECT_LOONGARCH_H_
+#define _RTE_VECT_LOONGARCH_H_
+
+#include <stdint.h>
+#include "rte_common.h"
+#include "generic/rte_vect.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_VECT_DEFAULT_SIMD_BITWIDTH RTE_VECT_SIMD_256
+
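+/*
+ * Generic 16-byte vector type. Until the hardware-instruction based vector
+ * implementation mentioned in the commit message is added, rte_memcpy()
+ * moves data through this union with ordinary loads and stores.
+ */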
+typedef union xmm {
+       int8_t   i8[16];
+       int16_t  i16[8];
+       int32_t  i32[4];
+       int64_t  i64[2];
+       uint8_t  u8[16];
+       uint16_t u16[8];
+       uint32_t u32[4];
+       uint64_t u64[2];
+       double   pd[2];
+} __rte_aligned(16) xmm_t;
+
+#define XMM_SIZE        (sizeof(xmm_t))
+#define XMM_MASK        (XMM_SIZE - 1)
+
+typedef union rte_xmm {
+       xmm_t x;
+       uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
+       uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
+       uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
+       uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
+       double   pd[XMM_SIZE / sizeof(double)];
+} __rte_aligned(16) rte_xmm_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_VECT_LOONGARCH_H_ */
-- 
2.31.1
