This patch includes: - XVSHUF.{B/H/W/D}; - XVPERM.W; - XVSHUF4I.{B/H/W/D}; - XVPERMI.{W/D/Q}; - XVEXTRINS.{B/H/W/D}.
Signed-off-by: Song Gao <gaos...@loongson.cn> --- target/loongarch/disas.c | 21 +++ target/loongarch/helper.h | 33 ++-- target/loongarch/insn_trans/trans_lasx.c.inc | 21 +++ target/loongarch/insns.decode | 21 +++ target/loongarch/vec.h | 2 + target/loongarch/vec_helper.c | 159 ++++++++++++++----- 6 files changed, 200 insertions(+), 57 deletions(-) diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c index 9b6a07bbb0..a518c59772 100644 --- a/target/loongarch/disas.c +++ b/target/loongarch/disas.c @@ -2575,3 +2575,24 @@ INSN_LASX(xvilvh_b, vvv) INSN_LASX(xvilvh_h, vvv) INSN_LASX(xvilvh_w, vvv) INSN_LASX(xvilvh_d, vvv) + +INSN_LASX(xvshuf_b, vvvv) +INSN_LASX(xvshuf_h, vvv) +INSN_LASX(xvshuf_w, vvv) +INSN_LASX(xvshuf_d, vvv) + +INSN_LASX(xvperm_w, vvv) + +INSN_LASX(xvshuf4i_b, vv_i) +INSN_LASX(xvshuf4i_h, vv_i) +INSN_LASX(xvshuf4i_w, vv_i) +INSN_LASX(xvshuf4i_d, vv_i) + +INSN_LASX(xvpermi_w, vv_i) +INSN_LASX(xvpermi_d, vv_i) +INSN_LASX(xvpermi_q, vv_i) + +INSN_LASX(xvextrins_d, vv_i) +INSN_LASX(xvextrins_w, vv_i) +INSN_LASX(xvextrins_h, vv_i) +INSN_LASX(xvextrins_b, vv_i) diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h index ce6dc97500..3c969c9c9b 100644 --- a/target/loongarch/helper.h +++ b/target/loongarch/helper.h @@ -699,18 +699,21 @@ DEF_HELPER_5(vilvh_h, void, env, i32, i32, i32, i32) DEF_HELPER_5(vilvh_w, void, env, i32, i32, i32, i32) DEF_HELPER_5(vilvh_d, void, env, i32, i32, i32, i32) -DEF_HELPER_5(vshuf_b, void, env, i32, i32, i32, i32) -DEF_HELPER_4(vshuf_h, void, env, i32, i32, i32) -DEF_HELPER_4(vshuf_w, void, env, i32, i32, i32) -DEF_HELPER_4(vshuf_d, void, env, i32, i32, i32) -DEF_HELPER_4(vshuf4i_b, void, env, i32, i32, i32) -DEF_HELPER_4(vshuf4i_h, void, env, i32, i32, i32) -DEF_HELPER_4(vshuf4i_w, void, env, i32, i32, i32) -DEF_HELPER_4(vshuf4i_d, void, env, i32, i32, i32) - -DEF_HELPER_4(vpermi_w, void, env, i32, i32, i32) - -DEF_HELPER_4(vextrins_b, void, env, i32, i32, i32) -DEF_HELPER_4(vextrins_h, void, env, i32, 
i32, i32) -DEF_HELPER_4(vextrins_w, void, env, i32, i32, i32) -DEF_HELPER_4(vextrins_d, void, env, i32, i32, i32) +DEF_HELPER_6(vshuf_b, void, env, i32, i32, i32, i32, i32) +DEF_HELPER_5(vshuf_h, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vshuf_w, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vshuf_d, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vshuf4i_b, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vshuf4i_h, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vshuf4i_w, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vshuf4i_d, void, env, i32, i32, i32, i32) + +DEF_HELPER_5(vperm_w, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vpermi_w, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vpermi_d, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vpermi_q, void, env, i32, i32, i32, i32) + +DEF_HELPER_5(vextrins_b, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vextrins_h, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vextrins_w, void, env, i32, i32, i32, i32) +DEF_HELPER_5(vextrins_d, void, env, i32, i32, i32, i32) diff --git a/target/loongarch/insn_trans/trans_lasx.c.inc b/target/loongarch/insn_trans/trans_lasx.c.inc index c059e2fdcc..b8d9b42070 100644 --- a/target/loongarch/insn_trans/trans_lasx.c.inc +++ b/target/loongarch/insn_trans/trans_lasx.c.inc @@ -962,3 +962,24 @@ TRANS(xvilvh_b, gen_vvv, 32, gen_helper_vilvh_b) TRANS(xvilvh_h, gen_vvv, 32, gen_helper_vilvh_h) TRANS(xvilvh_w, gen_vvv, 32, gen_helper_vilvh_w) TRANS(xvilvh_d, gen_vvv, 32, gen_helper_vilvh_d) + +TRANS(xvshuf_b, gen_vvvv, 32, gen_helper_vshuf_b) +TRANS(xvshuf_h, gen_vvv, 32, gen_helper_vshuf_h) +TRANS(xvshuf_w, gen_vvv, 32, gen_helper_vshuf_w) +TRANS(xvshuf_d, gen_vvv, 32, gen_helper_vshuf_d) + +TRANS(xvperm_w, gen_vvv, 32, gen_helper_vperm_w) + +TRANS(xvshuf4i_b, gen_vv_i, 32, gen_helper_vshuf4i_b) +TRANS(xvshuf4i_h, gen_vv_i, 32, gen_helper_vshuf4i_h) +TRANS(xvshuf4i_w, gen_vv_i, 32, gen_helper_vshuf4i_w) +TRANS(xvshuf4i_d, gen_vv_i, 32, gen_helper_vshuf4i_d) + +TRANS(xvpermi_w, gen_vv_i, 32, 
gen_helper_vpermi_w) +TRANS(xvpermi_d, gen_vv_i, 32, gen_helper_vpermi_d) +TRANS(xvpermi_q, gen_vv_i, 32, gen_helper_vpermi_q) + +TRANS(xvextrins_b, gen_vv_i, 32, gen_helper_vextrins_b) +TRANS(xvextrins_h, gen_vv_i, 32, gen_helper_vextrins_h) +TRANS(xvextrins_w, gen_vv_i, 32, gen_helper_vextrins_w) +TRANS(xvextrins_d, gen_vv_i, 32, gen_helper_vextrins_d) diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode index a325b861c1..64b67ee9ac 100644 --- a/target/loongarch/insns.decode +++ b/target/loongarch/insns.decode @@ -2039,3 +2039,24 @@ xvilvh_b 0111 01010001 11000 ..... ..... ..... @vvv xvilvh_h 0111 01010001 11001 ..... ..... ..... @vvv xvilvh_w 0111 01010001 11010 ..... ..... ..... @vvv xvilvh_d 0111 01010001 11011 ..... ..... ..... @vvv + +xvshuf_b 0000 11010110 ..... ..... ..... ..... @vvvv +xvshuf_h 0111 01010111 10101 ..... ..... ..... @vvv +xvshuf_w 0111 01010111 10110 ..... ..... ..... @vvv +xvshuf_d 0111 01010111 10111 ..... ..... ..... @vvv + +xvperm_w 0111 01010111 11010 ..... ..... ..... @vvv + +xvshuf4i_b 0111 01111001 00 ........ ..... ..... @vv_ui8 +xvshuf4i_h 0111 01111001 01 ........ ..... ..... @vv_ui8 +xvshuf4i_w 0111 01111001 10 ........ ..... ..... @vv_ui8 +xvshuf4i_d 0111 01111001 11 ........ ..... ..... @vv_ui8 + +xvpermi_w 0111 01111110 01 ........ ..... ..... @vv_ui8 +xvpermi_d 0111 01111110 10 ........ ..... ..... @vv_ui8 +xvpermi_q 0111 01111110 11 ........ ..... ..... @vv_ui8 + +xvextrins_d 0111 01111000 00 ........ ..... ..... @vv_ui8 +xvextrins_w 0111 01111000 01 ........ ..... ..... @vv_ui8 +xvextrins_h 0111 01111000 10 ........ ..... ..... @vv_ui8 +xvextrins_b 0111 01111000 11 ........ ..... ..... @vv_ui8 diff --git a/target/loongarch/vec.h b/target/loongarch/vec.h index 06cc5331a3..c41bdd42fa 100644 --- a/target/loongarch/vec.h +++ b/target/loongarch/vec.h @@ -93,4 +93,6 @@ #define VSLE(a, b) (a <= b ? -1 : 0) #define VSLT(a, b) (a < b ? 
-1 : 0) +#define SHF_POS(i, imm) (((i) & 0xfc) + (((imm) >> (2 * ((i) & 0x03))) & 0x03)) + #endif /* LOONGARCH_VEC_H */ diff --git a/target/loongarch/vec_helper.c b/target/loongarch/vec_helper.c index d641c718f6..65e83062c1 100644 --- a/target/loongarch/vec_helper.c +++ b/target/loongarch/vec_helper.c @@ -3426,7 +3426,7 @@ VILVH(vilvh_h, 32, H) VILVH(vilvh_w, 64, W) VILVH(vilvh_d, 128, D) -void HELPER(vshuf_b)(CPULoongArchState *env, +void HELPER(vshuf_b)(CPULoongArchState *env, uint32_t oprsz, uint32_t vd, uint32_t vj, uint32_t vk, uint32_t va) { int i, m; @@ -3436,93 +3436,168 @@ void HELPER(vshuf_b)(CPULoongArchState *env, VReg *Vk = &(env->fpr[vk].vreg); VReg *Va = &(env->fpr[va].vreg); - m = LSX_LEN/8; - for (i = 0; i < m ; i++) { + m = LSX_LEN / 8; + for (i = 0; i < m; i++) { uint64_t k = (uint8_t)Va->B(i) % (2 * m); temp.B(i) = k < m ? Vk->B(k) : Vj->B(k - m); } - *Vd = temp; -} + if (oprsz == 32) { + for(i = m; i < 2 * m; i++) { + uint64_t j = (uint8_t)Va->B(i) % (2 * m); + temp.B(i) = j < m ? Vk->B(j + m) : Vj->B(j); + } + } -#define VSHUF(NAME, BIT, E) \ -void HELPER(NAME)(CPULoongArchState *env, \ - uint32_t vd, uint32_t vj, uint32_t vk) \ -{ \ - int i, m; \ - VReg temp; \ - VReg *Vd = &(env->fpr[vd].vreg); \ - VReg *Vj = &(env->fpr[vj].vreg); \ - VReg *Vk = &(env->fpr[vk].vreg); \ - \ - m = LSX_LEN/BIT; \ - for (i = 0; i < m; i++) { \ - uint64_t k = ((uint8_t) Vd->E(i)) % (2 * m); \ - temp.E(i) = k < m ? 
Vk->E(k) : Vj->E(k - m); \ - } \ - *Vd = temp; \ + *Vd = temp; } -VSHUF(vshuf_h, 16, H) -VSHUF(vshuf_w, 32, W) -VSHUF(vshuf_d, 64, D) - -#define VSHUF4I(NAME, BIT, E) \ -void HELPER(NAME)(CPULoongArchState *env, \ - uint32_t vd, uint32_t vj, uint32_t imm) \ +#define VSHUF(NAME, BIT, E) \ +void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz, \ + uint32_t vd, uint32_t vj, uint32_t vk) \ { \ - int i; \ + int i, m; \ VReg temp; \ VReg *Vd = &(env->fpr[vd].vreg); \ VReg *Vj = &(env->fpr[vj].vreg); \ + VReg *Vk = &(env->fpr[vk].vreg); \ \ - for (i = 0; i < LSX_LEN/BIT; i++) { \ - temp.E(i) = Vj->E(((i) & 0xfc) + (((imm) >> \ - (2 * ((i) & 0x03))) & 0x03)); \ + m = LSX_LEN / BIT; \ + for (i = 0; i < m; i++) { \ + uint64_t k = (uint8_t)Vd->E(i) % (2 * m); \ + temp.E(i) = k < m ? Vk->E(k) : Vj->E(k - m); \ + } \ + if (oprsz == 32) { \ + for (i = m; i < 2 * m; i++) { \ + uint64_t j = (uint8_t)Vd->E(i) % (2 * m); \ + temp.E(i) = j < m ? Vk->E(j + m): Vj->E(j); \ + } \ } \ *Vd = temp; \ } +VSHUF(vshuf_h, 16, H) +VSHUF(vshuf_w, 32, W) +VSHUF(vshuf_d, 64, D) + +#define VSHUF4I(NAME, BIT, E) \ +void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz, \ + uint32_t vd, uint32_t vj, uint32_t imm) \ +{ \ + int i, max; \ + VReg temp; \ + VReg *Vd = &(env->fpr[vd].vreg); \ + VReg *Vj = &(env->fpr[vj].vreg); \ + \ + max = LSX_LEN / BIT; \ + for (i = 0; i < max; i++) { \ + temp.E(i) = Vj->E(SHF_POS(i, imm)); \ + } \ + if (oprsz == 32) { \ + for (i = max; i < 2 * max; i++) { \ + temp.E(i) = Vj->E(SHF_POS(i - max, imm) + max); \ + } \ + } \ + *Vd = temp; \ +} + VSHUF4I(vshuf4i_b, 8, B) VSHUF4I(vshuf4i_h, 16, H) VSHUF4I(vshuf4i_w, 32, W) -void HELPER(vshuf4i_d)(CPULoongArchState *env, +void HELPER(vshuf4i_d)(CPULoongArchState *env, uint32_t oprsz, uint32_t vd, uint32_t vj, uint32_t imm) { + int i, max; VReg *Vd = &(env->fpr[vd].vreg); VReg *Vj = &(env->fpr[vj].vreg); VReg temp; - temp.D(0) = (imm & 2 ? Vj : Vd)->D(imm & 1); - temp.D(1) = (imm & 8 ? 
Vj : Vd)->D((imm >> 2) & 1); + max = (oprsz == 16) ? 1 : 2; + for (i = 0; i < max; i++) { + temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i); + temp.D(2 * i + 1) = (imm & 8 ? Vj : Vd)->D(((imm >> 2) & 1) + 2 * i); + } + *Vd = temp; +} + +void HELPER(vperm_w)(CPULoongArchState *env, uint32_t oprsz, + uint32_t vd, uint32_t vj, uint32_t vk) +{ + int i, m; + VReg temp; + VReg *Vd = &(env->fpr[vd].vreg); + VReg *Vj = &(env->fpr[vj].vreg); + VReg *Vk = &(env->fpr[vk].vreg); + + m = LASX_LEN / 32; + for (i = 0; i < m ; i++) { + uint64_t k = (uint8_t)Vk->W(i) % 8; + temp.W(i) = Vj->W(k); + } + *Vd = temp; +} + +void HELPER(vpermi_w)(CPULoongArchState *env, uint32_t oprsz, + uint32_t vd, uint32_t vj, uint32_t imm) +{ + int i, max; + VReg temp; + VReg *Vd = &(env->fpr[vd].vreg); + VReg *Vj = &(env->fpr[vj].vreg); + + max = (oprsz == 16) ? 1 : 2; + + for (i = 0; i < max; i++) { + temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i); + temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i); + temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i); + temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i); + } + *Vd = temp; +} + +void HELPER(vpermi_d)(CPULoongArchState *env, uint32_t oprsz, + uint32_t vd, uint32_t vj, uint32_t imm) +{ + VReg temp; + VReg *Vd = &(env->fpr[vd].vreg); + VReg *Vj = &(env->fpr[vj].vreg); + + temp.D(0) = Vj->D(imm & 0x3); + temp.D(1) = Vj->D((imm >> 2) & 0x3); + temp.D(2) = Vj->D((imm >> 4) & 0x3); + temp.D(3) = Vj->D((imm >> 6) & 0x3); *Vd = temp; } -void HELPER(vpermi_w)(CPULoongArchState *env, +void HELPER(vpermi_q)(CPULoongArchState *env, uint32_t oprsz, uint32_t vd, uint32_t vj, uint32_t imm) { VReg temp; VReg *Vd = &(env->fpr[vd].vreg); VReg *Vj = &(env->fpr[vj].vreg); - temp.W(0) = Vj->W(imm & 0x3); - temp.W(1) = Vj->W((imm >> 2) & 0x3); - temp.W(2) = Vd->W((imm >> 4) & 0x3); - temp.W(3) = Vd->W((imm >> 6) & 0x3); + temp.Q(0) = (imm & 0x3) > 1 ? Vd->Q((imm & 0x3) - 2) : Vj->Q(imm & 0x3); + temp.Q(1) = ((imm >> 4) & 0x3) > 1 ? 
Vd->Q(((imm >> 4) & 0x3) - 2) : + Vj->Q((imm >> 4) & 0x3); *Vd = temp; } #define VEXTRINS(NAME, BIT, E, MASK) \ -void HELPER(NAME)(CPULoongArchState *env, \ +void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz, \ uint32_t vd, uint32_t vj, uint32_t imm) \ { \ - int ins, extr; \ + int ins, extr, max; \ VReg *Vd = &(env->fpr[vd].vreg); \ VReg *Vj = &(env->fpr[vj].vreg); \ \ + max = LSX_LEN / BIT; \ ins = (imm >> 4) & MASK; \ extr = imm & MASK; \ Vd->E(ins) = Vj->E(extr); \ + if (oprsz == 32) { \ + Vd->E(ins + max) = Vj->E(extr + max); \ + } \ } VEXTRINS(vextrins_b, 8, B, 0xf) -- 2.39.1