Hi, Wangrui:

Please double-check whether this patch is acceptable.

Thanks!

在 2025/10/29 下午6:49, chenxiaolong 写道:
gcc/ChangeLog:

        * config/loongarch/lasx.md (vec_cast<mode>): New template
        implementation.
        (vec_insert_lo_<mode>): Ditto.
        (vec_insert_hi_<mode>): Ditto.
        * config/loongarch/lasxintrin.h (defined): Test for adding
        the builtin function.
        (__lasx_cast_128_s): Ditto.
        (__lasx_cast_128_d): Ditto.
        (__lasx_cast_128): Ditto.
        (__lasx_concat_128_s): Ditto.
        (__lasx_concat_128_d): Ditto.
        (__lasx_concat_128): Ditto.
        (__lasx_extract_128_lo_s): Ditto.
        (__lasx_extract_128_hi_s): Ditto.
        (__lasx_extract_128_lo_d): Ditto.
        (__lasx_extract_128_hi_d): Ditto.
        (__lasx_extract_128_lo): Ditto.
        (__lasx_extract_128_hi): Ditto.
        (__lasx_insert_128_lo_s): Ditto.
        (__lasx_insert_128_hi_s): Ditto.
        (__lasx_insert_128_lo_d): Ditto.
        (__lasx_insert_128_hi_d): Ditto.
        (__lasx_insert_128_lo): Ditto.
        (__lasx_insert_128_hi): Ditto.
        * config/loongarch/loongarch-builtins.cc
        (CODE_FOR_lasx_extract_128_lo_s): Add builtins and register
        icode.
        (CODE_FOR_lasx_extract_128_hi_s): Ditto.
        (CODE_FOR_lasx_extract_128_lo_d): Ditto.
        (CODE_FOR_lasx_extract_128_hi_d): Ditto.
        (CODE_FOR_lasx_extract_128_lo): Ditto.
        (CODE_FOR_lasx_extract_128_hi): Ditto.
        (CODE_FOR_lasx_insert_128_lo_s): Ditto.
        (CODE_FOR_lasx_insert_128_hi_s): Ditto.
        (CODE_FOR_lasx_insert_128_lo_d): Ditto.
        (CODE_FOR_lasx_insert_128_hi_d): Ditto.
        (CODE_FOR_lasx_insert_128_lo): Ditto.
        (CODE_FOR_lasx_insert_128_hi): Ditto.
        (CODE_FOR_lasx_concat_128_s): Ditto.
        (CODE_FOR_lasx_concat_128_d): Ditto.
        (CODE_FOR_lasx_concat_128): Ditto.
        (CODE_FOR_lasx_cast_128_s): Ditto.
        (CODE_FOR_lasx_cast_128_d): Ditto.
        (CODE_FOR_lasx_cast_128): Ditto.
        (loongarch_expand_builtin_direct): For the newly added
        insertion or extraction, construct the parallel parameter
        corresponding to the operand.
        * config/loongarch/loongarch-c.cc
        (loongarch_update_cpp_builtins): Define
        __loongarch_asx_sx_conv.
        * config/loongarch/loongarch-ftypes.def: Declare the type
        of the builtin function.
        * doc/extend.texi: Add document description.

gcc/testsuite/ChangeLog:

        * gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c:
        New test.
        * gcc.target/loongarch/vector/lasx/vect-concat-128-256.c: New test.
        * gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c:
        New test.
        * gcc.target/loongarch/vector/lasx/vect-extract-256-128.c: New test.
        * gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c:
        New test.
        * gcc.target/loongarch/vector/lasx/vect-insert-128-256.c: New test.

Change-Id: I44edabf9d1eca4a9627697cb586de73833e4923a
---
  gcc/config/loongarch/lasx.md                  |  36 +++
  gcc/config/loongarch/lasxintrin.h             | 156 +++++++++++
  gcc/config/loongarch/loongarch-builtins.cc    |  90 +++++-
  gcc/config/loongarch/loongarch-c.cc           |   1 +
  gcc/config/loongarch/loongarch-ftypes.def     |  12 +
  gcc/doc/extend.texi                           | 258 +++++++++++++++++-
  .../vector/lasx/vect-concat-128-256-result.c  |  68 +++++
  .../vector/lasx/vect-concat-128-256.c         |  92 +++++++
  .../vector/lasx/vect-extract-256-128-result.c |  69 +++++
  .../vector/lasx/vect-extract-256-128.c        |  86 ++++++
  .../vector/lasx/vect-insert-128-256-result.c  |  97 +++++++
  .../vector/lasx/vect-insert-128-256.c         |  95 +++++++
  12 files changed, 1058 insertions(+), 2 deletions(-)
  create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c
  create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256.c
  create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c
  create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128.c
  create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c
  create mode 100644 
gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256.c

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 3048c483408..80278d0fb7e 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -130,6 +130,7 @@ (define_mode_iterator LASX_D [V4DI V4DF])
;; Only used for splitting insert_d and copy_{u,s}.d.
  (define_mode_iterator LASX_WD [V4DI V4DF V8SI V8SF])
+(define_mode_iterator LASX_PART [V4DI V4DF V8SF])
;; Only used for copy256_{u,s}.w.
  (define_mode_iterator LASX_W    [V8SI V8SF])
@@ -675,6 +676,41 @@ (define_insn "vec_extract_hi_<mode>"
    [(set_attr "move_type" "fmove")
     (set_attr "mode" "<MODE>")])
+;; vr0 -> low xr0
+;;
+(define_insn "vec_cast<mode>"
+  [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+       (subreg:LASX_PART
+         (match_operand:<VHMODE256_ALL> 1 "register_operand" "0") 0))]
+  "ISA_HAS_LASX"
+  ""
+  [(set_attr "type" "simd_splat")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "vec_insert_lo_<mode>"
+  [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+    (vec_concat:LASX_PART
+      (match_operand:<VHMODE256_ALL> 2 "register_operand" "f")
+      (vec_select:<VHMODE256_ALL>
+       (match_operand:LASX_PART 1 "register_operand" "0")
+       (match_operand:LASX_PART 3 "vect_par_cnst_high_half"))))]
+  "ISA_HAS_LASX"
+  "xvpermi.q\t%u0,%u2,0x30"
+  [(set_attr "type" "simd_splat")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "vec_insert_hi_<mode>"
+  [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+    (vec_concat:LASX_PART
+      (vec_select:<VHMODE256_ALL>
+       (match_operand:LASX_PART 1 "register_operand" "0")
+       (match_operand:LASX_PART 3 "vect_par_cnst_low_half"))
+      (match_operand:<VHMODE256_ALL> 2 "register_operand" "f")))]
+  "ISA_HAS_LASX"
+  "xvpermi.q\t%u0,%u2,0x02"
+  [(set_attr "type" "simd_splat")
+   (set_attr "mode" "<MODE>")])
+
  (define_expand "vec_perm<mode>"
   [(match_operand:LASX 0 "register_operand")
    (match_operand:LASX 1 "register_operand")
diff --git a/gcc/config/loongarch/lasxintrin.h 
b/gcc/config/loongarch/lasxintrin.h
index 6bcffc26d4a..6c34edeec25 100644
--- a/gcc/config/loongarch/lasxintrin.h
+++ b/gcc/config/loongarch/lasxintrin.h
@@ -23,6 +23,8 @@
     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     <http://www.gnu.org/licenses/>.  */
+#include <lsxintrin.h>
+
  #ifndef _GCC_LOONGSON_ASXINTRIN_H
  #define _GCC_LOONGSON_ASXINTRIN_H 1
@@ -5368,5 +5370,159 @@ __m256i __lasx_xvfcmp_sun_s (__m256 _1, __m256 _2)
  #define __lasx_xvrepli_w(/*si10*/ _1) \
    ((__m256i)__builtin_lasx_xvrepli_w ((_1)))
+#if defined (__loongarch_asx_sx_conv)
+/* Add builtin interfaces for conversions between 128-bit and 256-bit
+   vectors.
+   For some of the following conversion functions, the assembly
+   instruction format noted in the comments does not exactly match the
+   assembly instruction that is actually generated.
+   The Rust front end selects the appropriate built-in function by
+   analyzing the assembly instruction format.  The data types noted for
+   each instruction follow the function's interface, in this order:
+   output, then inputs.  */
+/* Assembly instruction format:        xd, vj.  */
+/* Data types in instruction templates:  V8SF, V4SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256 __lasx_cast_128_s (__m128 _1)
+{
+  return  (__m256)__builtin_lasx_cast_128_s ((v4f32)_1);
+}
+
+/* Assembly instruction format:        xd, vj.  */
+/* Data types in instruction templates:  V4DF, V2DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256d __lasx_cast_128_d (__m128d _1)
+{
+  return  (__m256d)__builtin_lasx_cast_128_d ((v2f64)_1);
+}
+
+/* Assembly instruction format:        xd, vj.  */
+/* Data types in instruction templates:  V4DI, V2DI.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256i __lasx_cast_128 (__m128i _1)
+{
+  return  (__m256i)__builtin_lasx_cast_128 ((v2i64)_1);
+}
+
+/* Assembly instruction format:        xd, vj, vk.  */
+/* Data types in instruction templates:  V8SF, V4SF, V4SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256 __lasx_concat_128_s (__m128 _1, __m128 _2)
+{
+  return  (__m256)__builtin_lasx_concat_128_s ((v4f32)_1, (v4f32)_2);
+}
+
+/* Assembly instruction format:        xd, vj, vk.  */
+/* Data types in instruction templates:  V4DF, V2DF, V2DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256d __lasx_concat_128_d (__m128d _1, __m128d _2)
+{
+  return  (__m256d)__builtin_lasx_concat_128_d ((v2f64)_1, (v2f64)_2);
+}
+
+/* Assembly instruction format:        xd, vj, vk.  */
+/* Data types in instruction templates:  V4DI, V2DI, V2DI.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256i __lasx_concat_128 (__m128i _1, __m128i _2)
+{
+  return  (__m256i)__builtin_lasx_concat_128 ((v2i64)_1, (v2i64)_2);
+}
+
+/* Assembly instruction format:        vd, xj.  */
+/* Data types in instruction templates:  V4SF, V8SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m128 __lasx_extract_128_lo_s (__m256 _1)
+{
+  return  (__m128)__builtin_lasx_extract_128_lo_s ((v8f32)_1);
+}
+
+/* Assembly instruction format:        vd, xj.  */
+/* Data types in instruction templates:  V4SF, V8SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m128 __lasx_extract_128_hi_s (__m256 _1)
+{
+  return  (__m128)__builtin_lasx_extract_128_hi_s ((v8f32)_1);
+}
+
+/* Assembly instruction format:        vd, xj.  */
+/* Data types in instruction templates:  V2DF, V4DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m128d __lasx_extract_128_lo_d (__m256d _1)
+{
+  return  (__m128d)__builtin_lasx_extract_128_lo_d ((v4f64)_1);
+}
+
+/* Assembly instruction format:        vd, xj.  */
+/* Data types in instruction templates:  V2DF, V4DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m128d __lasx_extract_128_hi_d (__m256d _1)
+{
+  return  (__m128d)__builtin_lasx_extract_128_hi_d ((v4f64)_1);
+}
+
+/* Assembly instruction format:        vd, xj.  */
+/* Data types in instruction templates:  V2DI, V4DI.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m128i __lasx_extract_128_lo (__m256i _1)
+{
+  return  (__m128i)__builtin_lasx_extract_128_lo ((v4i64)_1);
+}
+
+/* Assembly instruction format:        vd, xj.  */
+/* Data types in instruction templates:  V2DI, V4DI.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m128i __lasx_extract_128_hi (__m256i _1)
+{
+  return  (__m128i)__builtin_lasx_extract_128_hi ((v4i64)_1);
+}
+
+/* Assembly instruction format:        xd, xj, vk.  */
+/* Data types in instruction templates:  V8SF, V8SF, V4SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256 __lasx_insert_128_lo_s (__m256 _1, __m128 _2)
+{
+  return  (__m256)__builtin_lasx_insert_128_lo_s ((v8f32)_1, (v4f32)_2);
+}
+
+/* Assembly instruction format:        xd, xj, vk.  */
+/* Data types in instruction templates:  V8SF, V8SF, V4SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256 __lasx_insert_128_hi_s (__m256 _1, __m128 _2)
+{
+  return  (__m256)__builtin_lasx_insert_128_hi_s ((v8f32)_1, (v4f32)_2);
+}
+
+/* Assembly instruction format:        xd, xj, vk.  */
+/* Data types in instruction templates:  V4DF, V4DF, V2DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256d __lasx_insert_128_lo_d (__m256d _1, __m128d _2)
+{
+  return  (__m256d)__builtin_lasx_insert_128_lo_d ((v4f64)_1, (v2f64)_2);
+}
+
+/* Assembly instruction format:        xd, xj, vk.  */
+/* Data types in instruction templates:  V4DF, V4DF, V2DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256d __lasx_insert_128_hi_d (__m256d _1, __m128d _2)
+{
+  return  (__m256d)__builtin_lasx_insert_128_hi_d ((v4f64)_1, (v2f64)_2);
+}
+
+/* Assembly instruction format:        xd, xj, vk.  */
+/* Data types in instruction templates:  V4DI, V4DI, V2DI.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256i __lasx_insert_128_lo (__m256i _1, __m128i _2)
+{
+  return  (__m256i)__builtin_lasx_insert_128_lo ((v4i64)_1, (v2i64)_2);
+}
+
+/* Assembly instruction format:        xd, xj, vk.  */
+/* Data types in instruction templates:  V4DI, V4DI, V2DI.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+__m256i __lasx_insert_128_hi (__m256i _1, __m128i _2)
+{
+  return  (__m256i)__builtin_lasx_insert_128_hi ((v4i64)_1, (v2i64)_2);
+}
+
+#endif /* defined(__loongarch_asx_sx_conv).  */
  #endif /* defined(__loongarch_asx).  */
  #endif /* _GCC_LOONGSON_ASXINTRIN_H.  */
diff --git a/gcc/config/loongarch/loongarch-builtins.cc 
b/gcc/config/loongarch/loongarch-builtins.cc
index 9493dedcab7..312d87626a4 100644
--- a/gcc/config/loongarch/loongarch-builtins.cc
+++ b/gcc/config/loongarch/loongarch-builtins.cc
@@ -865,6 +865,27 @@ AVAIL_ALL (lasx_frecipe, ISA_HAS_LASX && ISA_HAS_FRECIPE)
  #define CODE_FOR_lasx_xvmaddwod_q_du  CODE_FOR_lasx_maddwod_q_du_punned
  #define CODE_FOR_lasx_xvmaddwod_q_du_d        
CODE_FOR_lasx_maddwod_q_du_d_punned
+
+/* Add mutual conversion between 128 and 256 vectors.  */
+#define CODE_FOR_lasx_extract_128_lo_s CODE_FOR_vec_extract_lo_v8sf
+#define CODE_FOR_lasx_extract_128_hi_s CODE_FOR_vec_extract_hi_v8sf
+#define CODE_FOR_lasx_extract_128_lo_d CODE_FOR_vec_extract_lo_v4df
+#define CODE_FOR_lasx_extract_128_hi_d CODE_FOR_vec_extract_hi_v4df
+#define CODE_FOR_lasx_extract_128_lo   CODE_FOR_vec_extract_lo_v4di
+#define CODE_FOR_lasx_extract_128_hi   CODE_FOR_vec_extract_hi_v4di
+#define CODE_FOR_lasx_insert_128_lo_s  CODE_FOR_vec_insert_lo_v8sf
+#define CODE_FOR_lasx_insert_128_hi_s  CODE_FOR_vec_insert_hi_v8sf
+#define CODE_FOR_lasx_insert_128_lo_d  CODE_FOR_vec_insert_lo_v4df
+#define CODE_FOR_lasx_insert_128_hi_d  CODE_FOR_vec_insert_hi_v4df
+#define CODE_FOR_lasx_insert_128_lo    CODE_FOR_vec_insert_lo_v4di
+#define CODE_FOR_lasx_insert_128_hi    CODE_FOR_vec_insert_hi_v4di
+#define CODE_FOR_lasx_concat_128_s     CODE_FOR_vec_concatv8sf
+#define CODE_FOR_lasx_concat_128_d     CODE_FOR_vec_concatv4df
+#define CODE_FOR_lasx_concat_128       CODE_FOR_vec_concatv4di
+#define CODE_FOR_lasx_cast_128_s       CODE_FOR_vec_castv8sf
+#define CODE_FOR_lasx_cast_128_d       CODE_FOR_vec_castv4df
+#define CODE_FOR_lasx_cast_128         CODE_FOR_vec_castv4di
+
  static const struct loongarch_builtin_description loongarch_builtins[] = {
  #define LARCH_MOVFCSR2GR 0
    DIRECT_BUILTIN (movfcsr2gr, LARCH_USI_FTYPE_UQI, hard_float),
@@ -2407,7 +2428,25 @@ static const struct loongarch_builtin_description 
loongarch_builtins[] = {
    LASX_BUILTIN (xvssrarni_bu_h, LARCH_UV32QI_FTYPE_UV32QI_V32QI_USI),
    LASX_BUILTIN (xvssrarni_hu_w, LARCH_UV16HI_FTYPE_UV16HI_V16HI_USI),
    LASX_BUILTIN (xvssrarni_wu_d, LARCH_UV8SI_FTYPE_UV8SI_V8SI_USI),
-  LASX_BUILTIN (xvssrarni_du_q, LARCH_UV4DI_FTYPE_UV4DI_V4DI_USI)
+  LASX_BUILTIN (xvssrarni_du_q, LARCH_UV4DI_FTYPE_UV4DI_V4DI_USI),
+  LASX_BUILTIN (extract_128_lo_s, LARCH_V4SF_FTYPE_V8SF),
+  LASX_BUILTIN (extract_128_hi_s, LARCH_V4SF_FTYPE_V8SF),
+  LASX_BUILTIN (extract_128_lo_d, LARCH_V2DF_FTYPE_V4DF),
+  LASX_BUILTIN (extract_128_hi_d, LARCH_V2DF_FTYPE_V4DF),
+  LASX_BUILTIN (extract_128_lo, LARCH_V2DI_FTYPE_V4DI),
+  LASX_BUILTIN (extract_128_hi, LARCH_V2DI_FTYPE_V4DI),
+  LASX_BUILTIN (insert_128_lo_s, LARCH_V8SF_FTYPE_V8SF_V4SF),
+  LASX_BUILTIN (insert_128_hi_s, LARCH_V8SF_FTYPE_V8SF_V4SF),
+  LASX_BUILTIN (insert_128_lo_d, LARCH_V4DF_FTYPE_V4DF_V2DF),
+  LASX_BUILTIN (insert_128_hi_d, LARCH_V4DF_FTYPE_V4DF_V2DF),
+  LASX_BUILTIN (insert_128_lo, LARCH_V4DI_FTYPE_V4DI_V2DI),
+  LASX_BUILTIN (insert_128_hi, LARCH_V4DI_FTYPE_V4DI_V2DI),
+  LASX_BUILTIN (concat_128_s, LARCH_V8SF_FTYPE_V4SF_V4SF),
+  LASX_BUILTIN (concat_128_d, LARCH_V4DF_FTYPE_V2DF_V2DF),
+  LASX_BUILTIN (concat_128, LARCH_V4DI_FTYPE_V2DI_V2DI),
+  LASX_BUILTIN (cast_128_s, LARCH_V8SF_FTYPE_V4SF),
+  LASX_BUILTIN (cast_128_d, LARCH_V4DF_FTYPE_V2DF),
+  LASX_BUILTIN (cast_128, LARCH_V4DI_FTYPE_V2DI)
  };
/* Index I is the function declaration for loongarch_builtins[I], or null if
@@ -3001,6 +3040,10 @@ loongarch_expand_builtin_direct (enum insn_code icode, 
rtx target, tree exp,
  {
    struct expand_operand ops[MAX_RECOG_OPERANDS];
    int opno, argno;
+  /* For vector extraction/insertion operations, sel_high_p being true
+     indicates that the high of the data is selected/retained from the
+     vector register.  */
+  bool sel_high_p = true;
/* Map any target to operand 0. */
    opno = 0;
@@ -3019,6 +3062,51 @@ loongarch_expand_builtin_direct (enum insn_code icode, 
rtx target, tree exp,
        create_input_operand (&ops[1], CONST1_RTX (ops[0].mode), ops[0].mode);
        return loongarch_expand_builtin_insn (icode, 3, ops, has_target_p);
+ case CODE_FOR_vec_extract_lo_v8sf:
+    case CODE_FOR_vec_extract_lo_v4df:
+    case CODE_FOR_vec_extract_lo_v4di:
+      sel_high_p = false;
+    /* Fall through.  */
+    case CODE_FOR_vec_extract_hi_v8sf:
+    case CODE_FOR_vec_extract_hi_v4df:
+    case CODE_FOR_vec_extract_hi_v4di:
+      {
+       /* The selection method for constructing the high/low half.  */
+       loongarch_prepare_builtin_arg (&ops[1], exp, 0);
+       int nelts = GET_MODE_NUNITS (GET_MODE (ops[1].value));
+       int half_nelts = nelts / 2;
+       int base = sel_high_p ? half_nelts : 0;
+
+       rtx pat_rtx
+           = loongarch_gen_stepped_int_parallel (half_nelts, base, 1);
+       create_input_operand (&ops[2], pat_rtx, ops[1].mode);
+
+       return loongarch_expand_builtin_insn (icode, 3, ops, has_target_p);
+      }
+
+    case CODE_FOR_vec_insert_hi_v8sf:
+    case CODE_FOR_vec_insert_hi_v4df:
+    case CODE_FOR_vec_insert_hi_v4di:
+      sel_high_p = false;
+    /* Fall through.  */
+    case CODE_FOR_vec_insert_lo_v8sf:
+    case CODE_FOR_vec_insert_lo_v4df:
+    case CODE_FOR_vec_insert_lo_v4di:
+      {
+       /* The selection method for constructing the high/low half.  */
+       loongarch_prepare_builtin_arg (&ops[1], exp, 0);
+       loongarch_prepare_builtin_arg (&ops[2], exp, 1);
+       int nelts = GET_MODE_NUNITS (GET_MODE (ops[1].value));
+       int half_nelts = nelts / 2;
+       int base = sel_high_p ? half_nelts : 0;
+
+       rtx pat_rtx
+           = loongarch_gen_stepped_int_parallel (half_nelts, base, 1);
+       create_input_operand (&ops[3], pat_rtx, ops[1].mode);
+
+       return loongarch_expand_builtin_insn (icode, 4, ops, has_target_p);
+      }
+
      default:
        break;
      }
diff --git a/gcc/config/loongarch/loongarch-c.cc 
b/gcc/config/loongarch/loongarch-c.cc
index effdcf0e255..fc031a6fe90 100644
--- a/gcc/config/loongarch/loongarch-c.cc
+++ b/gcc/config/loongarch/loongarch-c.cc
@@ -132,6 +132,7 @@ loongarch_update_cpp_builtins (cpp_reader *pfile)
    loongarch_def_or_undef (ISA_HAS_LSX, "__loongarch_simd", pfile);
    loongarch_def_or_undef (ISA_HAS_LSX, "__loongarch_sx", pfile);
    loongarch_def_or_undef (ISA_HAS_LASX, "__loongarch_asx", pfile);
+  loongarch_def_or_undef (ISA_HAS_LASX, "__loongarch_asx_sx_conv", pfile);
builtin_undef ("__loongarch_simd_width");
    if (ISA_HAS_LSX)
diff --git a/gcc/config/loongarch/loongarch-ftypes.def 
b/gcc/config/loongarch/loongarch-ftypes.def
index 337f2c2c229..68b1b446182 100644
--- a/gcc/config/loongarch/loongarch-ftypes.def
+++ b/gcc/config/loongarch/loongarch-ftypes.def
@@ -42,6 +42,12 @@ DEF_LARCH_FTYPE (1, (USI, USI))
  DEF_LARCH_FTYPE (1, (UDI, USI))
  DEF_LARCH_FTYPE (1, (USI, UQI))
  DEF_LARCH_FTYPE (1, (VOID, USI))
+DEF_LARCH_FTYPE (1, (V4SF, V8SF))
+DEF_LARCH_FTYPE (1, (V2DF, V4DF))
+DEF_LARCH_FTYPE (1, (V2DI, V4DI))
+DEF_LARCH_FTYPE (1, (V8SF, V4SF))
+DEF_LARCH_FTYPE (1, (V4DF, V2DF))
+DEF_LARCH_FTYPE (1, (V4DI, V2DI))
DEF_LARCH_FTYPE (2, (VOID, UQI, USI))
  DEF_LARCH_FTYPE (2, (VOID, UHI, USI))
@@ -58,6 +64,12 @@ DEF_LARCH_FTYPE (2, (SI, SI, SI))
  DEF_LARCH_FTYPE (2, (SI, DI, SI))
  DEF_LARCH_FTYPE (2, (USI, USI, USI))
  DEF_LARCH_FTYPE (2, (UDI, UDI, USI))
+DEF_LARCH_FTYPE (2, (V8SF, V4SF, V4SF))
+DEF_LARCH_FTYPE (2, (V4DF, V2DF, V2DF))
+DEF_LARCH_FTYPE (2, (V4DI, V2DI, V2DI))
+DEF_LARCH_FTYPE (2, (V8SF, V8SF, V4SF))
+DEF_LARCH_FTYPE (2, (V4DF, V4DF, V2DF))
+DEF_LARCH_FTYPE (2, (V4DI, V4DI, V2DI))
DEF_LARCH_FTYPE (3, (VOID, USI, USI, SI))
  DEF_LARCH_FTYPE (3, (VOID, USI, UDI, SI))
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 882c0820a6e..ae7aaa8c85b 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -19688,7 +19688,16 @@ into the data cache.  The instruction is issued in 
slot I1@.
These built-in functions are available for LoongArch. -Data Type Description:
+@menu
+* Data Types::
+* Directly-mapped Builtin Functions::
+* Directly-mapped Division Builtin Functions::
+* Other Builtin Functions::
+@end menu
+
+@node Data Types
+@subsubsection Data Types
+
  @itemize
  @item @code{imm0_31}, a compile-time constant in range 0 to 31;
  @item @code{imm0_16383}, a compile-time constant in range 0 to 16383;
@@ -19696,6 +19705,9 @@ Data Type Description:
  @item @code{imm_n2048_2047}, a compile-time constant in range -2048 to 2047;
  @end itemize
+@node Directly-mapped Builtin Functions
+@subsubsection Directly-mapped Builtin Functions
+
  The intrinsics provided are listed below:
  @smallexample
      unsigned int __builtin_loongarch_movfcsr2gr (imm0_31)
@@ -19819,6 +19831,9 @@ function you need to include @code{larchintrin.h}.
      void __break (imm0_32767)
  @end smallexample
+@node Directly-mapped Division Builtin Functions
+@subsubsection Directly-mapped Division Builtin Functions
+
  These intrinsic functions are available by including @code{larchintrin.h} and
  using @option{-mfrecipe}.
  @smallexample
@@ -19828,6 +19843,9 @@ using @option{-mfrecipe}.
      double __frsqrte_d (double);
  @end smallexample
+@node Other Builtin Functions
+@subsubsection Other Builtin Functions
+
  Additional built-in functions are available for LoongArch family
  processors to efficiently use 128-bit floating-point (__float128)
  values.
@@ -19854,6 +19872,15 @@ GCC provides intrinsics to access the LSX (Loongson 
SIMD Extension) instructions
  The interface is made available by including @code{<lsxintrin.h>} and using
  @option{-mlsx}.
+@menu
+* SX Data Types::
+* Directly-mapped SX Builtin Functions::
+* Directly-mapped SX Division Builtin Functions::
+@end menu
+
+@node SX Data Types
+@subsubsection SX Data Types
+
  The following vectors typedefs are included in @code{lsxintrin.h}:
@itemize
@@ -19881,6 +19908,9 @@ input/output values manipulated:
  @item @code{imm_n2048_2047}, an integer literal in range -2048 to 2047.
  @end itemize
+@node Directly-mapped SX Builtin Functions
+@subsubsection Directly-mapped SX Builtin Functions
+
  For convenience, GCC defines functions @code{__lsx_vrepli_@{b/h/w/d@}} and
  @code{__lsx_b[n]z_@{v/b/h/w/d@}}, which are implemented as follows:
@@ -20664,6 +20694,9 @@ __m128i __lsx_vxori_b (__m128i, imm0_255);
  __m128i __lsx_vxor_v (__m128i, __m128i);
  @end smallexample
+@node Directly-mapped SX Division Builtin Functions
+@subsubsection Directly-mapped SX Division Builtin Functions
+
  These intrinsic functions are available by including @code{lsxintrin.h} and
  using @option{-mfrecipe} and @option{-mlsx}.
  @smallexample
@@ -20680,6 +20713,16 @@ GCC provides intrinsics to access the LASX (Loongson 
Advanced SIMD Extension)
  instructions. The interface is made available by including 
@code{<lasxintrin.h>}
  and using @option{-mlasx}.
+@menu
+* ASX Data Types::
+* Directly-mapped ASX Builtin Functions::
+* Directly-mapped ASX Division Builtin Functions::
+* Directly-mapped SX and ASX Conversion Builtin Functions::
+@end menu
+
+@node ASX Data Types
+@subsubsection ASX Data Types
+
  The following vectors typedefs are included in @code{lasxintrin.h}:
@itemize
@@ -20708,6 +20751,9 @@ input/output values manipulated:
  @item @code{imm_n2048_2047}, an integer literal in range -2048 to 2047.
  @end itemize
+@node Directly-mapped ASX Builtin Functions
+@subsubsection Directly-mapped ASX Builtin Functions
+
  For convenience, GCC defines functions @code{__lasx_xvrepli_@{b/h/w/d@}} and
  @code{__lasx_b[n]z_@{v/b/h/w/d@}}, which are implemented as follows:
@@ -21512,6 +21558,9 @@ __m256i __lasx_xvxori_b (__m256i, imm0_255);
  __m256i __lasx_xvxor_v (__m256i, __m256i);
  @end smallexample
+@node Directly-mapped ASX Division Builtin Functions
+@subsubsection Directly-mapped ASX Division Builtin Functions
+
  These intrinsic functions are available by including @code{lasxintrin.h} and
  using @option{-mfrecipe} and @option{-mlasx}.
  @smallexample
@@ -21521,6 +21570,213 @@ __m256d __lasx_xvfrsqrte_d (__m256d);
  __m256 __lasx_xvfrsqrte_s (__m256);
  @end smallexample
+@node Directly-mapped SX and ASX Conversion Builtin Functions
+@subsubsection Directly-mapped SX and ASX Conversion Builtin Functions
+
+For convenience, @code{lasxintrin.h} now includes @code{lsxintrin.h}, and
+18 new interface functions for conversions between 128-bit and 256-bit
+vectors are added; they are available with the @option{-mlasx} option.
+@smallexample
+__m256 __lasx_cast_128_s (__m128);
+__m256d __lasx_cast_128_d (__m128d);
+__m256i __lasx_cast_128 (__m128i);
+__m256 __lasx_concat_128_s (__m128, __m128);
+__m256d __lasx_concat_128_d (__m128d, __m128d);
+__m256i __lasx_concat_128 (__m128i, __m128i);
+__m128 __lasx_extract_128_lo_s (__m256);
+__m128 __lasx_extract_128_hi_s (__m256);
+__m128d __lasx_extract_128_lo_d (__m256d);
+__m128d __lasx_extract_128_hi_d (__m256d);
+__m128i __lasx_extract_128_lo (__m256i);
+__m128i __lasx_extract_128_hi (__m256i);
+__m256 __lasx_insert_128_lo_s (__m256, __m128);
+__m256 __lasx_insert_128_hi_s (__m256, __m128);
+__m256d __lasx_insert_128_lo_d (__m256d, __m128d);
+__m256d __lasx_insert_128_hi_d (__m256d, __m128d);
+__m256i __lasx_insert_128_lo (__m256i, __m128i);
+__m256i __lasx_insert_128_hi (__m256i, __m128i);
+@end smallexample
+
+When GCC does not support the interfaces for conversions between 128-bit
+and 256-bit vectors, the following code can be used as an equivalent
+substitute.
+
+@smallexample
+
+  #ifndef __loongarch_asx_sx_conv
+
+  #include <lasxintrin.h>
+  #include <lsxintrin.h>
+  __m256 inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_cast_128_s (__m128 src)
+  @{
+    __m256 dest;
+    asm ("" : "=f"(dest) : "0"(src));
+    return dest;
+  @}
+
+  __m256d inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_cast_128_d (__m128d src)
+  @{
+    __m256d dest;
+    asm        ("" : "=f"(dest) : "0"(src));
+    return dest;
+  @}
+
+  __m256i inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_cast_128 (__m128i src)
+  @{
+    __m256i dest;
+    asm        ("" : "=f"(dest) : "0"(src));
+    return dest;
+  @}
+
+  __m256 inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_concat_128_s (__m128 src1, __m128 src2)
+  @{
+    __m256 dest;
+    asm        ("xvpermi.q %u0,%u2,0x02\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+
+  __m256d inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_concat_128_d (__m128d src1, __m128d src2)
+  @{
+    __m256d dest;
+    asm        ("xvpermi.q %u0,%u2,0x02\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+
+  __m256i inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_concat_128 (__m128i src1, __m128i src2)
+  @{
+    __m256i dest;
+    asm        ("xvpermi.q %u0,%u2,0x02\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+
+  __m128 inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_extract_128_lo_s (__m256 src)
+  @{
+    __m128 dest;
+    asm        ("" : "=f"(dest) : "0"(src));
+    return dest;
+  @}
+
+  __m128d inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_extract_128_lo_d (__m256d src)
+  @{
+    __m128d dest;
+    asm        ("" : "=f"(dest) : "0"(src));
+    return dest;
+  @}
+
+  __m128i inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_extract_128_lo (__m256i src)
+  @{
+    __m128i dest;
+    asm        ("" : "=f"(dest) : "0"(src));
+    return dest;
+  @}
+
+  __m128 inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_extract_128_hi_s (__m256 src)
+  @{
+    __m128 dest;
+    asm        ("xvpermi.d %u0,%u1,0xe\n"
+          : "=f"(dest)
+          : "f"(src));
+    return dest;
+  @}
+
+  __m128d inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_extract_128_hi_d (__m256d src)
+  @{
+    __m128d dest;
+    asm        ("xvpermi.d %u0,%u1,0xe\n"
+          : "=f"(dest)
+          : "f"(src));
+    return dest;
+  @}
+
+  __m128i inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_extract_128_hi (__m256i src)
+  @{
+    __m128i dest;
+    asm        ("xvpermi.d %u0,%u1,0xe\n"
+          : "=f"(dest)
+          : "f"(src));
+    return dest;
+  @}
+
+  __m256 inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_insert_128_lo_s (__m256 src1, __m128 src2)
+  @{
+    __m256 dest;
+    asm        ("xvpermi.q %u0,%u2,0x30\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+
+  __m256d inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_insert_128_lo_d (__m256d src1, __m128d src2)
+  @{
+    __m256d dest;
+    asm        ("xvpermi.q %u0,%u2,0x30\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+
+  __m256i inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_insert_128_lo (__m256i src1, __m128i src2)
+  @{
+    __m256i dest;
+    asm        ("xvpermi.q %u0,%u2,0x30\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+
+  __m256 inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_insert_128_hi_s (__m256 src1, __m128 src2)
+  @{
+    __m256 dest;
+    asm        ("xvpermi.q %u0,%u2,0x02\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+
+  __m256d inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_insert_128_hi_d (__m256d src1, __m128d src2)
+  @{
+    __m256d dest;
+    asm        ("xvpermi.q %u0,%u2,0x02\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+
+  __m256i inline __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+  __lasx_insert_128_hi (__m256i src1, __m128i src2)
+  @{
+    __m256i dest;
+    asm        ("xvpermi.q %u0,%u2,0x02\n"
+          : "=f"(dest)
+          : "0"(src1), "f"(src2));
+    return dest;
+  @}
+  #endif
+
+@end smallexample
+
  @node MIPS DSP Built-in Functions
  @subsection MIPS DSP Built-in Functions
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c
new file mode 100644
index 00000000000..e876c4a0b1a
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c
@@ -0,0 +1,68 @@
+/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */
+
+#include "../simd_correctness_check.h"
+#include <lasxintrin.h>
+
+int
+main ()
+{
+  __m128i __m128i_op0, __m128i_op1, __m128i_op2, __m128i_out, __m128i_result;
+  __m128 __m128_op0, __m128_op1, __m128_op2, __m128_out, __m128_result;
+  __m128d __m128d_op0, __m128d_op1, __m128d_op2, __m128d_out, __m128d_result;
+
+  __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result;
+  __m256 __m256_op0, __m256_op1, __m256_op2, __m256_out, __m256_result;
+  __m256d __m256d_op0, __m256d_op1, __m256d_op2, __m256d_out, __m256d_result;
+
+  //__m128_op0={1,2,3,4},__m128_op1={5,6,7,8};
+  *((int *)&__m128_op0[3]) = 0x40800000;
+  *((int *)&__m128_op0[2]) = 0x40400000;
+  *((int *)&__m128_op0[1]) = 0x40000000;
+  *((int *)&__m128_op0[0]) = 0x3f800000;
+  *((int *)&__m128_op1[3]) = 0x41000000;
+  *((int *)&__m128_op1[2]) = 0x40e00000;
+  *((int *)&__m128_op1[1]) = 0x40c00000;
+  *((int *)&__m128_op1[0]) = 0x40a00000;
+  *((int *)&__m256_result[7]) = 0x41000000;
+  *((int *)&__m256_result[6]) = 0x40e00000;
+  *((int *)&__m256_result[5]) = 0x40c00000;
+  *((int *)&__m256_result[4]) = 0x40a00000;
+  *((int *)&__m256_result[3]) = 0x40800000;
+  *((int *)&__m256_result[2]) = 0x40400000;
+  *((int *)&__m256_result[1]) = 0x40000000;
+  *((int *)&__m256_result[0]) = 0x3f800000;
+  __m256_out = __lasx_concat_128_s (__m128_op0, __m128_op1);
+  ASSERTEQ_32 (__LINE__, __m256_result, __m256_out);
+  __m256_out = __lasx_cast_128_s (__m128_op0);
+  ASSERTEQ_32 (__LINE__, __m256_out, __m128_op0);
+
+  //__m128i_op0={1,2},__m128i_op1={3,4};
+  *((unsigned long *)&__m128i_op0[1]) = 0x2;
+  *((unsigned long *)&__m128i_op0[0]) = 0x1;
+  *((unsigned long *)&__m128i_op1[1]) = 0x4;
+  *((unsigned long *)&__m128i_op1[0]) = 0x3;
+  *((unsigned long *)&__m256i_result[3]) = 0x4;
+  *((unsigned long *)&__m256i_result[2]) = 0x3;
+  *((unsigned long *)&__m256i_result[1]) = 0x2;
+  *((unsigned long *)&__m256i_result[0]) = 0x1;
+  __m256i_out = __lasx_concat_128 (__m128i_op0, __m128i_op1);
+  ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out);
+  __m256i_out = __lasx_cast_128 (__m128i_op0);
+  ASSERTEQ_64 (__LINE__, __m256i_out, __m128i_op0);
+
+  //__m128d_op0={1,2},__m128d_op1={3,4};
+  *((unsigned long *)&__m128d_op0[1]) = 0x4000000000000000;
+  *((unsigned long *)&__m128d_op0[0]) = 0x3ff0000000000000;
+  *((unsigned long *)&__m128d_op1[1]) = 0x4010000000000000;
+  *((unsigned long *)&__m128d_op1[0]) = 0x4008000000000000;
+  *((unsigned long *)&__m256d_result[3]) = 0x4010000000000000;
+  *((unsigned long *)&__m256d_result[2]) = 0x4008000000000000;
+  *((unsigned long *)&__m256d_result[1]) = 0x4000000000000000;
+  *((unsigned long *)&__m256d_result[0]) = 0x3ff0000000000000;
+  __m256d_out = __lasx_concat_128_d (__m128d_op0, __m128d_op1);
+  ASSERTEQ_64 (__LINE__, __m256d_result, __m256d_out);
+  __m256d_out = __lasx_cast_128_d (__m128d_op0);
+  ASSERTEQ_64 (__LINE__, __m256d_out, __m128d_op0);
+
+  return 0;
+}
diff --git 
a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256.c
new file mode 100644
index 00000000000..5d8cbb2e68f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256.c
@@ -0,0 +1,92 @@
+/* { dg-do compile { target { loongarch64*-*-* } } } */
+/* { dg-options "-mabi=lp64d -O2 -mlasx" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <lasxintrin.h>
+
+/*
+**foo1:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,1
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r8,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x02
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256
+foo1 (__m128 x, __m128 y)
+{
+  return __builtin_lasx_concat_128_s (x, y);
+}
+
+/*
+**foo2:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,1
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r8,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x02
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256d
+foo2 (__m128d x, __m128d y)
+{
+  return __builtin_lasx_concat_128_d (x, y);
+}
+
+/*
+**foo3:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,1
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r8,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x02
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256i
+foo3 (__m128i x, __m128i y)
+{
+  return __builtin_lasx_concat_128 (x, y);
+}
+
+/*
+**foo4:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,1
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256
+foo4 (__m128 x)
+{
+  return __builtin_lasx_cast_128_s (x);
+}
+
+/*
+**foo5:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,1
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256d
+foo5 (__m128d x)
+{
+  return __builtin_lasx_cast_128_d (x);
+}
+
+/*
+**foo6:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,1
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256i
+foo6 (__m128i x)
+{
+  return __builtin_lasx_cast_128 (x);
+}
diff --git 
a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c
new file mode 100644
index 00000000000..61064d6e1b7
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c
@@ -0,0 +1,69 @@
+/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */
+
+#include "../simd_correctness_check.h"
+#include <lasxintrin.h>
+
+extern void abort (void);
+int
+main ()
+{
+  __m128i __m128i_result0, __m128i_result1, __m128i_out, __m128i_result;
+  __m128 __m128_result0, __m128_result1, __m128_out, __m128_result;
+  __m128d __m128d_result0, __m128d_result1, __m128d_out, __m128d_result;
+
+  __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result;
+  __m256 __m256_op0, __m256_op1, __m256_op2, __m256_out, __m256_result;
+  __m256d __m256d_op0, __m256d_op1, __m256d_op2, __m256d_out, __m256d_result;
+
+  //__m256_op0 = {1,2,3,4,5,6,7,8};
+  *((int *)&__m256_op0[7]) = 0x41000000;
+  *((int *)&__m256_op0[6]) = 0x40e00000;
+  *((int *)&__m256_op0[5]) = 0x40c00000;
+  *((int *)&__m256_op0[4]) = 0x40a00000;
+  *((int *)&__m256_op0[3]) = 0x40800000;
+  *((int *)&__m256_op0[2]) = 0x40400000;
+  *((int *)&__m256_op0[1]) = 0x40000000;
+  *((int *)&__m256_op0[0]) = 0x3f800000;
+  *((int *)&__m128_result1[3]) = 0x41000000;
+  *((int *)&__m128_result1[2]) = 0x40e00000;
+  *((int *)&__m128_result1[1]) = 0x40c00000;
+  *((int *)&__m128_result1[0]) = 0x40a00000;
+  *((int *)&__m128_result0[3]) = 0x40800000;
+  *((int *)&__m128_result0[2]) = 0x40400000;
+  *((int *)&__m128_result0[1]) = 0x40000000;
+  *((int *)&__m128_result0[0]) = 0x3f800000;
+  __m128_out = __lasx_extract_128_lo_s (__m256_op0);
+  ASSERTEQ_32 (__LINE__, __m128_result0, __m128_out);
+  __m128_out = __lasx_extract_128_hi_s (__m256_op0);
+  ASSERTEQ_32 (__LINE__, __m128_result1, __m128_out);
+
+  //__m256i_op0 = {1,2,3,4};
+  *((unsigned long *)&__m256i_op0[3]) = 0x4;
+  *((unsigned long *)&__m256i_op0[2]) = 0x3;
+  *((unsigned long *)&__m256i_op0[1]) = 0x2;
+  *((unsigned long *)&__m256i_op0[0]) = 0x1;
+  *((unsigned long *)&__m128i_result0[1]) = 0x2;
+  *((unsigned long *)&__m128i_result0[0]) = 0x1;
+  *((unsigned long *)&__m128i_result1[1]) = 0x4;
+  *((unsigned long *)&__m128i_result1[0]) = 0x3;
+  __m128i_out = __lasx_extract_128_lo (__m256i_op0);
+  ASSERTEQ_64 (__LINE__, __m128i_result0, __m128i_out);
+  __m128i_out = __lasx_extract_128_hi (__m256i_op0);
+  ASSERTEQ_64 (__LINE__, __m128i_result1, __m128i_out);
+
+  //__m256d_op0 = {1,2,3,4};
+  *((unsigned long *)&__m256d_op0[3]) = 0x4010000000000000;
+  *((unsigned long *)&__m256d_op0[2]) = 0x4008000000000000;
+  *((unsigned long *)&__m256d_op0[1]) = 0x4000000000000000;
+  *((unsigned long *)&__m256d_op0[0]) = 0x3ff0000000000000;
+  *((unsigned long *)&__m128d_result0[1]) = 0x4000000000000000;
+  *((unsigned long *)&__m128d_result0[0]) = 0x3ff0000000000000;
+  *((unsigned long *)&__m128d_result1[1]) = 0x4010000000000000;
+  *((unsigned long *)&__m128d_result1[0]) = 0x4008000000000000;
+  __m128d_out = __lasx_extract_128_lo_d (__m256d_op0);
+  ASSERTEQ_64 (__LINE__, __m128d_result0, __m128d_out);
+  __m128d_out = __lasx_extract_128_hi_d (__m256d_op0);
+  ASSERTEQ_64 (__LINE__, __m128d_result1, __m128d_out);
+
+  return 0;
+}
diff --git 
a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128.c
new file mode 100644
index 00000000000..d2219ea82de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128.c
@@ -0,0 +1,86 @@
+/* { dg-do compile { target { loongarch64*-*-* } } } */
+/* { dg-options "-mabi=lp64d -O2 -mlasx" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <lasxintrin.h>
+
+/*
+**foo1_lo:
+**     vld     (\$vr[0-9]+),\$r4,0
+**     vpickve2gr.du   \$r4,(\$vr[0-9]+),0
+**     vpickve2gr.du   \$r5,(\$vr[0-9]+),1
+**     jr      \$r1
+*/
+__m128
+foo1_lo (__m256 x)
+{
+  return __lasx_extract_128_lo_s (x);
+}
+
+/*
+**foo1_hi:
+**     xvld    (\$xr[0-9]+),\$r4,0
+**     xvpermi.d       (\$xr[0-9]+),(\$xr[0-9]+),0xe
+**     vpickve2gr.du   \$r4,(\$vr[0-9]+),0
+**     vpickve2gr.du   \$r5,(\$vr[0-9]+),1
+**     jr      \$r1
+*/
+__m128
+foo1_hi (__m256 x)
+{
+  return __lasx_extract_128_hi_s (x);
+}
+
+/*
+**foo2_lo:
+**     vld     (\$vr[0-9]+),\$r4,0
+**     vpickve2gr.du   \$r4,(\$vr[0-9]+),0
+**     vpickve2gr.du   \$r5,(\$vr[0-9]+),1
+**     jr      \$r1
+*/
+__m128d
+foo2_lo (__m256d x)
+{
+  return __lasx_extract_128_lo_d (x);
+}
+
+/*
+**foo2_hi:
+**     xvld    (\$xr[0-9]+),\$r4,0
+**     xvpermi.d       (\$xr[0-9]+),(\$xr[0-9]+),0xe
+**     vpickve2gr.du   \$r4,(\$vr[0-9]+),0
+**     vpickve2gr.du   \$r5,(\$vr[0-9]+),1
+**     jr      \$r1
+*/
+__m128d
+foo2_hi (__m256d x)
+{
+  return __lasx_extract_128_hi_d (x);
+}
+
+/*
+**foo3_lo:
+**     vld     (\$vr[0-9]+),\$r4,0
+**     vpickve2gr.du   \$r4,(\$vr[0-9]+),0
+**     vpickve2gr.du   \$r5,(\$vr[0-9]+),1
+**     jr      \$r1
+*/
+__m128i
+foo3_lo (__m256i x)
+{
+  return __lasx_extract_128_lo (x);
+}
+
+/*
+**foo3_hi:
+**     xvld    (\$xr[0-9]+),\$r4,0
+**     xvpermi.d       (\$xr[0-9]+),(\$xr[0-9]+),0xe
+**     vpickve2gr.du   \$r4,(\$vr[0-9]+),0
+**     vpickve2gr.du   \$r5,(\$vr[0-9]+),1
+**     jr      \$r1
+*/
+__m128i
+foo3_hi (__m256i x)
+{
+  return __lasx_extract_128_hi (x);
+}
diff --git 
a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c
new file mode 100644
index 00000000000..ce5abf94f4f
--- /dev/null
+++ 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c
@@ -0,0 +1,97 @@
+/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */
+
+#include "../simd_correctness_check.h"
+#include <lasxintrin.h>
+
+extern void abort (void);
+int
+main ()
+{
+  __m128i __m128i_op0, __m128i_op1, __m128i_out;
+  __m128 __m128_op0, __m128_op1, __m128_out;
+  __m128d __m128d_op0, __m128d_op1, __m128d_out;
+
+  __m256i __m256i_op0, __m256i_result0, __m256i_result1, __m256i_out;
+  __m256 __m256_op0, __m256_result0, __m256_result1, __m256_out;
+  __m256d __m256d_op0, __m256d_result0, __m256d_result1, __m256d_out;
+
+  //__m256_op0 = {1,2,3,4,5,6,7,8}, __m128_op0 ={9,9,9,9};
+  *((int *)&__m256_op0[7]) = 0x41000000;
+  *((int *)&__m256_op0[6]) = 0x40e00000;
+  *((int *)&__m256_op0[5]) = 0x40c00000;
+  *((int *)&__m256_op0[4]) = 0x40a00000;
+  *((int *)&__m256_op0[3]) = 0x40800000;
+  *((int *)&__m256_op0[2]) = 0x40400000;
+  *((int *)&__m256_op0[1]) = 0x40000000;
+  *((int *)&__m256_op0[0]) = 0x3f800000;
+  *((int *)&__m128_op0[3]) = 0x41100000;
+  *((int *)&__m128_op0[2]) = 0x41100000;
+  *((int *)&__m128_op0[1]) = 0x41100000;
+  *((int *)&__m128_op0[0]) = 0x41100000;
+  *((int *)&__m256_result0[7]) = 0x41000000;
+  *((int *)&__m256_result0[6]) = 0x40e00000;
+  *((int *)&__m256_result0[5]) = 0x40c00000;
+  *((int *)&__m256_result0[4]) = 0x40a00000;
+  *((int *)&__m256_result0[3]) = 0x41100000;
+  *((int *)&__m256_result0[2]) = 0x41100000;
+  *((int *)&__m256_result0[1]) = 0x41100000;
+  *((int *)&__m256_result0[0]) = 0x41100000;
+  *((int *)&__m256_result1[7]) = 0x41100000;
+  *((int *)&__m256_result1[6]) = 0x41100000;
+  *((int *)&__m256_result1[5]) = 0x41100000;
+  *((int *)&__m256_result1[4]) = 0x41100000;
+  *((int *)&__m256_result1[3]) = 0x40800000;
+  *((int *)&__m256_result1[2]) = 0x40400000;
+  *((int *)&__m256_result1[1]) = 0x40000000;
+  *((int *)&__m256_result1[0]) = 0x3f800000;
+  __m256_out = __lasx_insert_128_lo_s (__m256_op0, __m128_op0);
+  ASSERTEQ_32 (__LINE__, __m256_result0, __m256_out);
+  __m256_out = __lasx_insert_128_hi_s (__m256_op0, __m128_op0);
+  ASSERTEQ_32 (__LINE__, __m256_result1, __m256_out);
+
+  //__m256i_op0 ={1,2,3,4},__m128i_op0={5,6},__m128i_op1={7,8};
+  *((unsigned long *)&__m256i_op0[3]) = 0x4;
+  *((unsigned long *)&__m256i_op0[2]) = 0x3;
+  *((unsigned long *)&__m256i_op0[1]) = 0x2;
+  *((unsigned long *)&__m256i_op0[0]) = 0x1;
+  *((unsigned long *)&__m128i_op0[1]) = 0x6;
+  *((unsigned long *)&__m128i_op0[0]) = 0x5;
+  *((unsigned long *)&__m128i_op1[1]) = 0x8;
+  *((unsigned long *)&__m128i_op1[0]) = 0x7;
+  *((unsigned long *)&__m256i_result0[3]) = 0x4;
+  *((unsigned long *)&__m256i_result0[2]) = 0x3;
+  *((unsigned long *)&__m256i_result0[1]) = 0x6;
+  *((unsigned long *)&__m256i_result0[0]) = 0x5;
+  *((unsigned long *)&__m256i_result1[3]) = 0x8;
+  *((unsigned long *)&__m256i_result1[2]) = 0x7;
+  *((unsigned long *)&__m256i_result1[1]) = 0x2;
+  *((unsigned long *)&__m256i_result1[0]) = 0x1;
+  __m256i_out = __lasx_insert_128_lo (__m256i_op0, __m128i_op0);
+  ASSERTEQ_64 (__LINE__, __m256i_result0, __m256i_out);
+  __m256i_out = __lasx_insert_128_hi (__m256i_op0, __m128i_op1);
+  ASSERTEQ_64 (__LINE__, __m256i_result1, __m256i_out);
+
+  //__m256d_op0 ={1,2,3,4},__m128d_op0={5,6},__m128d_op1={7,8};
+  *((unsigned long *)&__m256d_op0[3]) = 0x4010000000000000;
+  *((unsigned long *)&__m256d_op0[2]) = 0x4008000000000000;
+  *((unsigned long *)&__m256d_op0[1]) = 0x4000000000000000;
+  *((unsigned long *)&__m256d_op0[0]) = 0x3ff0000000000000;
+  *((unsigned long *)&__m128d_op0[1]) = 0x4018000000000000;
+  *((unsigned long *)&__m128d_op0[0]) = 0x4014000000000000;
+  *((unsigned long *)&__m128d_op1[1]) = 0x4020000000000000;
+  *((unsigned long *)&__m128d_op1[0]) = 0x401c000000000000;
+  *((unsigned long *)&__m256d_result0[3]) = 0x4010000000000000;
+  *((unsigned long *)&__m256d_result0[2]) = 0x4008000000000000;
+  *((unsigned long *)&__m256d_result0[1]) = 0x4018000000000000;
+  *((unsigned long *)&__m256d_result0[0]) = 0x4014000000000000;
+  *((unsigned long *)&__m256d_result1[3]) = 0x4020000000000000;
+  *((unsigned long *)&__m256d_result1[2]) = 0x401c000000000000;
+  *((unsigned long *)&__m256d_result1[1]) = 0x4000000000000000;
+  *((unsigned long *)&__m256d_result1[0]) = 0x3ff0000000000000;
+  __m256d_out = __lasx_insert_128_lo_d (__m256d_op0, __m128d_op0);
+  ASSERTEQ_64 (__LINE__, __m256d_result0, __m256d_out);
+  __m256d_out = __lasx_insert_128_hi_d (__m256d_op0, __m128d_op1);
+  ASSERTEQ_64 (__LINE__, __m256d_result1, __m256d_out);
+
+  return 0;
+}
diff --git 
a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256.c 
b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256.c
new file mode 100644
index 00000000000..326c855f229
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256.c
@@ -0,0 +1,95 @@
+/* { dg-do compile { target { loongarch64*-*-* } } } */
+/* { dg-options "-mabi=lp64d -O2 -mlasx" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <lasxintrin.h>
+
+/*
+**foo1:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,0
+**     xvld    (\$xr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x30
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256
+foo1 (__m256 x, __m128 y)
+{
+  return __builtin_lasx_insert_128_lo_s (x, y);
+}
+
+/*
+**foo2:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,0
+**     xvld    (\$xr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x02
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256
+foo2 (__m256 x, __m128 y)
+{
+  return __builtin_lasx_insert_128_hi_s (x, y);
+}
+
+/*
+**foo3:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,0
+**     xvld    (\$xr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x30
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256d
+foo3 (__m256d x, __m128d y)
+{
+  return __builtin_lasx_insert_128_lo_d (x, y);
+}
+
+/*
+**foo4:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,0
+**     xvld    (\$xr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x02
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256d
+foo4 (__m256d x, __m128d y)
+{
+  return __builtin_lasx_insert_128_hi_d (x, y);
+}
+
+/*
+**foo5:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,0
+**     xvld    (\$xr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x30
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256i
+foo5 (__m256i x, __m128i y)
+{
+  return __builtin_lasx_insert_128_lo (x, y);
+}
+
+/*
+**foo6:
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r6,0
+**     xvld    (\$xr[0-9]+),\$r5,0
+**     vinsgr2vr.d     (\$vr[0-9]+),\$r7,1
+**     xvpermi.q       (\$xr[0-9]+),(\$xr[0-9]+),0x02
+**     xvst    (\$xr[0-9]+),\$r4,0
+**     jr      \$r1
+*/
+__m256i
+foo6 (__m256i x, __m128i y)
+{
+  return __builtin_lasx_insert_128_hi (x, y);
+}

Reply via email to