Add a fold at gimple_fold_builtin to prefer the highpart variant of a builtin
if the arguments are better suited to it. This helps us avoid copying data
between lanes before the operation.

E.g. we prefer to use UMULL2 rather than DUP+UMULL for the following:

uint16x8_t
foo(const uint8x16_t s) {
        const uint8x16_t f0 = vdupq_n_u8(4);
        return vmull_u8(vget_high_u8(s), vget_high_u8(f0));
}
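
Roughly, at the GIMPLE level, the fold rewrites (the builtin names
below are illustrative rather than the exact identifiers):

  _1 = BIT_FIELD_REF <s, 64, 64>;
  _2 = UMULL_builtin (_1, { 4, ... });

into

  _2 = UMULL2_builtin (s, { 4, ... });

where the 64b VECTOR_CST operand is concatenated with itself to form
the 128b operand required by the highpart builtin.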

gcc/ChangeLog:

        * config/aarch64/aarch64-builtins.cc (LO_HI_PAIRINGS): New macro.
        Covers every LO_HI_PAIR.
        (aarch64_get_highpart_builtin): New function. Get the highpart builtin
        paired with the input FCODE.
        (LO_HI_PAIR): New macro.
        (aarch64_self_concat_vec_cst): New function. Concatenate a
        VECTOR_CST with itself.
        (aarch64_object_of_bfr): New function. Helper to check arguments
        for vector highparts.
        (aarch64_fold_lo_call_to_hi): New functions.
        (aarch64_general_gimple_fold_builtin): Add cases for the lowpart
        builtins.
        * config/aarch64/aarch64-builtin-pairs.def: New file. Declare
        pairings of lowpart/highpart builtins.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/vabal_combine.c: Adjust the expected
        operand order, which changes once the builtin is folded earlier.
        * gcc.target/aarch64/simd/fold_to_highpart_1.c: New test.
        * gcc.target/aarch64/simd/fold_to_highpart_2.c: New test.
        * gcc.target/aarch64/simd/fold_to_highpart_3.c: New test.
        * gcc.target/aarch64/simd/fold_to_highpart_4.c: New test.
        * gcc.target/aarch64/simd/fold_to_highpart_5.c: New test.
---
 gcc/config/aarch64/aarch64-builtin-pairs.def  |  77 ++
 gcc/config/aarch64/aarch64-builtins.cc        | 232 ++++++
 .../aarch64/simd/fold_to_highpart_1.c         | 708 ++++++++++++++++++
 .../aarch64/simd/fold_to_highpart_2.c         |  82 ++
 .../aarch64/simd/fold_to_highpart_3.c         |  80 ++
 .../aarch64/simd/fold_to_highpart_4.c         |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c         |  71 ++
 .../gcc.target/aarch64/simd/vabal_combine.c   |  12 +-
 8 files changed, 1333 insertions(+), 6 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c

diff --git a/gcc/config/aarch64/aarch64-builtin-pairs.def b/gcc/config/aarch64/aarch64-builtin-pairs.def
new file mode 100644
index 00000000000..d3ca69a1887
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtin-pairs.def
@@ -0,0 +1,77 @@
+/* Pairings of AArch64 builtins that can be folded into each other.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* LO/HI widenable integer modes.  */
+#define LO_HI_PAIR_V_WI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi) \
+  LO_HI_PAIR (T##_##LO##v8qi, T##_##HI##v16qi)
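+
+/* For example, LO_HI_PAIR_V_WI (BINOP, saddl, saddl2) expands to:
+
+     LO_HI_PAIR (BINOP_saddlv2si, BINOP_saddl2v4si)
+     LO_HI_PAIR (BINOP_saddlv4hi, BINOP_saddl2v8hi)
+     LO_HI_PAIR (BINOP_saddlv8qi, BINOP_saddl2v16qi)  */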
+
+/* LO/HI Single/Half integer modes.  */
+#define LO_HI_PAIR_V_HSI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi)
+
+#define UNOP_LONG_LH_PAIRS \
+  LO_HI_PAIR (UNOP_sxtlv8hi,  UNOP_vec_unpacks_hi_v16qi) \
+  LO_HI_PAIR (UNOP_sxtlv4si,  UNOP_vec_unpacks_hi_v8hi) \
+  LO_HI_PAIR (UNOP_sxtlv2di,  UNOP_vec_unpacks_hi_v4si) \
+  LO_HI_PAIR (UNOPU_uxtlv8hi, UNOPU_vec_unpacku_hi_v16qi) \
+  LO_HI_PAIR (UNOPU_uxtlv4si, UNOPU_vec_unpacku_hi_v8hi) \
+  LO_HI_PAIR (UNOPU_uxtlv2di, UNOPU_vec_unpacku_hi_v4si)
+
+#define BINOP_LONG_LH_PAIRS \
+  LO_HI_PAIR_V_WI (BINOP,  saddl, saddl2) \
+  LO_HI_PAIR_V_WI (BINOPU, uaddl, uaddl2) \
+  LO_HI_PAIR_V_WI (BINOP,  ssubl, ssubl2) \
+  LO_HI_PAIR_V_WI (BINOPU, usubl, usubl2) \
+  LO_HI_PAIR_V_WI (BINOP,  sabdl, sabdl2) \
+  LO_HI_PAIR_V_WI (BINOPU, uabdl, uabdl2) \
+  LO_HI_PAIR_V_WI (BINOP,  intrinsic_vec_smult_lo_, vec_widen_smult_hi_) \
+  LO_HI_PAIR_V_WI (BINOPU, intrinsic_vec_umult_lo_, vec_widen_umult_hi_) \
+  LO_HI_PAIR_V_HSI (BINOP,  sqdmull, sqdmull2)
+
+#define BINOP_LONG_N_LH_PAIRS \
+  LO_HI_PAIR_V_HSI (BINOP,  smull_n, smull_hi_n) \
+  LO_HI_PAIR_V_HSI (BINOPU, umull_n, umull_hi_n) \
+  LO_HI_PAIR_V_HSI (BINOP,  sqdmull_n, sqdmull2_n)
+
+#define BINOP_WIDE_LH_PAIRS \
+  LO_HI_PAIR_V_WI (BINOP,  ssubw, ssubw2) \
+  LO_HI_PAIR_V_WI (BINOPU, usubw, usubw2) \
+  LO_HI_PAIR_V_WI (BINOP,  saddw, saddw2) \
+  LO_HI_PAIR_V_WI (BINOPU, uaddw, uaddw2)
+
+#define TERNOP_LONG_LH_PAIRS \
+  LO_HI_PAIR_V_WI (TERNOP,  smlal, smlal_hi) \
+  LO_HI_PAIR_V_WI (TERNOPU, umlal, umlal_hi) \
+  LO_HI_PAIR_V_WI (TERNOP,  smlsl, smlsl_hi) \
+  LO_HI_PAIR_V_WI (TERNOPU, umlsl, umlsl_hi) \
+  LO_HI_PAIR_V_WI (TERNOP,  sabal, sabal2) \
+  LO_HI_PAIR_V_WI (TERNOPU, uabal, uabal2) \
+  LO_HI_PAIR_V_HSI (TERNOP, sqdmlal, sqdmlal2) \
+  LO_HI_PAIR_V_HSI (TERNOP, sqdmlsl, sqdmlsl2)
+
+#define TERNOP_LONG_N_LH_PAIRS \
+  LO_HI_PAIR_V_HSI (TERNOP,  smlal_n, smlal_hi_n) \
+  LO_HI_PAIR_V_HSI (TERNOPU, umlal_n, umlal_hi_n) \
+  LO_HI_PAIR_V_HSI (TERNOP,  smlsl_n, smlsl_hi_n) \
+  LO_HI_PAIR_V_HSI (TERNOPU, umlsl_n, umlsl_hi_n) \
+  LO_HI_PAIR_V_HSI (TERNOP,  sqdmlal_n, sqdmlal2_n) \
+  LO_HI_PAIR_V_HSI (TERNOP,  sqdmlsl_n, sqdmlsl2_n)
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 128cc365d3d..efb35717e83 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -48,6 +48,8 @@
 #include "attribs.h"
 #include "gimple-fold.h"
 #include "builtins.h"
+#include "tree-pass.h"
+#include "tree-vector-builder.h"
 #include "aarch64-builtins.h"
 
 using namespace aarch64;
@@ -737,6 +739,16 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
   VGET_HIGH_BUILTIN(u64) \
   VGET_HIGH_BUILTIN(bf16)
 
+#include "aarch64-builtin-pairs.def"
+
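+/* Expands to every LO_HI_PAIR declared in aarch64-builtin-pairs.def.  */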
+#define LO_HI_PAIRINGS \
+  UNOP_LONG_LH_PAIRS \
+  BINOP_WIDE_LH_PAIRS \
+  BINOP_LONG_LH_PAIRS \
+  BINOP_LONG_N_LH_PAIRS \
+  TERNOP_LONG_LH_PAIRS \
+  TERNOP_LONG_N_LH_PAIRS
+
 typedef struct
 {
   const char *name;
@@ -4982,6 +4994,148 @@ aarch64_gimple_fold_pragma_builtin
     }
 }
 
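+/* Return the decl of the highpart builtin paired with the lowpart
+   builtin FCODE_LO, or NULL_TREE if there is no such pairing.  */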
+static inline tree
+aarch64_get_highpart_builtin (unsigned int fcode_lo)
+{
+#undef LO_HI_PAIR
+#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:   \
+  return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_##B];
+
+  switch (fcode_lo)
+    {
+      LO_HI_PAIRINGS
+      default:
+       return NULL_TREE;
+    }
+}
+
+/* Build and return a new VECTOR_CST that is the concatenation of
+   VEC_IN with itself, e.g. { 1, 2, 3, 4 } becomes
+   { 1, 2, 3, 4, 1, 2, 3, 4 }.  */
+static tree
+aarch64_self_concat_vec_cst (tree vec_in)
+{
+  gcc_assert (TREE_CODE (vec_in) == VECTOR_CST);
+  unsigned HOST_WIDE_INT nelts
+    = VECTOR_CST_NELTS (vec_in).to_constant ();
+
+  tree out_type = build_vector_type (TREE_TYPE (TREE_TYPE (vec_in)),
+                                    nelts * 2);
+
+  /* Avoid decoding/encoding if the encoding won't change.  */
+  if (VECTOR_CST_DUPLICATE_P (vec_in))
+    {
+      tree vec_out = make_vector (exact_log2
+                                (VECTOR_CST_NPATTERNS (vec_in)), 1);
+      unsigned int encoded_size
+       = vector_cst_encoded_nelts (vec_in) * sizeof (tree);
+
+      memcpy (VECTOR_CST_ENCODED_ELTS (vec_out),
+             VECTOR_CST_ENCODED_ELTS (vec_in), encoded_size);
+
+      TREE_TYPE (vec_out) = out_type;
+      return vec_out;
+    }
+
+  /* With NELTS single-element patterns, the encoding repeats to fill
+     the doubled vector length, giving the self-concatenation; push
+     only the NELTS encoded elements.  */
+  tree_vector_builder vec_out (out_type, nelts, 1);
+  for (unsigned i = 0; i < nelts; i++)
+    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i));
+
+  return vec_out.build ();
+}
+
+/* If the SSA_NAME_DEF_STMT of ARG is an assignment from a
+   BIT_FIELD_REF with the given SIZE and OFFSET, return the object of
+   the BIT_FIELD_REF.  Otherwise, return NULL_TREE.  */
+static tree
+aarch64_object_of_bfr (tree arg, unsigned HOST_WIDE_INT size,
+                      unsigned HOST_WIDE_INT offset)
+{
+  if (TREE_CODE (arg) != SSA_NAME)
+    return NULL_TREE;
+
+  gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (arg));
+
+  if (!stmt)
+    return NULL_TREE;
+
+  if (gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
+    return NULL_TREE;
+
+  tree bf_ref = gimple_assign_rhs1 (stmt);
+
+  if (bit_field_size (bf_ref).to_constant () != size
+      || bit_field_offset (bf_ref).to_constant () != offset)
+    return NULL_TREE;
+
+  return TREE_OPERAND (bf_ref, 0);
+}
+
+/*  Prefer to use the highpart builtin when:
+
+    1) All lowpart arguments are references to the highparts of other
+    vectors.
+
+    2) For calls with two lowpart arguments, one refers to a vector
+    highpart and the other is a VECTOR_CST; the VECTOR_CST can then be
+    extended to 128b.  */
+static bool
+aarch64_fold_lo_call_to_hi (tree arg_0, tree arg_1, tree *out_0,
+                           tree *out_1)
+{
+  /* Punt until as late as possible:
+
+     1) By folding away BIT_FIELD_REFs we remove information about the
+     operands that may be useful to other optimizers.
+
+     2) For simplicity, we'd like the expression
+
+       x = BIT_FIELD_REF<a, 64, 64>
+
+     to imply that A is not a VECTOR_CST.  This assumption is unlikely
+     to hold before constant propagation/folding.  */
+  if (!(cfun->curr_properties & PROP_last_full_fold))
+    return false;
+
+  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
+
+  tree hi_arg_0 = aarch64_object_of_bfr (arg_0, 64, offset);
+  tree hi_arg_1 = aarch64_object_of_bfr (arg_1, 64, offset);
+  if (!hi_arg_0)
+    {
+      if (!hi_arg_1 || TREE_CODE (arg_0) != VECTOR_CST)
+       return false;
+      hi_arg_0 = aarch64_self_concat_vec_cst (arg_0);
+    }
+  else if (!hi_arg_1)
+    {
+      if (TREE_CODE (arg_1) != VECTOR_CST)
+       return false;
+      hi_arg_1 = aarch64_self_concat_vec_cst (arg_1);
+    }
+
+  *out_0 = hi_arg_0;
+  *out_1 = hi_arg_1;
+  return true;
+}
+
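+/* As above, but for calls with a single lowpart argument ARG_IN.  */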
+static bool
+aarch64_fold_lo_call_to_hi (tree arg_in, tree *out)
+{
+  if (!(cfun->curr_properties & PROP_last_full_fold))
+    return false;
+  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
+
+  tree hi_arg = aarch64_object_of_bfr (arg_in, 64, offset);
+  if (!hi_arg)
+    return false;
+
+  *out = hi_arg;
+  return true;
+}
+
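+/* Redefine LO_HI_PAIR to generate a case label for each lowpart
+   builtin that has a highpart pairing; these are used in the fold
+   below.  */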
+#undef LO_HI_PAIR
+#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
@@ -5168,6 +5322,84 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
            }
          break;
        }
+      break;
+    UNOP_LONG_LH_PAIRS
+      {
+       tree builtin_hi = aarch64_get_highpart_builtin (fcode);
+       gcc_assert (nargs == 1 && builtin_hi != NULL_TREE);
+
+       tree hi_arg;
+       if (!aarch64_fold_lo_call_to_hi (args[0], &hi_arg))
+         break;
+       new_stmt = gimple_build_call (builtin_hi, 1, hi_arg);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+      }
+    break;
+    BINOP_LONG_LH_PAIRS
+      {
+       tree builtin_hi = aarch64_get_highpart_builtin (fcode);
+       gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
+
+       tree hi_arg_0, hi_arg_1;
+       if (!aarch64_fold_lo_call_to_hi (args[0], args[1], &hi_arg_0,
+                                        &hi_arg_1))
+         break;
+       new_stmt = gimple_build_call (builtin_hi, 2,
+                                     hi_arg_0, hi_arg_1);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+      }
+    break;
+    BINOP_LONG_N_LH_PAIRS
+      {
+       tree builtin_hi = aarch64_get_highpart_builtin (fcode);
+       gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
+
+       tree hi_arg;
+       if (!aarch64_fold_lo_call_to_hi (args[0], &hi_arg))
+         break;
+       new_stmt = gimple_build_call (builtin_hi, 2, hi_arg, args[1]);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+      }
+    break;
+    TERNOP_LONG_LH_PAIRS
+      {
+       tree builtin_hi = aarch64_get_highpart_builtin (fcode);
+       gcc_assert (nargs == 3 && builtin_hi != NULL_TREE);
+
+       tree hi_arg_0, hi_arg_1;
+       if (!aarch64_fold_lo_call_to_hi (args[1], args[2], &hi_arg_0,
+                                        &hi_arg_1))
+         break;
+       new_stmt = gimple_build_call (builtin_hi, 3, args[0],
+                                     hi_arg_0, hi_arg_1);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+      }
+    break;
+    TERNOP_LONG_N_LH_PAIRS
+      {
+       tree builtin_hi = aarch64_get_highpart_builtin (fcode);
+       gcc_assert (nargs == 3 && builtin_hi != NULL_TREE);
+
+       tree hi_arg;
+       if (!aarch64_fold_lo_call_to_hi (args[1], &hi_arg))
+         break;
+       new_stmt = gimple_build_call (builtin_hi, 3, args[0], hi_arg,
+                                     args[2]);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+      }
+    break;
+    BINOP_WIDE_LH_PAIRS
+      {
+       tree builtin_hi = aarch64_get_highpart_builtin (fcode);
+       gcc_assert (nargs == 2 && builtin_hi != NULL_TREE);
+
+       tree hi_arg;
+       if (!aarch64_fold_lo_call_to_hi (args[1], &hi_arg))
+         break;
+       new_stmt = gimple_build_call (builtin_hi, 2, args[0], hi_arg);
+       gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+      }
+    break;
     case AARCH64_SIMD_BUILTIN_LANE_CHECK:
       if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
        {
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
new file mode 100644
index 00000000000..f5804da4f2c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
@@ -0,0 +1,708 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -save-temps" } */
+
+#include "arm_neon.h"
+
+/* Prefer the highpart variant of a builtin when its arguments
+   are vector highparts.  */
+
+#ifndef TEST_UN_HIGHPARTS
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a)        \
+  {                                                  \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (a)); \
+  }
+#endif
+
+#ifndef TEST_BIN_W_HIGHPARTS
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)  \
+  RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+  {                                                      \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b));  \
+  }
+#endif
+
+#ifndef TEST_BIN_N_HIGHPARTS
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a)              \
+  {                                                        \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (a), a[1]); \
+  }
+#endif
+
+#ifndef TEST_TERN_N_HIGHPARTS
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)      \
+  RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b)      \
+  {                                                           \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), b[1]); \
+  }
+#endif
+
+#ifndef TEST_BIN_HIGHPARTS
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a, INTYPE b)         \
+  {                                                             \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (a),             \
+                            vget_high_ ## SUFF (b));            \
+  }
+#endif
+
+#ifndef TEST_TERN_HIGHPARTS
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF)   \
+  RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b, INTYPE c) \
+  {                                                                \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b),             \
+                            vget_high_ ## SUFF (c));               \
+  }
+#endif
+
+#define TEST_UN_VQW(FN) \
+  TEST_UN_HIGHPARTS (FN, int16x8_t,  int8x16_t,  s8)  \
+  TEST_UN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, u8)  \
+  TEST_UN_HIGHPARTS (FN, int32x4_t,  int16x8_t,  s16) \
+  TEST_UN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+  TEST_UN_HIGHPARTS (FN, int64x2_t,  int32x4_t,  s32) \
+  TEST_UN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_BIN_VQW(FN)                                           \
+  TEST_BIN_HIGHPARTS (FN, int16x8_t,  int8x16_t,  int8x8_t,   s8)  \
+  TEST_BIN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, uint8x8_t,  u8)  \
+  TEST_BIN_HIGHPARTS (FN, int32x4_t,  int16x8_t,  int16x4_t,  s16) \
+  TEST_BIN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, uint16x4_t, u16) \
+  TEST_BIN_HIGHPARTS (FN, int64x2_t,  int32x4_t,  int32x2_t,  s32) \
+  TEST_BIN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, uint32x2_t, u32)
+
+#define TEST_BIN_N_VQW(FN)                               \
+  TEST_BIN_N_HIGHPARTS (FN, int32x4_t,  int16x8_t,  s16) \
+  TEST_BIN_N_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+  TEST_BIN_N_HIGHPARTS (FN, int64x2_t,  int32x4_t,  s32) \
+  TEST_BIN_N_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_BIN_W_VQW(FN)                               \
+  TEST_BIN_W_HIGHPARTS (FN, int16x8_t,  int8x16_t,   s8) \
+  TEST_BIN_W_HIGHPARTS (FN, uint16x8_t, uint8x16_t,  u8) \
+  TEST_BIN_W_HIGHPARTS (FN, int32x4_t,  int16x8_t,  s16) \
+  TEST_BIN_W_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+  TEST_BIN_W_HIGHPARTS (FN, int64x2_t,  int32x4_t,  s32) \
+  TEST_BIN_W_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_TERN_N_VQW(FN)                               \
+  TEST_TERN_N_HIGHPARTS (FN, int32x4_t,  int16x8_t,  s16) \
+  TEST_TERN_N_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+  TEST_TERN_N_HIGHPARTS (FN, int64x2_t,  int32x4_t,  s32) \
+  TEST_TERN_N_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_TERN_VQW(FN)                                           \
+  TEST_TERN_HIGHPARTS (FN, int16x8_t,  int8x16_t,  int8x8_t,   s8)  \
+  TEST_TERN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, uint8x8_t,  u8)  \
+  TEST_TERN_HIGHPARTS (FN, int32x4_t,  int16x8_t,  int16x4_t,  s16) \
+  TEST_TERN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, uint16x4_t, u16) \
+  TEST_TERN_HIGHPARTS (FN, int64x2_t,  int32x4_t,  int32x2_t,  s32) \
+  TEST_TERN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, uint32x2_t, u32)
+
+#define TEST_VQDMULL                                                 \
+  TEST_BIN_HIGHPARTS (vqdmull, int32x4_t, int16x8_t, int16x4_t, s16) \
+  TEST_BIN_HIGHPARTS (vqdmull, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQDMULL_N                                        \
+  TEST_BIN_N_HIGHPARTS (vqdmull_n, int32x4_t, int16x8_t, s16) \
+  TEST_BIN_N_HIGHPARTS (vqdmull_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VQMLAL                                                   \
+  TEST_TERN_HIGHPARTS (vqdmlal, int32x4_t, int16x8_t, int16x4_t, s16) \
+  TEST_TERN_HIGHPARTS (vqdmlal, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQMLAL_N                                          \
+  TEST_TERN_N_HIGHPARTS (vqdmlal_n, int32x4_t, int16x8_t, s16) \
+  TEST_TERN_N_HIGHPARTS (vqdmlal_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VQMLSL                                                   \
+  TEST_TERN_HIGHPARTS (vqdmlsl, int32x4_t, int16x8_t, int16x4_t, s16) \
+  TEST_TERN_HIGHPARTS (vqdmlsl, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQMLSL_N                                          \
+  TEST_TERN_N_HIGHPARTS (vqdmlsl_n, int32x4_t, int16x8_t, s16) \
+  TEST_TERN_N_HIGHPARTS (vqdmlsl_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VMOVL \
+  TEST_UN_VQW (vmovl)
+
+#define TEST_VMULL \
+  TEST_BIN_VQW (vmull)
+
+#define TEST_VMULL_N \
+  TEST_BIN_N_VQW (vmull_n)
+
+#define TEST_VADDL \
+  TEST_BIN_VQW (vaddl)
+
+#define TEST_VSUBL \
+  TEST_BIN_VQW (vsubl)
+
+#define TEST_VMLAL \
+  TEST_TERN_VQW (vmlal)
+
+#define TEST_VMLAL_N \
+  TEST_TERN_N_VQW (vmlal_n)
+
+#define TEST_VMLSL \
+  TEST_TERN_VQW (vmlsl)
+
+#define TEST_VMLSL_N \
+  TEST_TERN_N_VQW (vmlsl_n)
+
+#define TEST_VABDL \
+  TEST_BIN_VQW (vabdl)
+
+#define TEST_VABAL \
+  TEST_TERN_VQW (vabal)
+
+#define TEST_VSUBW \
+  TEST_BIN_W_VQW (vsubw)
+
+#define TEST_VADDW \
+  TEST_BIN_W_VQW (vaddw)
+
+/*
+** test_vmovl_s8:
+**     sxtl2   v0\.8h, v0\.16b
+**     ret
+*/
+
+/*
+** test_vmovl_u8:
+**     uxtl2   v0\.8h, v0\.16b
+**     ret
+*/
+
+/*
+** test_vmovl_s16:
+**     sxtl2   v0\.4s, v0\.8h
+**     ret
+*/
+
+/*
+** test_vmovl_u16:
+**     uxtl2   v0\.4s, v0\.8h
+**     ret
+*/
+
+/*
+** test_vmovl_s32:
+**     sxtl2   v0\.2d, v0\.4s
+**     ret
+*/
+
+/*
+** test_vmovl_u32:
+**     uxtl2   v0\.2d, v0\.4s
+**     ret
+*/
+
+TEST_VMOVL
+
+/*
+** test_vmull_s8:
+**     smull2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vmull_u8:
+**     umull2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vmull_s16:
+**     smull2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vmull_u16:
+**     umull2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vmull_s32:
+**     smull2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+/*
+** test_vmull_u32:
+**     umull2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+TEST_VMULL
+
+/*
+** test_vmull_n_s16:
+**     smull2  v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmull_n_u16:
+**     umull2  v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmull_n_s32:
+**     smull2  v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+**     ret
+*/
+
+/*
+** test_vmull_n_u32:
+**     umull2  v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VMULL_N
+
+/*
+** test_vaddl_s8:
+**     saddl2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vaddl_u8:
+**     uaddl2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vaddl_s16:
+**     saddl2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vaddl_u16:
+**     uaddl2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vaddl_s32:
+**     saddl2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+/*
+** test_vaddl_u32:
+**     uaddl2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+TEST_VADDL
+
+/*
+** test_vsubl_s8:
+**     ssubl2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vsubl_u8:
+**     usubl2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vsubl_s16:
+**     ssubl2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vsubl_u16:
+**     usubl2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vsubl_s32:
+**     ssubl2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+/*
+** test_vsubl_u32:
+**     usubl2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+TEST_VSUBL
+
+/*
+** test_vabal_s8:
+**     sabal2  v0\.8h, v1\.16b, v2\.16b
+**     ret
+*/
+
+/*
+** test_vabal_u8:
+**     uabal2  v0\.8h, v1\.16b, v2\.16b
+**     ret
+*/
+
+/*
+** test_vabal_s16:
+**     sabal2  v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vabal_u16:
+**     uabal2  v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vabal_s32:
+**     sabal2  v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+/*
+** test_vabal_u32:
+**     uabal2  v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+TEST_VABAL
+
+/*
+** test_vsubw_s8:
+**     ssubw2  v0\.8h, v0\.8h, v1\.16b
+**     ret
+*/
+
+/*
+** test_vsubw_u8:
+**     usubw2  v0\.8h, v0\.8h, v1\.16b
+**     ret
+*/
+
+/*
+** test_vsubw_s16:
+**     ssubw2  v0\.4s, v0\.4s, v1\.8h
+**     ret
+*/
+
+/*
+** test_vsubw_u16:
+**     usubw2  v0\.4s, v0\.4s, v1\.8h
+**     ret
+*/
+
+/*
+** test_vsubw_s32:
+**     ssubw2  v0\.2d, v0\.2d, v1\.4s
+**     ret
+*/
+
+/*
+** test_vsubw_u32:
+**     usubw2  v0\.2d, v0\.2d, v1\.4s
+**     ret
+*/
+
+TEST_VSUBW
+
+/*
+** test_vaddw_s8:
+**     saddw2  v0\.8h, v0\.8h, v1\.16b
+**     ret
+*/
+
+/*
+** test_vaddw_u8:
+**     uaddw2  v0\.8h, v0\.8h, v1\.16b
+**     ret
+*/
+
+/*
+** test_vaddw_s16:
+**     saddw2  v0\.4s, v0\.4s, v1\.8h
+**     ret
+*/
+
+/*
+** test_vaddw_u16:
+**     uaddw2  v0\.4s, v0\.4s, v1\.8h
+**     ret
+*/
+
+/*
+** test_vaddw_s32:
+**     saddw2  v0\.2d, v0\.2d, v1\.4s
+**     ret
+*/
+
+/*
+** test_vaddw_u32:
+**     uaddw2  v0\.2d, v0\.2d, v1\.4s
+**     ret
+*/
+
+TEST_VADDW
+
+/*
+** test_vabdl_s8:
+**     sabdl2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vabdl_u8:
+**     uabdl2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vabdl_s16:
+**     sabdl2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vabdl_u16:
+**     uabdl2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vabdl_s32:
+**     sabdl2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+/*
+** test_vabdl_u32:
+**     uabdl2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+TEST_VABDL
+
+/*
+** test_vmlal_s8:
+**     smlal2  v0\.8h, v1\.16b, v2\.16b
+**     ret
+*/
+
+/*
+** test_vmlal_u8:
+**     umlal2  v0\.8h, v1\.16b, v2\.16b
+**     ret
+*/
+
+/*
+** test_vmlal_s16:
+**     smlal2  v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vmlal_u16:
+**     umlal2  v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vmlal_s32:
+**     smlal2  v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+/*
+** test_vmlal_u32:
+**     umlal2  v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+TEST_VMLAL
+
+/*
+** test_vmlal_n_s16:
+**     smlal2  v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmlal_n_u16:
+**     umlal2  v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmlal_n_s32:
+**     smlal2  v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+/*
+** test_vmlal_n_u32:
+**     umlal2  v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VMLAL_N
+
+/*
+** test_vmlsl_s8:
+**     smlsl2  v0\.8h, v1\.16b, v2\.16b
+**     ret
+*/
+
+/*
+** test_vmlsl_u8:
+**     umlsl2  v0\.8h, v1\.16b, v2\.16b
+**     ret
+*/
+
+/*
+** test_vmlsl_s16:
+**     smlsl2  v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vmlsl_u16:
+**     umlsl2  v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vmlsl_s32:
+**     smlsl2  v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+/*
+** test_vmlsl_u32:
+**     umlsl2  v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+TEST_VMLSL
+
+/*
+** test_vmlsl_n_s16:
+**     smlsl2  v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmlsl_n_u16:
+**     umlsl2  v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmlsl_n_s32:
+**     smlsl2  v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+/*
+** test_vmlsl_n_u32:
+**     umlsl2  v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VMLSL_N
+
+/*
+** test_vqdmull_s16:
+**     sqdmull2        v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vqdmull_s32:
+**     sqdmull2        v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+TEST_VQDMULL
+
+/*
+** test_vqdmull_n_s16:
+**     sqdmull2        v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vqdmull_n_s32:
+**     sqdmull2        v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VQDMULL_N
+
+/*
+** test_vqdmlal_s16:
+**     sqdmlal2        v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vqdmlal_s32:
+**     sqdmlal2        v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+TEST_VQMLAL
+
+/*
+** test_vqdmlal_n_s16:
+**     sqdmlal2        v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vqdmlal_n_s32:
+**     sqdmlal2        v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VQMLAL_N
+
+/*
+** test_vqdmlsl_s16:
+**     sqdmlsl2        v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vqdmlsl_s32:
+**     sqdmlsl2        v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+TEST_VQMLSL
+
+/*
+** test_vqdmlsl_n_s16:
+**     sqdmlsl2        v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vqdmlsl_n_s32:
+**     sqdmlsl2        v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VQMLSL_N
+
+/* { dg-final { check-function-bodies "**" ""} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
new file mode 100644
index 00000000000..8315b386655
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
@@ -0,0 +1,82 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -save-temps" } */
+
+/* Don't fold to the highpart builtin unless at least one argument is
+   a genuine vector highpart (the highpart of a VECTOR_CST does not
+   count).  */
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_ ## FN ## _ ## SUFF ()                \
+  {                                                  \
+    INTYPE a = vdupq_n_ ## SUFF (0x1A);              \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (a)); \
+  }
+
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a)          \
+  {                                                     \
+    INTYPE b = vdupq_n_ ## SUFF (0x1A);                 \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b)); \
+  }
+
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)     \
+  RETTYPE test_ ## FN ## _ ## SUFF (INTYPE c)               \
+  {                                                         \
+    INTYPE a = vdupq_n_ ## SUFF (0x1A);                     \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (a), c[1]);  \
+  }                                                         \
+
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)      \
+  RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a)                \
+  {                                                           \
+    INTYPE b = vdupq_n_ ## SUFF (0x1A);                       \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), b[1]); \
+  }                                                           \
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  RETTYPE test_ ## FN ## _ ## SUFF (H_INTYPE b)                 \
+  {                                                             \
+    INTYPE a = vdupq_n_ ## SUFF (0x1A);                         \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (a), b);         \
+  }                                                             \
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, H_INTYPE b)       \
+  {                                                              \
+    INTYPE c = vdupq_n_ ## SUFF (0x1A);                          \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (c), b);       \
+  }                                                              \
+
+#include "fold_to_highpart_1.c"
+
+/* { dg-final { scan-assembler-not {uxtl2\t} } } */
+/* { dg-final { scan-assembler-not {sxtl2\t} } } */
+
+/* { dg-final { scan-assembler-not {umull2\t} } } */
+/* { dg-final { scan-assembler-not {smull2\t} } } */
+
+/* { dg-final { scan-assembler-not {uaddl2\t} } } */
+/* { dg-final { scan-assembler-not {saddl2\t} } } */
+
+/* { dg-final { scan-assembler-not {usubl2\t} } } */
+/* { dg-final { scan-assembler-not {ssubl2\t} } } */
+
+/* { dg-final { scan-assembler-not {uabdl2\t} } } */
+/* { dg-final { scan-assembler-not {sabdl2\t} } } */
+
+/* { dg-final { scan-assembler-not {usubw2\t} } } */
+/* { dg-final { scan-assembler-not {ssubw2\t} } } */
+
+/* { dg-final { scan-assembler-not {uaddw2\t} } } */
+/* { dg-final { scan-assembler-not {saddw2\t} } } */
+
+/* { dg-final { scan-assembler-not {umlal2\t} } } */
+/* { dg-final { scan-assembler-not {smlal2\t} } } */
+
+/* { dg-final { scan-assembler-not {umlsl2\t} } } */
+/* { dg-final { scan-assembler-not {smlsl2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmull2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmlal2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmlsl2\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
new file mode 100644
index 00000000000..9ede99ae58c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
@@ -0,0 +1,80 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -save-temps" } */
+
+/* PR117850 */
+
+/* For builtins with two 64b vector arguments, prefer the highpart
+   variant if one is a true highpart and the other a VECTOR_CST.  */
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_BIN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_a1_ ## FN ## _ ## SUFF (INTYPE a)        \
+  {                                                     \
+    INTYPE b = vdupq_n_ ## SUFF (0x1A);                 \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (a),     \
+                            vget_high_ ## SUFF (b));    \
+  }
+
+#define TEST_BIN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_a2_ ## FN ## _ ## SUFF (INTYPE a)        \
+  {                                                     \
+    INTYPE b = vdupq_n_ ## SUFF (0x1A);                 \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (b),     \
+                            vget_high_ ## SUFF (a));    \
+  }
+
+#define TEST_TERN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_a1_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+  {                                                         \
+    INTYPE c = vdupq_n_ ## SUFF (0x1A);                     \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b),      \
+                            vget_high_ ## SUFF (c));        \
+  }
+
+#define TEST_TERN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_a2_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+  {                                                         \
+    INTYPE c = vdupq_n_ ## SUFF (0x1A);                     \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (c),      \
+                            vget_high_ ## SUFF (b));        \
+  }
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  TEST_BIN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF)              \
+  TEST_BIN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  TEST_TERN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF)              \
+  TEST_TERN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+#include "fold_to_highpart_1.c"
+
+/* { dg-final { scan-assembler-not {dup\t} } } */
+
+/* { dg-final { scan-assembler-times {smull2\t} 6} } */
+/* { dg-final { scan-assembler-times {umull2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {saddl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uaddl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {ssubl2\t} 6} } */
+/* { dg-final { scan-assembler-times {usubl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlal2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlal2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlsl2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlsl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
new file mode 100644
index 00000000000..82c189216db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -save-temps" } */
+
+/* For builtins with two 64b vector arguments, prefer the highpart
+   variant if one is a true highpart and the other a VECTOR_CST.  */
+
+#define VEC_64b 0x1A2E4A4FFFED773E
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_BIN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF)  \
+  RETTYPE test_a1_ ## FN ## _ ## SUFF (INTYPE a)         \
+  {                                                      \
+    return FN ## _ ## SUFF (vget_high_ ## SUFF (a),      \
+                            vcreate_ ## SUFF (VEC_64b)); \
+  }
+
+#define TEST_BIN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_a2_ ## FN ## _ ## SUFF (INTYPE a)        \
+  {                                                     \
+    return FN ## _ ## SUFF (vcreate_ ## SUFF (VEC_64b), \
+                            vget_high_ ## SUFF (a));    \
+  }
+
+#define TEST_TERN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_a1_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+  {                                                         \
+    return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b),      \
+                            vcreate_ ## SUFF (VEC_64b));    \
+  }
+
+#define TEST_TERN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_a2_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+  {                                                         \
+    return FN ## _ ## SUFF (a, vcreate_ ## SUFF (VEC_64b),  \
+                            vget_high_ ## SUFF (b));        \
+  }
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  TEST_BIN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF)              \
+  TEST_BIN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  TEST_TERN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF)              \
+  TEST_TERN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+
+#include "fold_to_highpart_1.c"
+
+/* { dg-final { scan-assembler-not {dup\t} } } */
+
+/* { dg-final { scan-assembler-times {smull2\t} 6} } */
+/* { dg-final { scan-assembler-times {umull2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {saddl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uaddl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {ssubl2\t} 6} } */
+/* { dg-final { scan-assembler-times {usubl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlal2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlal2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlsl2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlsl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
new file mode 100644
index 00000000000..b85cdef3517
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O3 -fdump-tree-optimized -mlittle-endian" } */
+
+#include "arm_neon.h"
+
+#define VEC_CST_u8     0x0102030405060708
+#define VEC_CST_DUP_u8 0x0102030401020304
+
+#define VEC_CST_u16     0x0001000200030004
+#define VEC_CST_DUP_u16 0x0001000200010002
+
+#define VEC_CST_u32     0x0000000100000002
+#define VEC_CST_DUP_u32 0x0000000100000001
+
+/* Duplicate the 64b VECTOR_CST, allowing us to fold the builtin call
+   to its highpart variant.  */
+
+uint16x8_t
+test_u8 (uint8x16_t a)
+{
+  const uint8x8_t b = vcreate_u8 (VEC_CST_u8);
+  return vmull_u8 (vget_high_u8 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1 \}" 1 "optimized" } } */
+
+uint16x8_t
+test_dup_u8 (uint8x16_t a)
+{
+  const uint8x8_t b = vcreate_u8 (VEC_CST_DUP_u8);
+  return vmull_u8 (vget_high_u8 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1 \}" 1 "optimized" } } */
+
+uint32x4_t
+test_u16 (uint16x8_t a)
+{
+  const uint16x4_t b = vcreate_u16 (VEC_CST_u16);
+  return vmull_u16 (vget_high_u16 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 4, 3, 2, 1, 4, 3, 2, 1 \}" 1 "optimized" } } */
+
+uint32x4_t
+test_dup_u16 (uint16x8_t a)
+{
+  const uint16x4_t b = vcreate_u16 (VEC_CST_DUP_u16);
+  return vmull_u16 (vget_high_u16 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 2, 1, 2, 1, 2, 1, 2, 1 \}" 1 "optimized" } } */
+
+uint64x2_t
+test_u32 (uint32x4_t a)
+{
+  const uint32x2_t b = vcreate_u32 (VEC_CST_u32);
+  return vmull_u32 (vget_high_u32 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 2, 1, 2, 1 \}" 1 "optimized" } } */
+
+uint64x2_t
+test_dup_u32 (uint32x4_t a)
+{
+  const uint32x2_t b = vcreate_u32 (VEC_CST_DUP_u32);
+  return vmull_u32 (vget_high_u32 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 1, 1, 1, 1 \}" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c b/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
index c51878aa226..671e47e00a1 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
@@ -6,7 +6,7 @@
 
 /*
 ** test_vabal_s8:
-**      sabal2 v0.8h, v2.16b, v1.16b
+**      sabal2 v0.8h, v1.16b, v2.16b
 **      ret
 */
 int16x8_t
@@ -17,7 +17,7 @@ test_vabal_s8 (int16x8_t sadv, int8x16_t pv, int8x16_t sv)
 
 /*
 ** test_vabal_u8:
-**      uabal2 v0.8h, v2.16b, v1.16b
+**      uabal2 v0.8h, v1.16b, v2.16b
 **      ret
 */
 uint16x8_t
@@ -28,7 +28,7 @@ test_vabal_u8 (uint16x8_t sadv, uint8x16_t pv, uint8x16_t sv)
 
 /*
 ** test_vabal_s16:
-**      sabal2 v0.4s, v2.8h, v1.8h
+**      sabal2 v0.4s, v1.8h, v2.8h
 **      ret
 */
 int32x4_t
@@ -39,7 +39,7 @@ test_vabal_s16 (int32x4_t sadv, int16x8_t pv, int16x8_t sv)
 
 /*
 ** test_vabal_u16:
-**      uabal2 v0.4s, v2.8h, v1.8h
+**      uabal2 v0.4s, v1.8h, v2.8h
 **      ret
 */
 uint32x4_t
@@ -50,7 +50,7 @@ test_vabal_u16 (uint32x4_t sadv, uint16x8_t pv, uint16x8_t sv)
 
 /*
 ** test_vabal_s32:
-**      sabal2 v0.2d, v2.4s, v1.4s
+**      sabal2 v0.2d, v1.4s, v2.4s
 **      ret
 */
 int64x2_t
@@ -61,7 +61,7 @@ test_vabal_s32 (int64x2_t sadv, int32x4_t pv, int32x4_t sv)
 
 /*
 ** test_vabal_u32:
-**      uabal2 v0.2d, v2.4s, v1.4s
+**      uabal2 v0.2d, v1.4s, v2.4s
 **      ret
 */
 uint64x2_t
-- 
2.34.1
